2222
2323#include < algorithm>
2424
25+ #include " common/config.h"
2526#include " common/logging.h"
27+ #include " gen_cpp/internal_service.pb.h"
28+ #include " runtime/cdc_client_mgr.h"
29+ #include " runtime/exec_env.h"
2630
2731namespace doris ::io {
2832
@@ -84,44 +88,100 @@ HttpFileReader::HttpFileReader(const OpenFileInfo& fileInfo, std::string url, in
8488 }
8589 }
8690
91+ // Parse chunk response configuration; chunk response implies no Range support
92+ auto chunk_iter = _extend_kv.find (" http.enable.chunk.response" );
93+ if (chunk_iter != _extend_kv.end ()) {
94+ std::string value = chunk_iter->second ;
95+ std::transform (value.begin (), value.end (), value.begin (), ::tolower);
96+ _enable_chunk_response = (value == " true" || value == " 1" );
97+ if (_enable_chunk_response) {
98+ _range_supported = false ;
99+ }
100+ }
101+
87102 _read_buffer = std::make_unique<char []>(READ_BUFFER_SIZE);
88103}
89104
90105HttpFileReader::~HttpFileReader () {
91106 static_cast <void >(close ());
92107}
93108
109+ Status HttpFileReader::setup_cdc_client () {
110+ auto enable_cdc_iter = _extend_kv.find (" enable_cdc_client" );
111+ if (enable_cdc_iter == _extend_kv.end () || enable_cdc_iter->second != " true" ) {
112+ return Status::OK ();
113+ }
114+
115+ LOG (INFO) << " CDC client is enabled, starting CDC client for " << _url;
116+ ExecEnv* env = ExecEnv::GetInstance ();
117+ if (env == nullptr || env->cdc_client_mgr () == nullptr ) {
118+ return Status::InternalError (" ExecEnv or CdcClientMgr is not initialized" );
119+ }
120+
121+ PRequestCdcClientResult result;
122+ Status start_st = env->cdc_client_mgr ()->start_cdc_client (&result);
123+ if (!start_st.ok ()) {
124+ LOG (ERROR) << " Failed to start CDC client, status=" << start_st.to_string ();
125+ return start_st;
126+ }
127+
128+ // Replace CDC_CLIENT_PORT placeholder with actual CDC client port
129+ const std::string placeholder = " CDC_CLIENT_PORT" ;
130+ size_t pos = _url.find (placeholder);
131+ if (pos != std::string::npos) {
132+ _url.replace (pos, placeholder.size (), std::to_string (doris::config::cdc_client_port));
133+ }
134+ LOG (INFO) << " CDC client started successfully for " << _url;
135+ return Status::OK ();
136+ }
137+
94138Status HttpFileReader::open (const FileReaderOptions& opts) {
139+ // CDC client setup must run before the _initialized guard.
140+ // See setup_cdc_client() for lifecycle details.
141+ RETURN_IF_ERROR (setup_cdc_client ());
142+
143+ // Skip metadata detection when file size was pre-supplied by the caller.
95144 if (_initialized) {
96145 return Status::OK ();
97146 }
98147
99- // Step 1: HEAD request to get file metadata
100- RETURN_IF_ERROR (prepare_client (/* set_fail_on_error=*/ true ));
101- _client->set_method (HttpMethod::HEAD);
102- RETURN_IF_ERROR (_client->execute ());
148+ // Step 1: HEAD request to get file metadata (skip for chunk response)
149+ if (_enable_chunk_response) {
150+ // Chunk streaming response: size is unknown until the stream completes.
151+ // _range_supported is already false (set in constructor).
152+ _size_known = false ;
153+ // Reset _file_size from the SIZE_MAX default to 0 so that any caller of
154+ // size() (e.g. NewJsonReader::_read_one_message) does not attempt to
155+ // allocate SIZE_MAX bytes before the download completes.
156+ _file_size = 0 ;
157+ LOG (INFO) << " Chunk response mode enabled, skipping HEAD request for " << _url;
158+ } else {
159+ // Normal mode: execute HEAD request to get file metadata
160+ RETURN_IF_ERROR (prepare_client (/* set_fail_on_error=*/ true ));
161+ _client->set_method (HttpMethod::HEAD);
162+ RETURN_IF_ERROR (_client->execute ());
103163
104- uint64_t content_length = 0 ;
105- RETURN_IF_ERROR (_client->get_content_length (&content_length));
164+ uint64_t content_length = 0 ;
165+ RETURN_IF_ERROR (_client->get_content_length (&content_length));
106166
107- _file_size = content_length;
108- _size_known = true ;
167+ _file_size = content_length;
168+ _size_known = true ;
169+ }
109170
110- // Step 2: Check if Range request is disabled by configuration
111- if (!_enable_range_request) {
112- // User explicitly disabled Range requests, use non-Range mode directly
171+ // Step 2: Check if Range request is disabled by configuration.
172+ // Chunk response mode always has _range_supported=false (set in constructor), so only
173+ // the non-chunk non-Range path needs the file size guard.
174+ if (_enable_chunk_response) {
175+ // Nothing to do: _range_supported already false, size check not applicable
176+ } else if (!_enable_range_request) {
113177 _range_supported = false ;
114- LOG (INFO) << " Range requests disabled by configuration for " << _url
115- << " , using non-Range mode. File size: " << _file_size << " bytes" ;
116-
117- // Check if file size exceeds limit for non-Range mode
178+ LOG (INFO) << " Range requests disabled by configuration for " << _url;
118179 if (_file_size > _max_request_size_bytes) {
119180 return Status::InternalError (
120- " Non-Range mode: file size ({} bytes) exceeds maximum allowed size ({} bytes, "
121- " configured by http.max.request.size.bytes). URL: {}" ,
181+ " Non-Range mode: file size ({} bytes) exceeds maximum allowed size ({} "
182+ " bytes, configured by http.max.request.size.bytes). URL: {}" ,
122183 _file_size, _max_request_size_bytes, _url);
123184 }
124-
125185 LOG (INFO) << " Non-Range mode validated for " << _url << " , file size: " << _file_size
126186 << " bytes, max allowed: " << _max_request_size_bytes << " bytes" ;
127187 } else {
@@ -224,9 +284,29 @@ Status HttpFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_r
224284 VLOG (2 ) << " Issuing HTTP GET request: offset=" << offset << " req_len=" << req_len
225285 << " with_range=" << _range_supported;
226286
227- // Prepare and initialize the HTTP client for GET request
287+ // Prepare and initialize the HTTP client for request
228288 RETURN_IF_ERROR (prepare_client (/* set_fail_on_error=*/ false ));
229- _client->set_method (HttpMethod::GET);
289+
290+ // Determine HTTP method from configuration (default: GET)
291+ HttpMethod method = HttpMethod::GET;
292+ auto method_iter = _extend_kv.find (" http.method" );
293+ if (method_iter != _extend_kv.end ()) {
294+ method = to_http_method (method_iter->second .c_str ());
295+ if (method == HttpMethod::UNKNOWN) {
296+ LOG (WARNING) << " Invalid http.method value: " << method_iter->second
297+ << " , falling back to GET" ;
298+ method = HttpMethod::GET;
299+ }
300+ }
301+ _client->set_method (method);
302+
303+ // Set payload if configured (supports POST, PUT, DELETE, etc.)
304+ auto payload_iter = _extend_kv.find (" http.payload" );
305+ if (payload_iter != _extend_kv.end () && !payload_iter->second .empty ()) {
306+ _client->set_payload (payload_iter->second );
307+ _client->set_content_type (" application/json" );
308+ VLOG (2 ) << " HTTP request with payload, size=" << payload_iter->second .size ();
309+ }
230310
231311 _client->set_header (" Expect" , " " );
232312 _client->set_header (" Connection" , " close" );
@@ -270,6 +350,21 @@ Status HttpFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_r
270350 long http_status = _client->get_http_status ();
271351 VLOG (2 ) << " HTTP response: status=" << http_status << " received_bytes=" << buf.size ();
272352
353+ // Check for HTTP error status codes (4xx, 5xx)
354+ if (http_status >= 400 ) {
355+ std::string error_body;
356+ if (buf.empty ()) {
357+ error_body = " (empty response body)" ;
358+ } else {
359+ // Limit error message to 1024 bytes to avoid excessive logging
360+ size_t max_len = std::min (buf.size (), static_cast <size_t >(1024 ));
361+ error_body = buf.substr (0 , max_len);
362+ }
363+
364+ return Status::InternalError (" HTTP request failed with status {}: {}." , http_status,
365+ error_body);
366+ }
367+
273368 if (buf.empty ()) {
274369 *bytes_read = buffer_offset;
275370 return Status::OK ();
@@ -295,6 +390,11 @@ Status HttpFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_r
295390 // Cache the complete file content for subsequent reads
296391 _full_file_cache = std::move (buf);
297392 _full_file_cached = true ;
393+ // Now that the full content is in hand, update _file_size to the actual
394+ // byte count. This replaces the 0 placeholder set in open() for chunk
395+ // response mode, so subsequent calls to size() return a correct value.
396+ _file_size = _full_file_cache.size ();
397+ _size_known = true ;
298398
299399 VLOG (2 ) << " Cached full file: " << _full_file_cache.size () << " bytes" ;
300400
0 commit comments