[feature](load) introduce single-stream-multi-table load (apache#20006)
For routine load (Kafka load), the user can produce data for multiple
tables into a single topic, and Doris will dispatch each row to its
corresponding table.
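
Each Kafka record carries its destination table name as a prefix, separated from the payload by '|' (see parse_dst_table in multi_table_pipe.cpp below). A hypothetical pair of records, one CSV row and one JSON row, would look like:

    tbl1|101,apple,3.5
    tbl2|{"id": 102, "name": "pear"}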

Signed-off-by: freemandealer <[email protected]>
freemandealer committed Jun 7, 2023
1 parent fbbf4c4 commit 09344ea
Showing 25 changed files with 588 additions and 42 deletions.
4 changes: 4 additions & 0 deletions be/src/common/config.cpp
@@ -784,6 +784,10 @@ DEFINE_String(kafka_broker_version_fallback, "0.10.0");
// Change this size to 0 to fix it temporarily.
DEFINE_Int32(routine_load_consumer_pool_size, "10");

// Used in single-stream-multi-table load. When a batch of messages is received from Kafka
// and its size exceeds this threshold, we request plans for all related tables.
DEFINE_Int32(multi_table_batch_plan_threshold, "200");

// When the timeout of a load task is less than this threshold,
// Doris treats it as a high priority task.
// high priority tasks use a separate thread pool for flush and do not block rpc by memory cleanup logic.
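This knob can be tuned like any other BE config; a sketch of a be.conf override (the key name comes from the DEFINE above, 200 is the shipped default, and 500 is an arbitrary illustrative value):

    # be.conf
    multi_table_batch_plan_threshold = 500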
4 changes: 4 additions & 0 deletions be/src/common/config.h
@@ -821,6 +821,10 @@ DECLARE_String(kafka_broker_version_fallback);
// Change this size to 0 to fix it temporarily.
DECLARE_Int32(routine_load_consumer_pool_size);

// Used in single-stream-multi-table load. When a batch of messages is received from Kafka
// and its size exceeds this threshold, we request plans for all related tables.
DECLARE_Int32(multi_table_batch_plan_threshold);

// When the timeout of a load task is less than this threshold,
// Doris treats it as a high priority task.
// high priority tasks use a separate thread pool for flush and do not block rpc by memory cleanup logic.
1 change: 1 addition & 0 deletions be/src/io/CMakeLists.txt
@@ -43,6 +43,7 @@ set(IO_FILES
fs/broker_file_writer.cpp
fs/buffered_reader.cpp
fs/stream_load_pipe.cpp
fs/multi_table_pipe.cpp
fs/err_utils.cpp
fs/fs_utils.cpp
cache/dummy_file_cache.cpp
15 changes: 14 additions & 1 deletion be/src/io/file_factory.cpp
@@ -29,6 +29,7 @@
#include "io/fs/file_reader_options.h"
#include "io/fs/hdfs_file_system.h"
#include "io/fs/local_file_system.h"
#include "io/fs/multi_table_pipe.h"
#include "io/fs/s3_file_system.h"
#include "io/fs/stream_load_pipe.h"
#include "io/hdfs_builder.h"
@@ -144,12 +144,24 @@ Status FileFactory::create_file_reader(RuntimeProfile* profile,
}

// file scan node/stream load pipe
Status FileFactory::create_pipe_reader(const TUniqueId& load_id, io::FileReaderSPtr* file_reader) {
Status FileFactory::create_pipe_reader(const TUniqueId& load_id, io::FileReaderSPtr* file_reader,
const TUniqueId& fragment_instance_id) {
auto stream_load_ctx = ExecEnv::GetInstance()->new_load_stream_mgr()->get(load_id);
if (!stream_load_ctx) {
return Status::InternalError("unknown stream load id: {}", UniqueId(load_id).to_string());
}

*file_reader = stream_load_ctx->pipe;

if (file_reader->get() != nullptr) {
auto multi_table_pipe = std::dynamic_pointer_cast<io::MultiTablePipe>(*file_reader);
if (multi_table_pipe != nullptr) {
*file_reader = multi_table_pipe->getPipe(fragment_instance_id);
LOG(INFO) << "create pipe reader for fragment instance: " << fragment_instance_id
<< " pipe: " << (*file_reader).get();
}
}

return Status::OK();
}

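A minimal sketch of a hypothetical call site (load_id and fragment_instance_id would come from the scan node's parameters); for an ordinary single-table pipe the dynamic_pointer_cast yields nullptr and the context's pipe is returned unchanged:

    io::FileReaderSPtr reader;
    RETURN_IF_ERROR(FileFactory::create_pipe_reader(load_id, &reader, fragment_instance_id));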
3 changes: 2 additions & 1 deletion be/src/io/file_factory.h
@@ -76,7 +76,8 @@ class FileFactory {
io::FileReaderOptions reader_options = NO_CACHE_READER_OPTIONS);

// Create FileReader for stream load pipe
static Status create_pipe_reader(const TUniqueId& load_id, io::FileReaderSPtr* file_reader);
static Status create_pipe_reader(const TUniqueId& load_id, io::FileReaderSPtr* file_reader,
const TUniqueId& fragment_instance_id);

static Status create_hdfs_reader(const THdfsParams& hdfs_params, const std::string& path,
std::shared_ptr<io::FileSystem>* hdfs_file_system,
6 changes: 4 additions & 2 deletions be/src/io/fs/kafka_consumer_pipe.h
@@ -28,7 +28,7 @@ class KafkaConsumerPipe : public StreamLoadPipe {

~KafkaConsumerPipe() override = default;

Status append_with_line_delimiter(const char* data, size_t size) {
virtual Status append_with_line_delimiter(const char* data, size_t size) {
Status st = append(data, size);
if (!st.ok()) {
return st;
@@ -39,7 +39,9 @@
return st;
}

Status append_json(const char* data, size_t size) { return append_and_flush(data, size); }
virtual Status append_json(const char* data, size_t size) {
return append_and_flush(data, size);
}
};
} // namespace io
} // end namespace doris
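These two methods become virtual so that MultiTablePipe (implemented below) can intercept appends and route rows by table. The header be/src/io/fs/multi_table_pipe.h is not part of this excerpt; a sketch of what its overrides presumably declare, judging from the definitions in multi_table_pipe.cpp:

    class MultiTablePipe : public KafkaConsumerPipe {
    public:
        Status append_with_line_delimiter(const char* data, size_t size) override;
        Status append_json(const char* data, size_t size) override;
        // ... per-table pipe maps, dispatch(), request_and_exec_plans(), etc.
    };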
288 changes: 288 additions & 0 deletions be/src/io/fs/multi_table_pipe.cpp
@@ -0,0 +1,288 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "multi_table_pipe.h"

#include <gen_cpp/FrontendService.h>
#include <gen_cpp/FrontendService_types.h>
#include <gen_cpp/HeartbeatService_types.h>
#include <gen_cpp/Types_types.h>
#include <thrift/protocol/TDebugProtocol.h>

#include "common/status.h"
#include "runtime/client_cache.h"
#include "runtime/fragment_mgr.h"
#include "runtime/runtime_state.h"
#include "runtime/stream_load/new_load_stream_mgr.h"
#include "util/thrift_rpc_helper.h"
#include "util/thrift_util.h"
#include "util/time.h"

namespace doris {
namespace io {

Status MultiTablePipe::append_with_line_delimiter(const char* data, size_t size) {
const std::string& table = parse_dst_table(data, size);
if (table.empty()) {
return Status::InternalError("table name is empty");
}
size_t prefix_len = table.length() + 1;
AppendFunc cb = &KafkaConsumerPipe::append_with_line_delimiter;
return dispatch(table, data + prefix_len, size - prefix_len, cb);
}

Status MultiTablePipe::append_json(const char* data, size_t size) {
const std::string& table = parse_dst_table(data, size);
if (table.empty()) {
return Status::InternalError("table name is empty");
}
size_t prefix_len = table.length() + 1;
AppendFunc cb = &KafkaConsumerPipe::append_json;
return dispatch(table, data + prefix_len, size - prefix_len, cb);
}

KafkaConsumerPipePtr MultiTablePipe::get_pipe_by_table(const std::string& table) {
auto pipe = _planned_pipes.find(table);
DCHECK(pipe != _planned_pipes.end());
return pipe->second;
}

static std::string_view get_first_part(const char* dat, char delimiter) {
const char* delimiterPos = std::strchr(dat, delimiter);

if (delimiterPos != nullptr) {
std::ptrdiff_t length = delimiterPos - dat;
return std::string_view(dat, length);
} else {
return std::string_view(dat);
}
}

Status MultiTablePipe::finish() {
for (auto& pair : _planned_pipes) {
RETURN_IF_ERROR(pair.second->finish());
}
return Status::OK();
}

void MultiTablePipe::cancel(const std::string& reason) {
for (auto& pair : _planned_pipes) {
pair.second->cancel(reason);
}
}

std::string MultiTablePipe::parse_dst_table(const char* data, size_t size) {
return std::string(get_first_part(data, '|'));
}
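
// Example with hypothetical data: parse_dst_table("tbl1|1,apple\n", 13) returns
// "tbl1"; the caller then skips table.length() + 1 bytes, leaving the payload
// "1,apple\n". Note that get_first_part relies on strchr, so the buffer is
// assumed to be null-terminated.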

Status MultiTablePipe::dispatch(const std::string& table, const char* data, size_t size,
AppendFunc cb) {
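// Two-phase routing: rows for tables that already have a planned pipe are
// appended directly; rows for newly seen tables accumulate in "unplanned"
// pipes until multi_table_batch_plan_threshold rows arrive, at which point
// plans are requested from the FE for all unplanned tables in one RPC.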
if (size == 0 || strlen(data) == 0) {
LOG(WARNING) << "empty data for table: " << table;
return Status::InternalError("empty data");
}
KafkaConsumerPipePtr pipe = nullptr;
auto iter = _planned_pipes.find(table);
if (iter != _planned_pipes.end()) {
pipe = iter->second;
LOG(INFO) << "dispatch for planned pipe: " << pipe.get();
RETURN_NOT_OK_STATUS_WITH_WARN((pipe.get()->*cb)(data, size),
"append failed in planned kafka pipe");
} else {
iter = _unplanned_pipes.find(table);
if (iter == _unplanned_pipes.end()) {
pipe = std::make_shared<io::KafkaConsumerPipe>();
LOG(INFO) << "create new unplanned pipe: " << pipe.get();
_unplanned_pipes.emplace(table, pipe);
} else {
pipe = iter->second;
}
LOG(INFO) << "dispatch for unplanned pipe: " << pipe.get();
RETURN_NOT_OK_STATUS_WITH_WARN((pipe.get()->*cb)(data, size),
"append failed in unplanned kafka pipe");

++_unplanned_row_cnt;
size_t threshold = config::multi_table_batch_plan_threshold;
if (_unplanned_row_cnt >= threshold) {
LOG(INFO) << fmt::format("unplanned row cnt={} reach threshold={}, plan them",
_unplanned_row_cnt, threshold);
Status st = request_and_exec_plans();
_unplanned_row_cnt = 0;
if (!st.ok()) {
return st;
}
}
}
return Status::OK();
}

#ifndef BE_TEST
Status MultiTablePipe::request_and_exec_plans() {
if (_unplanned_pipes.empty()) return Status::OK();

// get list of table names in unplanned pipes
std::vector<std::string> tables;
fmt::memory_buffer log_buffer;
log_buffer.clear();
fmt::format_to(log_buffer, "request plans for {} tables: [ ", _unplanned_pipes.size());
for (auto& pair : _unplanned_pipes) {
tables.push_back(pair.first);
fmt::format_to(log_buffer, "{} ", pair.first);
}
fmt::format_to(log_buffer, "]");
LOG(INFO) << fmt::to_string(log_buffer);

TStreamLoadPutRequest request;
set_request_auth(&request, _ctx->auth);
request.db = _ctx->db;
request.table_names = tables;
request.__isset.table_names = true;
request.txnId = _ctx->txn_id;
request.formatType = _ctx->format;
request.__set_compress_type(_ctx->compress_type);
request.__set_header_type(_ctx->header_type);
request.__set_loadId(_ctx->id.to_thrift());
request.fileType = TFileType::FILE_STREAM;
request.__set_thrift_rpc_timeout_ms(config::thrift_rpc_timeout_ms);
// no need to register with new_load_stream_mgr because that is already done when the routine load task is submitted

// plan this load
ExecEnv* exec_env = doris::ExecEnv::GetInstance();
TNetworkAddress master_addr = exec_env->master_info()->network_address;
int64_t stream_load_put_start_time = MonotonicNanos();
RETURN_IF_ERROR(ThriftRpcHelper::rpc<FrontendServiceClient>(
master_addr.hostname, master_addr.port,
[&request, this](FrontendServiceConnection& client) {
client->streamLoadMultiTablePut(_ctx->multi_table_put_result, request);
}));
_ctx->stream_load_put_cost_nanos = MonotonicNanos() - stream_load_put_start_time;

Status plan_status(_ctx->multi_table_put_result.status);
if (!plan_status.ok()) {
LOG(WARNING) << "plan streaming load failed. errmsg=" << plan_status << _ctx->brief();
return plan_status;
}

// put unplanned pipes into planned pipes and clear unplanned pipes
for (auto& pipe : _unplanned_pipes) {
_ctx->table_list.push_back(pipe.first);
_planned_pipes.emplace(pipe.first, pipe.second);
}
LOG(INFO) << fmt::format("{} tables plan complete, planned table cnt={}, returned plan cnt={}",
_unplanned_pipes.size(), _planned_pipes.size(),
_ctx->multi_table_put_result.params.size());
_unplanned_pipes.clear();

for (auto& plan : _ctx->multi_table_put_result.params) {
// TODO: use pipeline in the future (currently buggy for load)
++_inflight_plan_cnt;
DCHECK_EQ(plan.__isset.table_name, true);
DCHECK(_planned_pipes.find(plan.table_name) != _planned_pipes.end());
putPipe(plan.params.fragment_instance_id, _planned_pipes[plan.table_name]);
LOG(INFO) << "fragment_instance_id=" << plan.params.fragment_instance_id
<< " table=" << plan.table_name;
exec_env->fragment_mgr()->exec_plan_fragment(plan, [this](RuntimeState* state,
Status* status) {
--_inflight_plan_cnt;
_tablet_commit_infos.insert(_tablet_commit_infos.end(),
state->tablet_commit_infos().begin(),
state->tablet_commit_infos().end());
_number_total_rows += state->num_rows_load_total();
_number_loaded_rows += state->num_rows_load_success();
_number_filtered_rows += state->num_rows_load_filtered();
_number_unselected_rows += state->num_rows_load_unselected();

// check filtered ratio for this plan fragment
int64_t num_selected_rows =
state->num_rows_load_total() - state->num_rows_load_unselected();
if (num_selected_rows > 0 &&
(double)state->num_rows_load_filtered() / num_selected_rows >
_ctx->max_filter_ratio) {
*status = Status::InternalError("too many filtered rows");
}
if (_number_filtered_rows > 0 && !state->get_error_log_file_path().empty()) {
_ctx->error_url = to_load_error_http_path(state->get_error_log_file_path());
}

// if any of the plan fragment exec failed, set the status to the first failed plan
if (!status->ok()) {
LOG(WARNING) << "plan fragment exec failed. errmsg=" << *status << _ctx->brief();
_status = *status;
}

if (_inflight_plan_cnt == 0 && is_consume_finished()) {
_ctx->number_total_rows = _number_total_rows;
_ctx->number_loaded_rows = _number_loaded_rows;
_ctx->number_filtered_rows = _number_filtered_rows;
_ctx->number_unselected_rows = _number_unselected_rows;
_ctx->commit_infos = _tablet_commit_infos;
LOG(INFO) << "all plan for multi-table load complete. number_total_rows="
<< _ctx->number_total_rows
<< " number_loaded_rows=" << _ctx->number_loaded_rows
<< " number_filtered_rows=" << _ctx->number_filtered_rows
<< " number_unselected_rows=" << _ctx->number_unselected_rows;
_ctx->promise.set_value(_status); // when all done, finish the routine load task
}
});
}

return Status::OK();
}
#else
Status MultiTablePipe::request_and_exec_plans() {
// put unplanned pipes into planned pipes
for (auto& pipe : _unplanned_pipes) {
_planned_pipes.emplace(pipe.first, pipe.second);
}
LOG(INFO) << fmt::format("{} tables plan complete, planned table cnt={}",
_unplanned_pipes.size(), _planned_pipes.size());
_unplanned_pipes.clear();
return Status::OK();
}
#endif

Status MultiTablePipe::putPipe(const TUniqueId& fragment_instance_id,
std::shared_ptr<io::StreamLoadPipe> pipe) {
std::lock_guard<std::mutex> l(_pipe_map_lock);
auto it = _pipe_map.find(fragment_instance_id);
if (it != std::end(_pipe_map)) {
return Status::InternalError("id already exist");
}
_pipe_map.emplace(fragment_instance_id, pipe);
return Status::OK();
}

std::shared_ptr<io::StreamLoadPipe> MultiTablePipe::getPipe(const TUniqueId& fragment_instance_id) {
std::lock_guard<std::mutex> l(_pipe_map_lock);
auto it = _pipe_map.find(fragment_instance_id);
if (it == std::end(_pipe_map)) {
return std::shared_ptr<io::StreamLoadPipe>(nullptr);
}
return it->second;
}

void MultiTablePipe::removePipe(const TUniqueId& fragment_instance_id) {
std::lock_guard<std::mutex> l(_pipe_map_lock);
auto it = _pipe_map.find(fragment_instance_id);
if (it != std::end(_pipe_map)) {
_pipe_map.erase(it);
VLOG_NOTICE << "remove stream load pipe: " << fragment_instance_id;
}
}

} // namespace io
} // namespace doris
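
Under the BE_TEST build the planning RPC is stubbed out (see the #else branch above), which allows a minimal, self-contained sketch of the dispatch flow. The constructor call is an assumption, since multi_table_pipe.h is not part of this excerpt:

    // hypothetical usage, BE_TEST build only
    auto pipe = std::make_shared<io::MultiTablePipe>(ctx);              // ctor signature assumed
    Status st = pipe->append_with_line_delimiter("tbl1|1,apple\n", 13); // routed to tbl1
    st = pipe->append_json("tbl2|{\"id\":2}", 13);                      // routed to tbl2
    st = pipe->finish();                                                // flush every per-table pipe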