【已解决】2.1.3/2.1.5,COLLECT_SET开窗导致be节点宕机

Viewed 129

版本: selectdb-doris-2.1.5-rc02-5733c9d84d/selectdb-doris-2.1.3-rc09-zyxfjr01-734c50b0ef
现象: 执行下面的 SQL 时会导致所有的 BE 节点崩溃, 即使只查询内部表也会导致所有 BE 宕机。问题主要出现在使用 COLLECT_SET + OVER 开窗时; 不开窗直接使用 COLLECT_SET 等函数时, 暂时没有发现这个问题
SQL为:

WITH nns AS (SELECT JSON_ARRAY_GET(hosts, pos) host, hosts
             FROM (SELECT '["dw11","dw21","dw31"]' hosts) t1 /* namenode 的 hostname,后续有变更,请修改这个地方 */
    LATERAL VIEW EXPLODE_NUMBERS(JSON_ARRAY_SIZE(hosts)) tmp AS pos
    )
SELECT COUNT(1) OVER ()                                                    node_cnt
     , host
     , COLLECT_SET(IF(state IS NULL, host, null)) OVER () dead_node
     , COLLECT_SET(IF(state IS NULL, null, host)) OVER () alive_node /* 这一个查询 导致所有BE节点宕机, 如果不使用开窗 OVER 则也不会宕机*/
     , COUNT(IF(state IN ('active', 'standby'), 1, NULL)) OVER ()          alive_cnt
     , COUNT(IF(state = 'active', 1, null)) OVER ()                        active_cnt
FROM (SELECT host,
             hosts,
             REPLACE(JSON_EXTRACT(JSON_ARRAY_GET(JSON_EXTRACT(bean, '$.beans'), 0), '$.State'), '"',
                     '') state /* json_array_get(TEXT, INT) RETURN STRING 获取jsonarray指定位置上的值  */
      FROM (SELECT host,
                   hosts,
                   HTTP_GET(CONCAT('http://', host, ':9870/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus'),
                            '') bean /* 自定义 http_get(TEXT, TEXT) RETURN STRING, 调用 get 接口, 返回 string*/
            FROM nns) t1) t2;

日志:
be.out:

*** Query id: bc393123984046e5-a4548deca2f99dd9 ***
*** is nereids: 1 ***
*** tablet id: 0 ***
*** Aborted at 1723448918 (unix time) try "date -d @1723448918" if you are using GNU date ***
*** Current BE git commitID: 5733c9d84d ***
*** SIGSEGV address not mapped to object (@0x10) received by PID 163183 (TID 164849 OR 0x7fb0c0df7700) from PID 16; stack trace: ***
 0# doris::signal::(anonymous namespace)::FailureSignalHandler(int, siginfo_t*, void*) at /home/zcp/repo_center/doris_enterprise/doris/be/src/common/signal_handler.h:421
 1# os::Linux::chained_handler(int, siginfo_t*, void*) in /data/software/doris/java8/jre/lib/amd64/server/libjvm.so
 2# JVM_handle_linux_signal in /data/software/doris/java8/jre/lib/amd64/server/libjvm.so
 3# signalHandler(int, siginfo_t*, void*) in /data/software/doris/java8/jre/lib/amd64/server/libjvm.so
 4# 0x00007FB3AC8B0400 in /lib64/libc.so.6
 5# doris::vectorized::AggregateFunctionCollectSetData<doris::StringRef, std::integral_constant<bool, false> >::add(doris::vectorized::IColumn const&, unsigned long, doris::vectorized::Arena*) at /home/zcp/repo_center/doris_enterprise/doris/be/src/vec/aggregate_functions/aggregate_function_collect.h:128
 6# doris::vectorized::IAggregateFunctionHelper<doris::vectorized::AggregateFunctionNullUnaryInline<doris::vectorized::AggregateFunctionCollect<doris::vectorized::AggregateFunctionCollectSetData<doris::StringRef, std::integral_constant<bool, false> >, std::integral_constant<bool, false>, std::integral_constant<bool, false> >, false> >::add_range_single_place(long, long, long, long, char*, doris::vectorized::IColumn const**, doris::vectorized::Arena*) const at /home/zcp/repo_center/doris_enterprise/doris/be/src/vec/aggregate_functions/aggregate_function.h:302
 7# doris::pipeline::AnalyticLocalState::_execute_for_win_func(long, long, long, long) at /home/zcp/repo_center/doris_enterprise/doris/be/src/pipeline/exec/analytic_source_operator.cpp:295
 8# doris::pipeline::AnalyticLocalState::_get_next_for_range(unsigned long) in /data/software/doris/be/lib/doris_be
 9# std::_Function_handler<doris::Status (unsigned long), std::_Bind_result<doris::Status, doris::Status (doris::pipeline::AnalyticLocalState::*(doris::pipeline::AnalyticLocalState*, std::_Placeholder<1>))(unsigned long)> >::_M_invoke(std::_Any_data const&, unsigned long&&) at /var/local/ldb-toolchain/bin/../lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/std_function.h:291
10# doris::pipeline::AnalyticSourceOperatorX::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /home/zcp/repo_center/doris_enterprise/doris/be/src/pipeline/exec/analytic_source_operator.cpp:545
11# doris::pipeline::OperatorXBase::get_block_after_projects(doris::RuntimeState*, doris::vectorized::Block*, bool*) at /home/zcp/repo_center/doris_enterprise/doris/be/src/pipeline/pipeline_x/operator.cpp:276
12# doris::pipeline::PipelineXTask::execute(bool*) at /home/zcp/repo_center/doris_enterprise/doris/be/src/pipeline/pipeline_x/pipeline_x_task.cpp:325
13# doris::pipeline::TaskScheduler::_do_work(unsigned long) at /home/zcp/repo_center/doris_enterprise/doris/be/src/pipeline/task_scheduler.cpp:347
14# doris::ThreadPool::dispatch_thread() in /data/software/doris/be/lib/doris_be
15# doris::Thread::supervise_thread(void*) at /home/zcp/repo_center/doris_enterprise/doris/be/src/util/thread.cpp:499
16# start_thread in /lib64/libpthread.so.0
17# clone in /lib64/libc.so.6

be.warn:

W20240812 15:46:08.356520 163183 timezone_utils.cpp:94] Meet illegal tzdata file: iso3166.tab. skipped
W20240812 15:46:08.363566 163183 timezone_utils.cpp:94] Meet illegal tzdata file: leapseconds. skipped
W20240812 15:46:08.363616 163183 timezone_utils.cpp:94] Meet illegal tzdata file: tzdata.zi. skipped
W20240812 15:46:08.363636 163183 timezone_utils.cpp:94] Meet illegal tzdata file: zone.tab. skipped
W20240812 15:46:08.363658 163183 timezone_utils.cpp:94] Meet illegal tzdata file: zone1970.tab. skipped
E20240812 15:46:08.633955 163183 variable.cpp:179] Already exposed `doris_cache_data_page_cache' whose value is `0'
E20240812 15:46:08.633993 163183 variable.cpp:179] Already exposed `doris_cache_data_page_cache_persecond' whose value is `0'
E20240812 15:46:08.634099 163183 variable.cpp:179] Already exposed `doris_cache_index_page_cache' whose value is `0'
E20240812 15:46:08.634106 163183 variable.cpp:179] Already exposed `doris_cache_index_page_cache_persecond' whose value is `0'
E20240812 15:46:08.634223 163183 variable.cpp:179] Already exposed `doris_cache_pkindex_page_cache' whose value is `0'
E20240812 15:46:08.634243 163183 variable.cpp:179] Already exposed `doris_cache_pkindex_page_cache_persecond' whose value is `0'
E20240812 15:46:08.634328 163183 variable.cpp:179] Already exposed `doris_cache_point_query_row_cache' whose value is `0'
E20240812 15:46:08.634344 163183 variable.cpp:179] Already exposed `doris_cache_point_query_row_cache_persecond' whose value is `0'
E20240812 15:46:08.634394 163183 variable.cpp:179] Already exposed `doris_cache_segment_cache' whose value is `0'
E20240812 15:46:08.634403 163183 variable.cpp:179] Already exposed `doris_cache_segment_cache_persecond' whose value is `0'
E20240812 15:46:08.634450 163183 variable.cpp:179] Already exposed `doris_cache_schema_cache' whose value is `0'
E20240812 15:46:08.634460 163183 variable.cpp:179] Already exposed `doris_cache_schema_cache_persecond' whose value is `0'
E20240812 15:46:08.634488 163183 variable.cpp:179] Already exposed `doris_cache_common_obj_lrucache' whose value is `0'
E20240812 15:46:08.634493 163183 variable.cpp:179] Already exposed `doris_cache_common_obj_lrucache_persecond' whose value is `0'
E20240812 15:46:08.634527 163183 variable.cpp:179] Already exposed `doris_cache_point_query_lookup_connection_cache' whose value is `0'
E20240812 15:46:08.634534 163183 variable.cpp:179] Already exposed `doris_cache_point_query_lookup_connection_cache_persecond' whose value is `0'
E20240812 15:46:08.634646 163183 variable.cpp:179] Already exposed `doris_cache_inverted_index_searcher_cache' whose value is `0'
E20240812 15:46:08.634658 163183 variable.cpp:179] Already exposed `doris_cache_inverted_index_searcher_cache_persecond' whose value is `0'
E20240812 15:46:08.634778 163183 variable.cpp:179] Already exposed `doris_cache_inverted_index_query_cache' whose value is `0'
E20240812 15:46:08.634785 163183 variable.cpp:179] Already exposed `doris_cache_inverted_index_query_cache_persecond' whose value is `0'
E20240812 15:46:08.634951 163183 variable.cpp:179] Already exposed `doris_cache_last_success_channel_cache' whose value is `0'
E20240812 15:46:08.634961 163183 variable.cpp:179] Already exposed `doris_cache_last_success_channel_cache_persecond' whose value is `0'
E20240812 15:46:08.635197 163183 variable.cpp:179] Already exposed `doris_cache_tablet_schema_cache' whose value is `0'
E20240812 15:46:08.635211 163183 variable.cpp:179] Already exposed `doris_cache_tablet_schema_cache_persecond' whose value is `0'
E20240812 15:46:08.635794 163183 variable.cpp:179] Already exposed `doris_cache_mow_tablet_version_cache' whose value is `0'
E20240812 15:46:08.635808 163183 variable.cpp:179] Already exposed `doris_cache_mow_tablet_version_cache_persecond' whose value is `0'
E20240812 15:46:08.635854 163183 variable.cpp:179] Already exposed `doris_cache_create_tablet_rridx_cache' whose value is `0'
E20240812 15:46:08.635864 163183 variable.cpp:179] Already exposed `doris_cache_create_tablet_rridx_cache_persecond' whose value is `0'
E20240812 15:46:08.694494 163807 variable.cpp:179] Already exposed `doris_cache_mow_delete_bitmap_agg_cache' whose value is `0'
E20240812 15:46:08.694511 163807 variable.cpp:179] Already exposed `doris_cache_mow_delete_bitmap_agg_cache_persecond' whose value is `0'
W20240812 15:46:08.711439 163848 olap_server.cpp:712] Have not get FE Master heartbeat yet
W20240812 15:48:53.322433 167525 timezone_utils.cpp:94] Meet illegal tzdata file: iso3166.tab. skipped
W20240812 15:48:53.323212 167525 timezone_utils.cpp:94] Meet illegal tzdata file: leapseconds. skipped
W20240812 15:48:53.323239 167525 timezone_utils.cpp:94] Meet illegal tzdata file: tzdata.zi. skipped
W20240812 15:48:53.323257 167525 timezone_utils.cpp:94] Meet illegal tzdata file: zone.tab. skipped
W20240812 15:48:53.323276 167525 timezone_utils.cpp:94] Meet illegal tzdata file: zone1970.tab. skipped
E20240812 15:48:53.583994 167525 variable.cpp:179] Already exposed `doris_cache_data_page_cache' whose value is `0'
E20240812 15:48:53.584022 167525 variable.cpp:179] Already exposed `doris_cache_data_page_cache_persecond' whose value is `0'
E20240812 15:48:53.584110 167525 variable.cpp:179] Already exposed `doris_cache_index_page_cache' whose value is `0'
E20240812 15:48:53.584116 167525 variable.cpp:179] Already exposed `doris_cache_index_page_cache_persecond' whose value is `0'
E20240812 15:48:53.584239 167525 variable.cpp:179] Already exposed `doris_cache_pkindex_page_cache' whose value is `0'
E20240812 15:48:53.584249 167525 variable.cpp:179] Already exposed `doris_cache_pkindex_page_cache_persecond' whose value is `0'
E20240812 15:48:53.584319 167525 variable.cpp:179] Already exposed `doris_cache_point_query_row_cache' whose value is `0'
E20240812 15:48:53.584336 167525 variable.cpp:179] Already exposed `doris_cache_point_query_row_cache_persecond' whose value is `0'
E20240812 15:48:53.584398 167525 variable.cpp:179] Already exposed `doris_cache_segment_cache' whose value is `0'
E20240812 15:48:53.584405 167525 variable.cpp:179] Already exposed `doris_cache_segment_cache_persecond' whose value is `0'
E20240812 15:48:53.584439 167525 variable.cpp:179] Already exposed `doris_cache_schema_cache' whose value is `0'
E20240812 15:48:53.584444 167525 variable.cpp:179] Already exposed `doris_cache_schema_cache_persecond' whose value is `0'
E20240812 15:48:53.584471 167525 variable.cpp:179] Already exposed `doris_cache_common_obj_lrucache' whose value is `0'
E20240812 15:48:53.584477 167525 variable.cpp:179] Already exposed `doris_cache_common_obj_lrucache_persecond' whose value is `0'
E20240812 15:48:53.584506 167525 variable.cpp:179] Already exposed `doris_cache_point_query_lookup_connection_cache' whose value is `0'
E20240812 15:48:53.584512 167525 variable.cpp:179] Already exposed `doris_cache_point_query_lookup_connection_cache_persecond' whose value is `0'
E20240812 15:48:53.584615 167525 variable.cpp:179] Already exposed `doris_cache_inverted_index_searcher_cache' whose value is `0'
E20240812 15:48:53.584625 167525 variable.cpp:179] Already exposed `doris_cache_inverted_index_searcher_cache_persecond' whose value is `0'
E20240812 15:48:53.584726 167525 variable.cpp:179] Already exposed `doris_cache_inverted_index_query_cache' whose value is `0'
E20240812 15:48:53.584733 167525 variable.cpp:179] Already exposed `doris_cache_inverted_index_query_cache_persecond' whose value is `0'
E20240812 15:48:53.584878 167525 variable.cpp:179] Already exposed `doris_cache_last_success_channel_cache' whose value is `0'
E20240812 15:48:53.584890 167525 variable.cpp:179] Already exposed `doris_cache_last_success_channel_cache_persecond' whose value is `0'
E20240812 15:48:53.585129 167525 variable.cpp:179] Already exposed `doris_cache_tablet_schema_cache' whose value is `0'
E20240812 15:48:53.585143 167525 variable.cpp:179] Already exposed `doris_cache_tablet_schema_cache_persecond' whose value is `0'
E20240812 15:48:53.585736 167525 variable.cpp:179] Already exposed `doris_cache_mow_tablet_version_cache' whose value is `0'
E20240812 15:48:53.585747 167525 variable.cpp:179] Already exposed `doris_cache_mow_tablet_version_cache_persecond' whose value is `0'
E20240812 15:48:53.585780 167525 variable.cpp:179] Already exposed `doris_cache_create_tablet_rridx_cache' whose value is `0'
E20240812 15:48:53.585786 167525 variable.cpp:179] Already exposed `doris_cache_create_tablet_rridx_cache_persecond' whose value is `0'
E20240812 15:48:53.604631 168123 variable.cpp:179] Already exposed `doris_cache_mow_delete_bitmap_agg_cache' whose value is `0'
E20240812 15:48:53.604646 168123 variable.cpp:179] Already exposed `doris_cache_mow_delete_bitmap_agg_cache_persecond' whose value is `0'
W20240812 15:48:53.621595 168154 olap_server.cpp:712] Have not get FE Master heartbeat yet
1 Answer