版本:doris-2.1.5
建表语句如下:
-- Backend service log table (Apache Doris, duplicate-key model keyed on time_stamp).
-- Range-partitioned by day on time_stamp; rows are distributed randomly over 100 buckets.
CREATE TABLE IF NOT EXISTS aws.backend_service_log (
    time_stamp          DATETIME(3),
    row_id              VARCHAR(100),  -- random 32-character string
    service_name        VARCHAR(64),   -- service name, e.g. eks, lambda
    resource_identifier VARCHAR(64),   -- resource tag: Lambda function, KDS name, EC2 id, etc.
    region              VARCHAR(64),   -- region
    meta_data           VARIANT,
    logs                STRING,

    -- Inverted indexes. Only idx_logs sets a parser (unicode) and enables phrase support.
    INDEX idx_time_stamp (time_stamp) USING INVERTED,
    INDEX idx_service_name (service_name) USING INVERTED,
    INDEX idx_resource_identifier (resource_identifier) USING INVERTED,
    INDEX idx_logs (logs) USING INVERTED PROPERTIES ("parser" = "unicode", "support_phrase" = "true"),
    INDEX idx_meta_data (meta_data) USING INVERTED,
    INDEX idx_row_id (row_id) USING INVERTED
)
ENGINE = OLAP
DUPLICATE KEY (time_stamp)
-- Initial daily partitions covering 2024-12-10 up to 2025-01-10.
PARTITION BY RANGE (time_stamp) (
    FROM ("2024-12-10") TO ("2025-01-10") INTERVAL 1 DAY
)
DISTRIBUTED BY RANDOM BUCKETS 100
PROPERTIES (
    -- Replication and storage placement; the table is bound to the aws_policy storage policy.
    "replication_allocation" = "tag.location.default: 2",
    "storage_medium" = "SSD",
    "storage_policy" = "aws_policy",

    -- On-disk compression and compaction strategy.
    "compression" = "zstd",
    "compaction_policy" = "time_series",

    -- Dynamic partitioning: daily partitions named with prefix "p".
    -- NOTE(review): "dynamic_partition.end" = "1" presumably pre-creates one day ahead;
    -- no "dynamic_partition.start" is set, so old partitions are presumably never dropped — confirm.
    "dynamic_partition.enable" = "true",
    "dynamic_partition.time_unit" = "DAY",
    "dynamic_partition.end" = "1",
    "dynamic_partition.prefix" = "p",
    "dynamic_partition.buckets" = "100",
    "dynamic_partition.replication_num" = "2",
    "dynamic_partition.storage_medium" = "SSD"
);
冷热分离的冷却时间(cooldown_ttl)为 4 天,存储策略如下:
-- Storage policy named by the "storage_policy" property of aws.backend_service_log.
-- Targets the aws_resource storage resource with a cooldown TTL of 4 days.
CREATE STORAGE POLICY aws_policy
PROPERTIES (
    "cooldown_ttl" = "4d",
    "storage_resource" = "aws_resource"
);
今天发现,从 2024-12-27 开始,有些本应已经变冷的数据仍然留在本地磁盘上。
查看 be.INFO 日志发现:有些 tablet_id 每天被上传约 30 万次,而正常情况下每个 tablet_id 上传上百次即可完成。
排查其中一个tablet_id(21896911),磁盘上还存在文件
日志显示,tablet_id=21896911 的数据每分钟都会被上传到 S3 一次;日志中记录的上传文件名与 S3 上的文件名一致,但与本地磁盘目录中的文件名不一致。
接下来,我们对文件做了 MD5 校验,检查 S3 上与本地磁盘上这些不同名文件的 MD5 值是否一致。
BE 数据目录下的校验结果如下:
S3 上的校验结果如下:
两边的 MD5 值相同,因此可以认为是同一个文件。但是 Doris 在上传时改变了文件名,这是为什么?另外,为什么这些文件已经到了冷却时间,却仍然停留在本地磁盘上?
该 tablet 每天被上传约 30 万次,且每次上传的文件名都与上一次不同,导致 S3 上 tablet_id=21896911 的数据量远远超过正常水平。希望能帮忙排查;如果需要其他数据,我们可以提供;如果方便,也希望能远程沟通。
日志中的报错信息如下: