问题:
1、如果表是 3 副本,冷数据转移到 HDFS 之后,是以单副本存储还是仍然保存 3 副本?
2、如何清理数据的本地缓存?这里的 LRU 机制还没弄明白。
1、创建hdfs存储位置存放冷数据
-- (Re)create the HDFS resource that serves as the remote location for cold data.
-- NOTE(review): "dfs.client.failover.proxy.provider" is an HDFS HA setting, yet
-- no nameservice properties are given and fs.defaultFS points at a single
-- host:port — confirm whether this property is needed or complete.
DROP RESOURCE IF EXISTS remote_hdfs;
CREATE RESOURCE "remote_hdfs"
PROPERTIES (
    "type" = "hdfs",
    "fs.defaultFS" = "hdfs://xxx:8020",
    "dfs.client.failover.proxy.provider" = "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
);
2、创建冷热分层存储策略,并设置数据迁移的倒计时
cooldown_ttl:数据迁移距离当前时间的倒计时,单位为秒(s)。与 cooldown_datetime 二选一即可。
-- (Re)create the tiered-storage policy that sends cooled-down data to the
-- remote_hdfs resource.
DROP STORAGE POLICY IF EXISTS test_policy;
CREATE STORAGE POLICY test_policy
PROPERTIES (
    "storage_resource" = "remote_hdfs",
    -- Cooldown countdown before data is migrated, in seconds.
    "cooldown_ttl" = "120"
);
3、创建表并且 关联分层存储策略
-- (Re)create the demo table and attach the test_policy storage policy to it.
DROP TABLE IF EXISTS policy_test;
CREATE TABLE policy_test (
    id          INT(11)     NULL,
    name        VARCHAR(50) NULL,
    age         DECIMAL     NULL,
    create_date DATE        NULL,
    update_date DATETIME    DEFAULT CURRENT_TIMESTAMP
)
ENGINE = OLAP
DUPLICATE KEY (id)
COMMENT 'OLAP'
DISTRIBUTED BY HASH (id) BUCKETS 3
PROPERTIES (
    -- Replica count 3 on the default location tag.
    "replication_allocation" = "tag.location.default: 3",
    -- Binds the table to the storage policy named test_policy.
    "storage_policy" = "test_policy"
);
4、添加数据
-- Load seven sample rows into policy_test.
-- The column list is explicit so the statement does not silently depend on the
-- table's column order and keeps working if columns are added later.
INSERT INTO policy_test (id, name, age, create_date, update_date)
VALUES
    (1, '1', 23, '2024-01-15', '2024-01-15 09:43:00'),
    (2, '2', 24, '2024-01-15', '2024-01-15 09:43:00'),
    (3, '3', 25, '2024-01-15', '2024-01-15 09:43:00'),
    (4, '4', 26, '2024-01-15', '2024-01-15 09:43:00'),
    (5, '5', 23, '2024-01-15', '2024-01-15 09:43:00'),
    (6, '6', 24, '2024-01-15', '2024-01-15 09:43:00'),
    (7, '7', 26, '2024-01-15', '2024-01-15 09:43:00');
5、等待数据迁移完成后进行验证
-- Cluster-level view of cold data: check the RemoteUsedCapacity column.
SHOW PROC '/backends';
-- Per-tablet view: check the RemoteDataSize column.
SHOW TABLETS FROM policy_test;
通过 show tablets from tableName 可以查看表中每个 tablet 在远端存储上占用的数据大小,即 RemoteDataSize 列。
-- Show where a tablet lives on disk. This is a shell command, not SQL;
-- tabletID presumably stands for a real tablet ID — substitute before running.
cd */tabletID && pwd
-- After a successful migration the local files are not deleted right away:
-- they are marked as trash and wait for the BE's unified cleanup.
-- List the trash first, then clean it up explicitly.
SHOW TRASH;
ADMIN CLEAN TRASH;
下面是我在本地查询并整理出的文件: