当前位置: 首页 > 工具软件 > Apache Druid > 使用案例 >

Apache Druid的数据删除

程博学
2023-12-01

1. 加载测试数据

从quickstart/tutorial/wikiticker-2015-09-12-sampled.json.gz数据文件读取wikipedia数据,创建一个名称为deletion-tutorial的数据源

deletion-index.json内容如下,创建的segment为小时粒度

[root@bigdata001 apache-druid-0.22.1]# cat quickstart/tutorial/deletion-index.json 
{
  "type" : "index_parallel",
  "spec" : {
    "dataSchema" : {
      "dataSource" : "deletion-tutorial",
      "timestampSpec": {
        "column": "time",
        "format": "iso"
      },
      "dimensionsSpec" : {
        "dimensions" : [
          "channel",
          "cityName",
          "comment",
          "countryIsoCode",
          "countryName",
          "isAnonymous",
          "isMinor",
          "isNew",
          "isRobot",
          "isUnpatrolled",
          "metroCode",
          "namespace",
          "page",
          "regionIsoCode",
          "regionName",
          "user",
          { "name": "added", "type": "long" },
          { "name": "deleted", "type": "long" },
          { "name": "delta", "type": "long" }
        ]
      },
      "metricsSpec" : [],
      "granularitySpec" : {
        "type" : "uniform",
        "segmentGranularity" : "hour",
        "queryGranularity" : "none",
        "intervals" : ["2015-09-12/2015-09-13"],
        "rollup" : false
      }
    },
    "ioConfig" : {
      "type" : "index_parallel",
      "inputSource" : {
        "type" : "local",
        "baseDir" : "quickstart/tutorial/",
        "filter" : "wikiticker-2015-09-12-sampled.json.gz"
      },
      "inputFormat" : {
        "type" : "json"
      },
      "appendToExisting" : false
    },
    "tuningConfig" : {
      "type" : "index_parallel",
      "maxRowsPerSegment" : 5000000,
      "maxRowsInMemory" : 25000
    }
  }
}
[root@bigdata001 apache-druid-0.22.1]# 

在命令行执行task

[root@bigdata001 apache-druid-0.22.1]# 
[root@bigdata001 apache-druid-0.22.1]# bin/post-index-task --file quickstart/tutorial/deletion-index.json --url http://bigdata003:9081
Beginning indexing data for deletion-tutorial
Task started: index_parallel_deletion-tutorial_eeegkkll_2022-04-01T08:44:41.767Z
Task log:     http://bigdata003:9081/druid/indexer/v1/task/index_parallel_deletion-tutorial_eeegkkll_2022-04-01T08:44:41.767Z/log
Task status:  http://bigdata003:9081/druid/indexer/v1/task/index_parallel_deletion-tutorial_eeegkkll_2022-04-01T08:44:41.767Z/status
Task index_parallel_deletion-tutorial_eeegkkll_2022-04-01T08:44:41.767Z still running...
Task index_parallel_deletion-tutorial_eeegkkll_2022-04-01T08:44:41.767Z still running...
Task index_parallel_deletion-tutorial_eeegkkll_2022-04-01T08:44:41.767Z still running...
Task index_parallel_deletion-tutorial_eeegkkll_2022-04-01T08:44:41.767Z still running...
Task finished with status: SUCCESS
Completed indexing data for deletion-tutorial. Now loading indexed data onto the cluster...
[root@bigdata001 apache-druid-0.22.1]# 

查询数据源数据

dsql> select * from "deletion-tutorial" limit 1;
┌──────────────────────────┬───────┬───────────────┬──────────┬───────────────┬────────────────┬─────────────┬─────────┬───────┬─────────────┬─────────┬───────┬─────────┬───────────────┬───────────┬───────────┬──────────────────────┬───────────────┬────────────┬──────────────┐
│ __time                   │ added │ channel       │ cityName │ comment       │ countryIsoCode │ countryName │ deleted │ delta │ isAnonymous │ isMinor │ isNew │ isRobot │ isUnpatrolled │ metroCode │ namespace │ page                 │ regionIsoCode │ regionName │ user         │
├──────────────────────────┼───────┼───────────────┼──────────┼───────────────┼────────────────┼─────────────┼─────────┼───────┼─────────────┼─────────┼───────┼─────────┼───────────────┼───────────┼───────────┼──────────────────────┼───────────────┼────────────┼──────────────┤
│ 2015-09-12T00:46:58.771Z │    36 │ #en.wikipedia │          │ added project │                │             │       0 │    36 │ false       │ false   │ false │ false   │ false         │           │ Talk      │ Talk:Oswald Tilghman │               │            │ GELongstreet │
└──────────────────────────┴───────┴───────────────┴──────────┴───────────────┴────────────────┴─────────────┴─────────┴───────┴─────────────┴─────────┴───────┴─────────┴───────────────┴───────────┴───────────┴──────────────────────┴───────────────┴────────────┴──────────────┘
Retrieved 1 row in 0.06s.

dsql> 
dsql> select count(*) from "deletion-tutorial" limit 1;
┌────────┐
│ EXPR$0 │
├────────┤
│  39244 │
└────────┘
Retrieved 1 row in 0.46s.

dsql> 

2. 永久删除数据

永久删除一个segment需要两步:

  1. 先将segment标记为"unused"。可以用Coordinator API按时间interval或segment ID进行标记
  2. 再执行一个Kill任务,删除Druid的metadata store和deep storage中所有"unused"的segment

2.1 按时间interval进行标记

下面的命令对18和19小时的segment进行标记

[root@bigdata001 apache-druid-0.22.1]# curl -X 'POST' -H 'Content-Type:application/json' -d '{"interval" : "2015-09-12T18:00:00.000Z/2015-09-12T20:00:00.000Z"}' http://bigdata003:9081/druid/coordinator/v1/datasources/deletion-tutorial/markUnused
{"numChangedSegments":2}[root@bigdata001 apache-druid-0.22.1]#

2.2 按segment ID进行标记

对quickstart/tutorial/deletion-disable-segments.json进行修改,修改成我们自己的segment ID

这里我们对13和14小时的segment进行标记,修改后的文件内容见下方的cat输出

然后同步到Druid集群所有服务器的quickstart/tutorial目录下

[root@bigdata001 apache-druid-0.22.1]# cat quickstart/tutorial/deletion-disable-segments.json 
{
  "segmentIds":
  [
    "deletion-tutorial_2015-09-12T13:00:00.000Z_2015-09-12T14:00:00.000Z_2022-04-01T08:44:41.783Z",
    "deletion-tutorial_2015-09-12T14:00:00.000Z_2015-09-12T15:00:00.000Z_2022-04-01T08:44:41.783Z"
  ]
}
[root@bigdata001 apache-druid-0.22.1]# 

在命令行调用Coordinator API,按segment ID进行标记

[root@bigdata001 apache-druid-0.22.1]# curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/deletion-disable-segments.json http://bigdata003:9081/druid/coordinator/v1/datasources/deletion-tutorial/markUnused
{"numChangedSegments":2}[root@bigdata001 apache-druid-0.22.1]#

2.3 执行kill任务

deletion-kill.json的内容如下

[root@bigdata001 apache-druid-0.22.1]# 
[root@bigdata001 apache-druid-0.22.1]# cat quickstart/tutorial/deletion-kill.json 
{
  "type": "kill",
  "dataSource": "deletion-tutorial",
  "interval" : "2015-09-12/2015-09-13"
}
[root@bigdata001 apache-druid-0.22.1]#

提交kill任务到Overlord

[root@bigdata001 apache-druid-0.22.1]# curl -X 'POST' -H 'Content-Type:application/json' -d @quickstart/tutorial/deletion-kill.json http://bigdata003:9081/druid/indexer/v1/task
{"task":"kill_deletion-tutorial_iilainag_2015-09-12T00:00:00.000Z_2015-09-13T00:00:00.000Z_2022-04-01T09:42:04.383Z"}[root@bigdata001 apache-druid-0.22.1]#

Kill任务完成后,被标记为"unused"的segment将从metadata store和Deep storage中永久删除

 类似资料: