问题：

通过数据管道复制DynamoDB表，得到的副本不完整

慕璞

2023-03-14

我有一个14.05GB的DynamoDB表，其中有140,000,000个项目。我试图使用数据管道克隆它（到同一区域），但管道完成后，目标表只有大约160,000个项目（我等了6个小时，待项目计数更新后才查看）。

我将每个表的吞吐量设置为256，管道大约需要20分钟完成。有什么原因可能导致管道只复制表的一部分？表的大小和项目数量是否存在隐性限制？我已经尝试了3次，每次结果都类似：“完成”后的目标表只包含1.4亿个项目中的9万到15万个。

我还确保最大执行时间设置得非常高。

数据管道是快速复制Dynamo表的最简单方法吗？

谢谢

共有2个答案

水瀚漠

2023-03-14

使用AWS数据管道，可以将一个DynamoDB表复制到另一个DynamoDB表。下面是一个示例管道定义。

{
  "objects": [
    {
      "id": "DailySchedule",
      "name": "DailySchedule",
      "type": "Schedule",
      "startAt": "FIRST_ACTIVATION_DATE_TIME",
      "period": "1 day",
      "occurrences": "1"
    },
    {
      "id": "Default",
      "name": "Default",
      "scheduleType": "CRON",
      "schedule": {
        "ref": "DailySchedule"
      },
      "pipelineLogUri": "#{myS3LogsPath}",
      "failureAndRerunMode": "CASCADE",
      "role": "DataPipelineDefaultRole",
      "resourceRole": "DataPipelineDefaultResourceRole"
    },
    {
      "id": "DDBSourceTable",
      "name": "DDBSourceTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBSourceTableName}",
      "readThroughputPercent": "#{myDDBReadThroughputRatio}"
    },
    {
      "id": "S3TempLocation",
      "name": "S3TempLocation",
      "type": "S3DataNode",
      "directoryPath": "#{myTempS3Folder}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}"
    },
    {
      "id": "DDBDestinationTable",
      "name": "DDBDestinationTable",
      "type": "DynamoDBDataNode",
      "tableName": "#{myDDBDestinationTableName}",
      "writeThroughputPercent": "#{myDDBWriteThroughputRatio}"
    },
    {
      "id": "EmrClusterForBackup",
      "name": "EmrClusterForBackup",
      "type": "EmrCluster",
      "releaseLabel": "emr-4.2.0",
      "masterInstanceType": "m3.xlarge",
      "coreInstanceType": "m3.xlarge",
      "coreInstanceCount": "1",
      "region": "#{myDDBSourceRegion}",
      "terminateAfter": "6 Hours"
    },
    {
      "id": "EmrClusterForLoad",
      "name": "EmrClusterForLoad",
      "type": "EmrCluster",
      "releaseLabel": "emr-4.2.0",
      "masterInstanceType": "m3.xlarge",
      "coreInstanceType": "m3.xlarge",
      "coreInstanceCount": "1",
      "region": "#{myDDBDestinationRegion}",
      "terminateAfter": "6 Hours"
    },
    {
      "id": "TableLoadActivity",
      "name": "TableLoadActivity",
      "type": "EmrActivity",
      "runsOn": {
        "ref": "EmrClusterForLoad"
      },
      "input": {
        "ref": "S3TempLocation"
      },
      "output": {
        "ref": "DDBDestinationTable"
      },
      "dependsOn": {
        "ref": "TableBackupActivity"
      },
      "maximumRetries": "2",
      "resizeClusterBeforeRunning": "true",
      "step": [
        "s3://dynamodb-emr-#{myDDBDestinationRegion}/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbImport,#{input.directoryPath},#{output.tableName},#{output.writeThroughputPercent}"
      ]
    },
    {
      "id": "TableBackupActivity",
      "name": "TableBackupActivity",
      "type": "EmrActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "input": {
        "ref": "DDBSourceTable"
      },
      "output": {
        "ref": "S3TempLocation"
      },
      "maximumRetries": "2",
      "resizeClusterBeforeRunning": "true",
      "step": [
        "s3://dynamodb-emr-#{myDDBSourceRegion}/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}"
      ]
    },
    {
      "id": "S3CleanupActivity",
      "name": "S3CleanupActivity",
      "type": "ShellCommandActivity",
      "runsOn": {
        "ref": "EmrClusterForBackup"
      },
      "input": {
        "ref": "S3TempLocation"
      },
      "dependsOn": {
        "ref": "TableLoadActivity"
      },
      "command": "(sudo yum -y update aws-cli) && (aws s3 rm #{input.directoryPath} --recursive)"
    }
  ],
  "parameters": [
    {
      "id": "myS3LogsPath",
      "type": "AWS::S3::ObjectKey",
      "description": "S3 path for pipeline logs.",
      "myComment": "This Parameter specifies the S3 logging path for the pipeline.  It is used by the 'Default' object to set the 'pipelineLogUri' value."
    },
    {
      "id": "myDDBSourceTableName",
      "type": "String",
      "description": "Source DynamoDB table name"
    },
    {
      "id": "myDDBDestinationTableName",
      "type": "String",
      "description": "Target DynamoDB table name"
    },
    {
      "id": "myDDBWriteThroughputRatio",
      "type": "Double",
      "description": "DynamoDB write throughput ratio",
      "default": "0.25",
      "watermark": "Enter value between 0.1-1.0"
    },
    {
      "id": "myDDBSourceRegion",
      "type": "String",
      "description": "Region of the DynamoDB table",
      "default": "us-east-1",
      "watermark": "us-east-1"
    },
    {
      "id": "myDDBDestinationRegion",
      "type": "String",
      "description": "Region of the DynamoDB table",
      "default": "us-east-1",
      "watermark": "us-east-1"
    },
    {
      "id": "myDDBReadThroughputRatio",
      "type": "Double",
      "description": "DynamoDB read throughput ratio",
      "default": "0.25",
      "watermark": "Enter value between 0.1-1.0"
    },
    {
      "id": "myTempS3Folder",
      "type": "AWS::S3::ObjectKey",
      "description": "Temporary S3 folder",
      "myComment": "Temporary S3 path to store the dynamodb backup csv files, backup files will be deleted after the copy completes"
    }
  ]
}

燕烨

2023-03-14

亚马逊已经回复了我的支持工单，并确认这是数据管道中的一个已知问题（bug）。

他们向我推荐了这个Java程序：https://github.com/awslabs/dynamodb-import-export-tool ，用它先将表导出到S3，然后再导入回DynamoDB。

通过数据管道复制DynamoDB表，得到的副本不完整

共有2个答案

相关问答

相关文章

相关阅读

相关工具

相关文档