Created
November 10, 2017 04:02
-
-
Save iamatypeofwalrus/373731be810abba3d02fa4163979163d to your computer and use it in GitHub Desktop.
Data Pipelines in Cloudformation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# CloudFormation template header: format version and stack description.
AWSTemplateFormatVersion: '2010-09-09'
Description: 'Backup DynamoDB table to S3, convert export to Parquet, and add table to Athena'
# Stack inputs. Each parameter is consumed by the BackupDataPipeline resource.
Parameters:
  # Name of the DynamoDB table to back up; also used as the prefix of the
  # pipeline name.
  TableName:
    Description: DynamoDB table name
    Type: String
  # Fraction of the table's read throughput the backup may consume.
  # NOTE(review): Data Pipeline's DynamoDBDataNode may require
  # readThroughputPercent >= 0.1 -- confirm before passing values below 0.1.
  BackupMaximumConsumedReadThroughput:
    Description: Percentage of table read throughput a backup can use. Expressed between 0.01 and 1.0. Defaults to 20% of available read throughput
    Type: Number
    MinValue: 0.01
    MaxValue: 1.0
    Default: 0.2
  # Number of hours joined with the word "hours" to form the EMR cluster's
  # terminateAfter value.
  BackupTimeout:
    Description: Backup timeout in hours
    Type: String
    # Quoted so the default is a string, matching the declared parameter type.
    Default: '3'
Resources:
  # TODOs
  # * Use exports.json as a template
  # * use dynamodb backups bucket for logs as backup-logs
  # * infer region from the cloudformation template and not as a parameter

  # Data Pipeline that copies the DynamoDB table named by TableName to S3
  # using a HiveCopyActivity running on a transient EMR cluster.
  BackupDataPipeline:
    Type: "AWS::DataPipeline::Pipeline"
    Properties:
      # Pipeline name is "<TableName>Backup".
      Name:
        Fn::Join:
          - ""
          - - Ref: TableName
            - Backup
      Description: "Pipeline to backup DynamoDB data to S3"
      Activate: true

      # Declarations of the pipeline-level parameters (the "my*" placeholders
      # referenced as #{...} inside PipelineObjects).
      ParameterObjects:
        - Id: "myDDBReadThroughputRatio"
          Attributes:
            - Key: "description"
              StringValue: "DynamoDB read throughput ratio"
            - Key: "type"
              StringValue: "Double"
            # Fallback only; the actual value is supplied in ParameterValues.
            - Key: "default"
              StringValue: "0.2"
        - Id: "myOutputS3Loc"
          Attributes:
            - Key: "description"
              StringValue: "S3 output bucket"
            - Key: "type"
              StringValue: "AWS::S3::ObjectKey"
        - Id: "myDDBTableName"
          Attributes:
            - Key: "description"
              StringValue: "DynamoDB Table Name "
            - Key: "type"
              StringValue: "String"

      # Values bound to the pipeline parameters declared above.
      ParameterValues:
        - Id: "myDDBTableName"
          StringValue:
            Ref: "TableName"
        # Wire the stack parameter into the pipeline. Without this entry the
        # pipeline always used the hard-coded "0.2" default and
        # BackupMaximumConsumedReadThroughput had no effect.
        - Id: "myDDBReadThroughputRatio"
          StringValue:
            Ref: "BackupMaximumConsumedReadThroughput"
        # Joining "s3:/", the bucket name and "raw" with "/" yields
        # "s3://<bucket>/raw".
        - Id: "myOutputS3Loc"
          StringValue:
            Fn::Join:
              - "/"
              - - "s3:/"
                - Fn::ImportValue: "dynamodb-backup-s3-bucket"
                - "raw"

      PipelineObjects:
        # Output node: a per-run directory under myOutputS3Loc named after the
        # scheduled start time.
        - Id: "S3BackupLocation"
          Name: "Copy data to this S3 location"
          Fields:
            - Key: "type"
              StringValue: "S3DataNode"
            - Key: "dataFormat"
              RefValue: "DDBExportFormat"
            - Key: "directoryPath"
              StringValue: "#{myOutputS3Loc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}"

        # Input node: the DynamoDB table, read at the configured throughput
        # ratio.
        - Id: "DDBSourceTable"
          Name: "DDBSourceTable"
          Fields:
            - Key: "tableName"
              StringValue: "#{myDDBTableName}"
            - Key: "type"
              StringValue: "DynamoDBDataNode"
            - Key: "dataFormat"
              RefValue: "DDBExportFormat"
            - Key: "readThroughputPercent"
              StringValue: "#{myDDBReadThroughputRatio}"

        # Data format shared by the input and output nodes.
        - Id: "DDBExportFormat"
          Name: "DDBExportFormat"
          Fields:
            - Key: "type"
              StringValue: "DynamoDBExportDataFormat"

        # The copy activity: DDBSourceTable -> S3BackupLocation, executed on
        # EmrClusterForBackup.
        - Id: "TableBackupActivity"
          Name: "TableBackupActivity"
          Fields:
            - Key: "resizeClusterBeforeRunning"
              StringValue: "true"
            - Key: "type"
              StringValue: "HiveCopyActivity"
            - Key: "input"
              RefValue: "DDBSourceTable"
            - Key: "runsOn"
              RefValue: "EmrClusterForBackup"
            - Key: "output"
              RefValue: "S3BackupLocation"

        # Pipeline-wide defaults: on-demand scheduling and the IAM roles
        # exported by another stack.
        - Id: "Default"
          Name: "Default"
          Fields:
            - Key: "type"
              StringValue: "Default"
            - Key: "scheduleType"
              StringValue: "ondemand"
            - Key: "failureAndRerunMode"
              StringValue: "CASCADE"
            - Key: "role"
              StringValue:
                Fn::ImportValue: "dynamodb-backup-data-pipelines-role"
            - Key: "resourceRole"
              StringValue:
                Fn::ImportValue: "dynamodb-backup-ec2-instance-profile"

        # EMR cluster that runs the backup; terminateAfter is
        # "<BackupTimeout> hours".
        - Id: "EmrClusterForBackup"
          Name: "EmrClusterForBackup"
          Fields:
            - Key: "terminateAfter"
              StringValue:
                Fn::Join:
                  - " "
                  - - Ref: BackupTimeout
                    - "hours"
            - Key: "amiVersion"
              StringValue: "3.3.2"
            - Key: "masterInstanceType"
              StringValue: "m1.medium"
            - Key: "coreInstanceType"
              StringValue: "m1.medium"
            - Key: "coreInstanceCount"
              StringValue: "1"
            - Key: "type"
              StringValue: "EmrCluster"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment