@chris
Created January 15, 2025 23:18
Set up AWS Glue and Athena to query DynamoDB data
#!/bin/bash
set -e
#
# This script refreshes an Athena database with data from a DynamoDB table. It
# exports the DynamoDB table to S3, then creates an AWS Glue crawler which,
# when run, creates a Glue table that Athena can query.
# Finally, it cleans up older exports, crawlers, and tables.
#
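# Note: this script assumes the AWS CLI and jq are installed, and that
# point-in-time recovery (PITR) is enabled on the source table, since
# export-table-to-point-in-time requires it. If PITR is off, enable it
# once before running this script, e.g.:
#
#   aws dynamodb update-continuous-backups \
#     --table-name YOURTABLENAME \
#     --point-in-time-recovery-specification PointInTimeRecoveryEnabled=true
#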
datestamp=$(date +%Y%m%d)
dynamodb_table_arn="arn:aws:dynamodb:us-east-2:YOURACCOUNT:table/YOURTABLENAME"
s3_bucket="my-dynamodb-exports"
s3_export_path="s3://${s3_bucket}/${datestamp}/AWSDynamoDB/"
glue_database_name="YOURTABLENAME-ddb-s3"
crawler_basename="My DynamoDB Crawler Name"
glue_table_prefix="YOURTABLENAME_${datestamp}_"
echo "Using new date stamp of: ${datestamp}"
echo "Starting DynamoDB S3 export."
# create a DynamoDB export to S3 and capture the export ARN
dynamo_export_arn=$(aws dynamodb export-table-to-point-in-time \
  --table-arn "${dynamodb_table_arn}" \
  --s3-bucket "${s3_bucket}" \
  --s3-prefix "${datestamp}" \
  --export-format DYNAMODB_JSON \
  --export-type FULL_EXPORT \
  --s3-sse-algorithm AES256 \
  --output json \
  --no-cli-pager | jq -r '.ExportDescription.ExportArn')
echo "DynamoDB S3 Export ARN: ${dynamo_export_arn}"
echo "Waiting for DynamoDB S3 Export to complete (usually about 15-20 minutes)..."
# poll status to wait until done - run this periodically, until finds "COMPLETED"
# this usually takes about 15 minutes
while ! aws dynamodb describe-export --export-arn "${dynamo_export_arn}" | grep "COMPLETED"; do
echo -n "."
sleep 60
done
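# Caveat: the loop above assumes the export succeeds; a FAILED export would
# poll forever. If you want a stricter check, one sketch is to read the
# status field directly and bail out on failure:
#
#   status=$(aws dynamodb describe-export --export-arn "${dynamo_export_arn}" \
#     --query 'ExportDescription.ExportStatus' --output text)
#   [ "${status}" = "FAILED" ] && { echo "Export failed"; exit 1; }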
echo "DynamoDB S3 Export completed"
# list the bucket to find the export ID/full path we need for the crawler target
s3_export_id=$(aws s3 ls "${s3_export_path}" | awk '{print $2}')
s3_path="${s3_export_path}${s3_export_id}data/"
echo "S3 Path for Glue Crawler: ${s3_path}"
# Create a new Glue table using a crawler - first create the crawler, then run it to create the table
crawler_name="${crawler_basename} ${datestamp}"
aws glue create-crawler \
  --name "${crawler_name}" \
  --role arn:aws:iam::YOURACCOUNT:role/Glue-Dynamo-S3-Athena \
  --database-name "${glue_database_name}" \
  --table-prefix "${glue_table_prefix}" \
  --targets '{"S3Targets": [{"Path": "'"${s3_path}"'"}]}' \
  --no-cli-pager
echo "Created Glue crawler, name: ${crawler_name}"
aws glue start-crawler --name "${crawler_name}"
echo "Started Glue crawler - wait for this to finish before querying the table."
#
# clean up previous setup
#
# Find the previous S3 export and delete it - this relies on there being at most one
old_export=$(aws s3 ls "s3://${s3_bucket}/" | grep -v "${datestamp}" | awk '{print $2}')
# make sure old_export is not empty
if [ -z "${old_export}" ]; then
  echo "No old S3 export found, assuming nothing to clean up, exiting."
  exit 0
fi
echo "Deleting old S3 export: ${old_export}"
aws s3 rm "s3://${s3_bucket}/${old_export}" --recursive
# strip the trailing slash from the old export directory name to get its datestamp
old_export_datestamp=${old_export%/}
# Delete the older Glue crawler
aws glue delete-crawler --name "${crawler_basename} ${old_export_datestamp}"
echo "Deleted older Glue crawler."
# Delete the older Glue table (its name follows the same prefix convention as above)
aws glue delete-table --database-name "${glue_database_name}" --name "YOURTABLENAME_${old_export_datestamp}_data"
echo "Deleted older Glue table."
echo "Done - update your Athena queries to use the table: ${glue_table_prefix}data"