Concurrently deletes all files in a folder (or with a given prefix) in an AWS S3 bucket. This is useful for a very large number of files.
package main

import (
	"fmt"
	"sync"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/awserr"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/s3"
)
// This script lists all files in a folder or with a prefix using pagination, and runs a batch delete for each page.
// To avoid rate-limit errors, this is done with 15 workers (each requesting the deletion of one group).

// Set these variables to point to what you want to delete.
var bucketName = "your-S3-bucket-name"
var s3Region = "us-east-1"

// This variable can be a folder name, or a prefix shared by every file you want to delete.
var prefixOrFolder = "folder-name"

var wg sync.WaitGroup
// Worker that deletes the files from one listing page in a single batch request.
func worker(id int, svc *s3.S3, jobs <-chan []*s3.Object) {
	sid := fmt.Sprintf("%d", id)
	for list := range jobs {
		fmt.Println("worker " + sid + " got job")
		// Build the list of object identifiers for the batch-delete input.
		objList := []*s3.ObjectIdentifier{}
		for _, obj := range list {
			objList = append(objList, &s3.ObjectIdentifier{Key: obj.Key})
		}
		input := &s3.DeleteObjectsInput{
			Bucket: aws.String(bucketName),
			Delete: &s3.Delete{
				Objects: objList,
				Quiet:   aws.Bool(false),
			},
		}
		_, err := svc.DeleteObjects(input)
		if err != nil {
			if aerr, ok := err.(awserr.Error); ok {
				fmt.Println("worker ERROR " + sid + " " + aerr.Error())
			} else {
				fmt.Println("worker ERROR " + sid + " " + err.Error())
			}
		}
		fmt.Println("worker " + sid + " got result")
		wg.Done()
	}
}
func main() {
	sess := session.Must(session.NewSession())
	svc := s3.New(sess, aws.NewConfig().WithRegion(s3Region))

	wg = sync.WaitGroup{}
	jobs := make(chan []*s3.Object, 16)
	// Start 15 workers; larger values tend to hit the S3 rate limit.
	for w := 1; w <= 15; w++ {
		go worker(w, svc, jobs)
	}

	i := 0
	// ListObjectsPages lists all matching objects one page at a time,
	// handing each page to a worker as a batch-delete job.
	err := svc.ListObjectsPages(&s3.ListObjectsInput{
		Bucket: aws.String(bucketName),
		Prefix: aws.String(prefixOrFolder),
	}, func(p *s3.ListObjectsOutput, last bool) (shouldContinue bool) {
		wg.Add(1)
		fmt.Println("Page", i)
		i++
		jobs <- p.Contents
		return true
	})
	// Wait for in-flight batches before reporting any listing error.
	wg.Wait()
	if err != nil {
		fmt.Println("failed to list objects", err)
		return
	}
}
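Before running the deletion, it can be worth confirming that the prefix matches only what you expect. Below is a minimal dry-run sketch of my own (the countObjects helper is not part of the gist) that reuses the same listing call to count matching keys without deleting anything:

// countObjects walks the same listing the deleter uses, but only counts keys.
// Hypothetical helper for a dry run; reuses bucketName and prefixOrFolder.
func countObjects(svc *s3.S3) (int64, error) {
	var total int64
	err := svc.ListObjectsPages(&s3.ListObjectsInput{
		Bucket: aws.String(bucketName),
		Prefix: aws.String(prefixOrFolder),
	}, func(p *s3.ListObjectsOutput, last bool) bool {
		total += int64(len(p.Contents))
		return true
	})
	return total, err
}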
Why?
The issue I had was that the folder contained so many small files (failed attempts at AWS Glue partitioned Parquet files) that neither the AWS web console nor the CLI was able to delete them.
I had the
aws s3 rm s3://bucket/folder --recursive
CLI command running for a whole night without success.

Important note:
The value of 15 workers was chosen because larger values hit the S3 rate limit often. Watch for errors, and if they happen frequently, reduce the number of workers.
If you had failures, just run it again to clear the leftovers.
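If throttling errors keep showing up, another option (a sketch of an alternative, not something this gist does) is to retry a failed batch with exponential backoff. This assumes an extra "time" import and arbitrary retry constants:

// deleteWithRetry retries a DeleteObjects call with exponential backoff.
// Sketch only: maxRetries and the base delay are arbitrary choices.
func deleteWithRetry(svc *s3.S3, input *s3.DeleteObjectsInput) error {
	const maxRetries = 5
	var err error
	for attempt := 0; attempt < maxRetries; attempt++ {
		if _, err = svc.DeleteObjects(input); err == nil {
			return nil
		}
		if attempt < maxRetries-1 {
			// Back off before the next attempt: 1s, 2s, 4s, 8s.
			time.Sleep(time.Duration(1<<attempt) * time.Second)
		}
	}
	return err
}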