Skip to content

Instantly share code, notes, and snippets.

@BarakStout
Created September 19, 2023 16:39
Show Gist options
  • Save BarakStout/f52ef11997dab94cfe64998aa087dc34 to your computer and use it in GitHub Desktop.
Save BarakStout/f52ef11997dab94cfe64998aa087dc34 to your computer and use it in GitHub Desktop.
import pandas as pd
import os
import deltalake
# MinIO connection settings.  Each value can be overridden through the
# environment so that credentials do not have to be edited into the script;
# the literals are only fallbacks and match the previous hard-coded values.
minio_url = os.environ.get("MINIO_URL", "http://df-minio:9000")
minio_access = os.environ.get("MINIO_ACCESS_KEY", "console")
minio_secret = os.environ.get("MINIO_SECRET_KEY", "console123")
# Object names to convert, one per line, in processing order.
# NOTE(review): "folder-init" is the only entry without a ".parquet" suffix;
# presumably a bucket placeholder rather than a table - confirm it belongs here.
tables = """
    call_center.parquet
    catalog_page.parquet
    catalog_returns.parquet
    catalog_sales.parquet
    customer.parquet
    customer_address.parquet
    customer_demographics.parquet
    date_dim.parquet
    dbgen_version.parquet
    folder-init
    household_demographics.parquet
    income_band.parquet
    inventory.parquet
    item.parquet
    promotion.parquet
    reason.parquet
    ship_mode.parquet
    store.parquet
    store_returns.parquet
    store_sales.parquet
    time_dim.parquet
    warehouse.parquet
    web_page.parquet
    web_returns.parquet
    web_sales.parquet
    web_site.parquet
""".split()
# Disabled single-table override:
# tables = ["call_center.parquet"]
# Convert each listed parquet object into a Delta table under the warehouse
# prefix.  The connection options are loop-invariant, so build them once.
s3_options = {
    "key": minio_access,
    "secret": minio_secret,
    "endpoint_url": minio_url,
}
# The Delta writer gets the same settings plus a region and the unsafe-rename
# flag (see the delta-rs S3 backend docs for when that flag is required).
# NOTE(review): "key"/"secret" are s3fs-style option names; confirm deltalake
# honours them - its S3 docs use AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY.
delta_options = {
    **s3_options,
    "AWS_REGION": "us-east-1",
    "AWS_S3_ALLOW_UNSAFE_RENAME": "true",
}

for table in tables:
    # Everything before the last "." names the Delta table directory.
    stem, dot, _ext = table.rpartition(".")
    if not dot:
        # Names without an extension (e.g. "folder-init") cannot be mapped to
        # a table; str.rindex(".") would raise ValueError on them and abort
        # the whole run, so skip them explicitly instead.
        print("Skipping " + table + " (no file extension)")
        continue
    table_name = stem + "/"
    print("Starting table... " + table + " | " + table_name)

    df = pd.read_parquet(
        "s3://df-bucket/db100-parquet-clean/" + table,
        engine="pyarrow",
        storage_options=s3_options,
    )

    try:
        deltalake.write_deltalake(
            "s3://df-bucket/warehouse/" + table_name,
            df,
            storage_options=delta_options,
        )
    except Exception as exc:
        # Stay best-effort (keep going with the next table) but report the
        # real error: an existing table is only one of several reasons the
        # write can fail, and a bare except would also swallow Ctrl-C.
        print(
            "Table " + stem + " not written (already there?): "
            + type(exc).__name__ + ": " + str(exc)
        )

    # Optional post-write maintenance, left disabled:
    # table = deltalake.DeltaTable("s3://df-bucket/warehouse/"+table_name)
    # table.to_pandas()
    # table.vacuum()
    # table.optimize.compact()
    print("table done... \n\n")

print("done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment