@daviddwlee84
Last active May 31, 2024 08:18
Read S3 Parquet using config of `~/.s3cfg` (shared with `s3cmd`)
# ~/.s3cfg
[default]
access_key = AAAAAAAAAAAAAAAAAAAA
bucket_location = US
host_base = url.to.minio
secret_key = 0000000000000000000000000000000000000000000

import configparser
import os
import pyarrow.parquet as pq
import pyarrow.fs as fs
config = configparser.ConfigParser()
assert config.read(os.path.expanduser("~/.s3cfg")), "Loading ~/.s3cfg failed"
access_key = config["default"]["access_key"]
secret_key = config["default"]["secret_key"]
host_base = config["default"]["host_base"]
region = config["default"]["bucket_location"]  # s3cmd's default "US" means us-east-1; MinIO generally ignores region
# Create an S3FileSystem object
# https://arrow.apache.org/docs/python/generated/pyarrow.fs.S3FileSystem.html
s3 = fs.S3FileSystem(
    access_key=access_key,
    secret_key=secret_key,
    endpoint_override=f"https://{host_base}",
    region=region,
)
# Define the S3 path to the Parquet files
# NOTE: Expected an S3 object path of the form 'bucket/key...'
s3_path = "bucket/key/path/to/dir"
# Create a ParquetDataset
# https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetDataset.html#
dataset = pq.ParquetDataset(s3_path, filesystem=s3)
# Read the dataset into a Table
table = dataset.read()
# Print the table
print(table)
# Convert first 2 rows to pandas dataframe
print(df := table.take([0, 1]).to_pandas())