Skip to content

Instantly share code, notes, and snippets.

@jaakla
Created January 23, 2025 08:34
Show Gist options
  • Save jaakla/a3b02255121c2b15a43cbed1e0277d36 to your computer and use it in GitHub Desktop.
Save jaakla/a3b02255121c2b15a43cbed1e0277d36 to your computer and use it in GitHub Desktop.
Generator of test data: about 3B order-line rows (from 2B orders)
import polars as pl
import numpy as np
from datetime import datetime, timedelta
# Constants
# Exclusive upper bound of the order index range walked by the chunk loop.
# Each index becomes one order, which is later expanded into 1-5 line rows,
# so the number of rows written is larger than this value.
# Large number for demonstration; adjust for actual hardware limits.
NUM_ROWS = 2_000_000_000
# Number of orders handled per chunk; every chunk is written to its own file.
CHUNK_SIZE = 10_000_000
# Destination directory (keeps its trailing slash so it can be used as a prefix).
data_folder = '/Users/jaak/Downloads/jupydata/'
# Common path prefix of the output files; the chunk index and ".parquet" are appended.
OUTPUT_FILE = f"{data_folder}generated_dataset_2B"
# Helper Functions
def generate_random_dates(start_date, end_date, num_samples):
    """Return ``num_samples`` random calendar dates within a date range.

    Args:
        start_date: First eligible day, formatted as ``YYYY-MM-DD``.
        end_date: Last eligible day (inclusive), formatted as ``YYYY-MM-DD``.
        num_samples: Number of dates to draw; values <= 0 yield an empty list.

    Returns:
        A list of ``datetime.date`` objects drawn uniformly with replacement.

    Raises:
        ValueError: If a date string does not match ``YYYY-MM-DD`` or if
            ``end_date`` is earlier than ``start_date``.
    """
    first_day = datetime.strptime(start_date, "%Y-%m-%d").date()
    last_day = datetime.strptime(end_date, "%Y-%m-%d").date()
    span_days = (last_day - first_day).days
    if span_days < 0:
        raise ValueError(
            f"end_date {end_date!r} is earlier than start_date {start_date!r}"
        )
    # randint's upper bound is exclusive: the +1 keeps end_date reachable and
    # makes a single-day range valid instead of raising.
    # A single vectorised draw replaces one randint call per sample.
    offsets = np.random.randint(0, span_days + 1, size=max(num_samples, 0))
    # tolist() converts numpy integers to plain ints accepted by timedelta.
    return [first_day + timedelta(days=offset) for offset in offsets.tolist()]
def assign_season(month):
    """Return the season label for a month number.

    Months 12, 1, 2 map to "winter"; 3-5 to "spring"; 6-8 to "summer".
    Any other value (including 9-11) maps to "fall".
    """
    season_months = (
        ("winter", (12, 1, 2)),
        ("spring", (3, 4, 5)),
        ("summer", (6, 7, 8)),
    )
    for season, members in season_months:
        if month in members:
            return season
    # No explicit table entry: everything else is treated as fall.
    return "fall"
def assign_location():
    """Draw a random location id from a weighted set of id ranges.

    A uniform draw selects one of four inclusive id ranges according to its
    weight (1-50: 0.3, 51-200: 0.5, 201-500: 0.15, 501-1000: 0.05); the id is
    then drawn uniformly from the selected range.
    """
    weighted_ranges = (
        (1, 50, 0.3),
        (51, 200, 0.5),
        (201, 500, 0.15),
        (501, 1000, 0.05),
    )
    draw = np.random.rand()
    threshold = 0
    for low, high, weight in weighted_ranges:
        threshold += weight
        if draw >= threshold:
            continue
        # randint excludes its upper bound, hence high + 1.
        return np.random.randint(low, high + 1)
    # Safety net in case no range was selected: any id from 1 to 1000.
    return np.random.randint(1, 1001)
def process_chunk(start, end):
    """Generate the order-line DataFrame for order indices ``start`` to ``end - 1``.

    One order is created per index, with a random date, time of day, line
    count (1-5) and location. Each order is then expanded into one row per
    line, and random quantity, discount rate and product id columns are added.

    Args:
        start: First order index (inclusive).
        end: Last order index (exclusive).

    Returns:
        A polars DataFrame with one row per order line.
    """
    num_orders = end - start

    # Order-level attributes.
    order_ids = [f"order_{idx}" for idx in range(start, end)]
    order_dates = generate_random_dates("2023-01-01", "2024-12-31", num_orders)
    order_months = [order_date.month for order_date in order_dates]
    order_seasons = [assign_season(month) for month in order_months]
    times_of_day = np.random.choice(
        ["Morning", "Afternoon", "Night"], num_orders, p=[0.5, 0.3, 0.2]
    )
    line_counts = np.random.choice(
        [1, 2, 3, 4, 5], num_orders, p=[0.6, 0.3, 0.05, 0.01, 0.04]
    )
    location_ids = [assign_location() for _ in range(num_orders)]

    orders_df = pl.DataFrame(
        {
            "Order_ID": order_ids,
            "Order_Date": order_dates,
            "Month": order_months,
            "Season": order_seasons,
            "Time_Of_Day": times_of_day,
            "Num_Lines": line_counts,
            "Location_ID": location_ids,
        }
    )

    # Expand every order into Num_Lines rows, numbered from 1.
    line_rows = [
        {
            "Order_ID": order["Order_ID"],
            "Order_Line_ID": f"{order['Order_ID']}_{position}",
            "Order_Date": order["Order_Date"],
            "Time_Of_Day": order["Time_Of_Day"],
            "Season": order["Season"],
            "Month": order["Month"],
            "Location_ID": order["Location_ID"],
            "Line_Pos": position,
        }
        for order in orders_df.iter_rows(named=True)
        for position in range(1, order["Num_Lines"] + 1)
    ]
    lines_df = pl.DataFrame(line_rows)
    total_lines = len(lines_df)

    # Line-level random attributes.
    quantities = np.random.choice(
        [1, 2, 3, 4, 5], total_lines, p=[0.4, 0.3, 0.15, 0.1, 0.05]
    )
    # 80% of lines get no discount; the rest is spread evenly over 1-15.
    discount_rates = np.random.choice(
        [0] + list(range(1, 16)), total_lines, p=[0.8] + [0.2 / 15] * 15
    )
    product_ids = np.random.choice(
        list(range(1, 14)),
        total_lines,
        p=[0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.02, 0.02, 0.06],
    )

    return lines_df.with_columns([
        pl.Series("Quantity", quantities),
        pl.Series("Discount_Rate", discount_rates),
        pl.Series("Product_ID", product_ids),
    ])
# Process data in chunks
# Index of the first chunk to generate: 0 starts from the beginning, a larger
# value resumes an interrupted run. The starting row offset and the output
# file number are both derived from this single value so they cannot drift
# apart (previously they were two separately hard-coded numbers).
START_CHUNK = 93
# Ceiling division so a final partial chunk is counted as well.
TOTAL_CHUNKS = -(-NUM_ROWS // CHUNK_SIZE)
print("Processing data in chunks...")
print("number of chunks:", TOTAL_CHUNKS)
for i in range(START_CHUNK, TOTAL_CHUNKS):
    start = i * CHUNK_SIZE
    # The last chunk may be shorter when NUM_ROWS is not a multiple of CHUNK_SIZE.
    end = min(start + CHUNK_SIZE, NUM_ROWS)
    chunk_df = process_chunk(start, end)
    fn = f"{OUTPUT_FILE}_{i}.parquet"
    chunk_df.write_parquet(fn)
    print(f"Chunk written to {fn}")
@jaakla
Copy link
Author

jaakla commented Jan 23, 2025

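To start from the beginning, set the starting chunk index to 0; to resume an interrupted run, set it to the index of the next chunk to generate. The starting row offset of the loop must equal that index × CHUNK_SIZE.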

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment