Created
January 23, 2025 08:34
-
-
Save jaakla/a3b02255121c2b15a43cbed1e0277d36 to your computer and use it in GitHub Desktop.
Generator of test data: about 3B order-line rows (generated from 2B orders).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import polars as pl | |
import numpy as np | |
from datetime import datetime, timedelta | |
# Constants
# Total number of orders to generate. Each order is expanded into 1-5 order
# lines, so the number of rows written is larger than this value.
NUM_ROWS = 2_000_000_000
# Number of orders generated per chunk; every chunk is written to its own file.
CHUNK_SIZE = 10_000_000
# Destination directory (with trailing slash) for the generated files.
data_folder = '/Users/jaak/Downloads/jupydata/'
# Common filename prefix; the chunk index and ".parquet" suffix are appended.
OUTPUT_FILE = f"{data_folder}generated_dataset_2B"
# Helper Functions | |
def generate_random_dates(start_date, end_date, num_samples):
    """Return ``num_samples`` dates drawn uniformly from ``[start_date, end_date]``.

    Args:
        start_date: First eligible day as a ``"YYYY-MM-DD"`` string.
        end_date: Last eligible day (inclusive) as a ``"YYYY-MM-DD"`` string.
        num_samples: Number of dates to draw.

    Returns:
        A list of ``datetime.date`` objects of length ``num_samples``.

    Raises:
        ValueError: If a date string does not match ``"%Y-%m-%d"`` or
            ``end_date`` is earlier than ``start_date``.
    """
    first_day = datetime.strptime(start_date, "%Y-%m-%d").date()
    last_day = datetime.strptime(end_date, "%Y-%m-%d").date()
    span_days = (last_day - first_day).days
    if span_days < 0:
        raise ValueError(
            f"end_date {end_date} is earlier than start_date {start_date}"
        )
    # randint's upper bound is exclusive, so +1 keeps end_date eligible (and
    # makes a one-day range valid). A single vectorised draw replaces one
    # NumPy call per sample.
    day_offsets = np.random.randint(0, span_days + 1, size=num_samples)
    # tolist() yields plain Python ints, which timedelta accepts directly.
    return [first_day + timedelta(days=offset) for offset in day_offsets.tolist()]
def assign_season(month):
    """Map a month number to its season name.

    Months 12, 1, 2 are "winter"; 3-5 are "spring"; 6-8 are "summer".
    Every other value (including 9-11) maps to "fall".
    """
    season_months = (
        ("winter", (12, 1, 2)),
        ("spring", (3, 4, 5)),
        ("summer", (6, 7, 8)),
    )
    for season, members in season_months:
        if month in members:
            return season
    # Anything not matched above is treated as fall.
    return "fall"
def assign_location():
    """Draw a random location ID between 1 and 1000 with a skewed distribution.

    A first uniform draw selects an ID band according to the band weights;
    a second draw picks an ID uniformly inside the selected band.
    """
    # (first_id, last_id, weight) for each band of location IDs.
    bands = (
        (1, 50, 0.3),
        (51, 200, 0.5),
        (201, 500, 0.15),
        (501, 1000, 0.05),
    )
    draw = np.random.rand()
    # Fallback when no band is selected: the whole ID range.
    low, high = 1, 1000
    threshold = 0
    for first_id, last_id, weight in bands:
        threshold = threshold + weight
        if draw < threshold:
            low, high = first_id, last_id
            break
    # randint's upper bound is exclusive, hence the +1.
    return np.random.randint(low, high + 1)
def process_chunk(start, end):
    """Build the order-line DataFrame for order numbers in ``[start, end)``.

    One order is generated per integer in the range, with a random date in
    2023-2024, time of day, location and a line count of 1-5. Each order is
    then expanded into that many order-line rows, and every line receives a
    random quantity, discount rate and product ID.

    Returns:
        A polars DataFrame with one row per order line.
    """
    order_count = end - start

    # Order-level attributes. Dates are drawn first, then the dict entries
    # below are evaluated top to bottom, which fixes the order of random draws.
    order_dates = generate_random_dates("2023-01-01", "2024-12-31", order_count)
    order_months = [d.month for d in order_dates]
    orders = pl.DataFrame({
        "Order_ID": [f"order_{n}" for n in range(start, end)],
        "Order_Date": order_dates,
        "Month": order_months,
        "Season": [assign_season(m) for m in order_months],
        "Time_Of_Day": np.random.choice(
            ["Morning", "Afternoon", "Night"], order_count, p=[0.5, 0.3, 0.2]
        ),
        "Num_Lines": np.random.choice(
            [1, 2, 3, 4, 5], order_count, p=[0.6, 0.3, 0.05, 0.01, 0.04]
        ),
        "Location_ID": [assign_location() for _ in range(order_count)],
    })

    # Expand every order into Num_Lines rows; Line_Pos is 1-based.
    line_rows = [
        {
            "Order_ID": order["Order_ID"],
            "Order_Line_ID": f"{order['Order_ID']}_{pos}",
            "Order_Date": order["Order_Date"],
            "Time_Of_Day": order["Time_Of_Day"],
            "Season": order["Season"],
            "Month": order["Month"],
            "Location_ID": order["Location_ID"],
            "Line_Pos": pos,
        }
        for order in orders.iter_rows(named=True)
        for pos in range(1, order["Num_Lines"] + 1)
    ]
    lines = pl.DataFrame(line_rows)
    line_count = len(lines)

    # Line-level random columns: discount is 0 for 80% of lines, otherwise
    # uniform over 1-15; product IDs 1-13 use fixed per-product weights.
    quantity = pl.Series(
        "Quantity",
        np.random.choice([1, 2, 3, 4, 5], line_count, p=[0.4, 0.3, 0.15, 0.1, 0.05]),
    )
    discount_rate = pl.Series(
        "Discount_Rate",
        np.random.choice(list(range(16)), line_count, p=[0.8] + [0.2 / 15] * 15),
    )
    product_id = pl.Series(
        "Product_ID",
        np.random.choice(
            list(range(1, 14)),
            line_count,
            p=[0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.02, 0.02, 0.06],
        ),
    )
    return lines.with_columns([quantity, discount_rate, product_id])
# Process data in chunks
# Index of the first chunk to generate: 0 starts from the beginning, a larger
# value resumes an interrupted run. The starting row offset is derived from it,
# so the file numbering and the generated order IDs always stay in sync.
i = 93
# Ceiling division so a final partial chunk is counted too.
total_chunks = -(-NUM_ROWS // CHUNK_SIZE)
print("Processing data in chunks...")
print("number of chunks:", total_chunks)
for start in range(i * CHUNK_SIZE, NUM_ROWS, CHUNK_SIZE):
    end = min(start + CHUNK_SIZE, NUM_ROWS)
    chunk_df = process_chunk(start, end)
    fn = f"{OUTPUT_FILE}_{i}.parquet"
    chunk_df.write_parquet(fn)
    # Release the chunk before the next one is built to keep peak memory down.
    del chunk_df
    i += 1
    print(f"Chunk written to {fn}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
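Set `i = 0` to start from the beginning, or to a larger chunk index to resume an interrupted run; the loop's starting row offset must equal `i * CHUNK_SIZE`.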