jaakla · January 23, 2025 08:34 · jaakla · Jan 23, 2025
diff --git a/generate3b.py b/generate3b.py
 import polars as pl
 import numpy as np
 from datetime import datetime, timedelta

 # Constants
 NUM_ROWS = 2_000_000_000  # Large number for demonstration; adjust for actual hardware limits
 CHUNK_SIZE = 10_000_000  # Number of rows per chunk (file)

 data_folder = '/Users/jaak/Downloads/jupydata/'

 OUTPUT_FILE = data_folder + "generated_dataset_2B"

 # Helper Functions
 def generate_random_dates(start_date, end_date, num_samples):
     """Return ``num_samples`` random dates drawn uniformly from a closed range.

     Args:
         start_date: First eligible day, formatted as ``YYYY-MM-DD``.
         end_date: Last eligible day (inclusive), formatted as ``YYYY-MM-DD``.
         num_samples: Number of dates to generate.

     Returns:
         A list of ``datetime.date`` objects of length ``num_samples``.

     Raises:
         ValueError: If a date string does not match ``YYYY-MM-DD`` or
             ``end_date`` is earlier than ``start_date``.
     """
     first_day = datetime.strptime(start_date, "%Y-%m-%d").date()
     last_day = datetime.strptime(end_date, "%Y-%m-%d").date()
     span_days = (last_day - first_day).days
     if span_days < 0:
         raise ValueError("end_date must not be earlier than start_date")

     # One vectorized draw instead of one RNG call per sample. randint's upper
     # bound is exclusive, hence the +1 so that end_date itself can be drawn.
     offsets = np.random.randint(0, span_days + 1, size=num_samples)

     # tolist() converts to plain Python ints, which timedelta accepts.
     return [first_day + timedelta(days=offset) for offset in offsets.tolist()]

 def assign_season(month):
     """Map a month number to its season name.

     Args:
         month: Month number; 12, 1, 2 are winter, 3-5 spring, 6-8 summer.

     Returns:
         ``"winter"``, ``"spring"`` or ``"summer"`` for the months listed
         above, and ``"fall"`` for every other value.
     """
     season_months = (
         ("winter", (12, 1, 2)),
         ("spring", (3, 4, 5)),
         ("summer", (6, 7, 8)),
     )
     for season, members in season_months:
         if month in members:
             return season
     # Anything not matched above (9, 10, 11 and any other value) is fall.
     return "fall"

 def assign_location():
     """Draw a random location ID from four weighted ID bands.

     A band is chosen with probability 0.3 (IDs 1-50), 0.5 (51-200),
     0.15 (201-500) or 0.05 (501-1000); the ID is then uniform within the
     chosen band, both ends included.

     Returns:
         An integer location ID between 1 and 1000.
     """
     bands = ((1, 50), (51, 200), (201, 500), (501, 1000))
     weights = (0.3, 0.5, 0.15, 0.05)

     draw = np.random.rand()
     threshold = 0
     for (low, high), weight in zip(bands, weights):
         threshold += weight
         if draw < threshold:
             # randint's upper bound is exclusive, so +1 keeps `high` eligible.
             return np.random.randint(low, high + 1)

     # Safety net in case no band matched: uniform over all IDs.
     return np.random.randint(1, 1001)

 def process_chunk(start, end):
     """Build the order-line DataFrame for orders ``start`` to ``end - 1``.

     Every order gets a random date, time of day, location and line count
     (1-5). The result has one row per order line, and each line additionally
     gets a random quantity, discount rate and product ID.

     Args:
         start: Number of the first order in the chunk (inclusive).
         end: Number one past the last order in the chunk (exclusive).

     Returns:
         A ``polars.DataFrame`` with the columns ``Order_ID``,
         ``Order_Line_ID``, ``Order_Date``, ``Time_Of_Day``, ``Season``,
         ``Month``, ``Location_ID``, ``Line_Pos``, ``Quantity``,
         ``Discount_Rate`` and ``Product_ID``, in that order.
     """
     num_orders = end - start

     time_of_day_choices = ["Morning", "Afternoon", "Night"]
     time_of_day_probs = [0.5, 0.3, 0.2]
     num_lines_choices = [1, 2, 3, 4, 5]
     num_lines_probs = [0.6, 0.3, 0.05, 0.01, 0.04]

     # Order-level attributes: one row per order.
     dates = generate_random_dates("2023-01-01", "2024-12-31", num_orders)
     months = [date.month for date in dates]
     base_df = pl.DataFrame({
         "Order_ID": [f"order_{i}" for i in range(start, end)],
         "Order_Date": dates,
         "Month": months,
         "Season": [assign_season(month) for month in months],
         "Time_Of_Day": np.random.choice(time_of_day_choices, num_orders, p=time_of_day_probs),
         "Num_Lines": np.random.choice(num_lines_choices, num_orders, p=num_lines_probs),
         "Location_ID": [assign_location() for _ in range(num_orders)],
     })

     # Expand each order into one row per line inside polars. Building a
     # Python dict per line via iter_rows() dominates both run time and memory
     # when a chunk holds millions of orders.
     exploded_df = (
         base_df
         .with_columns(pl.int_ranges(1, pl.col("Num_Lines") + 1).alias("Line_Pos"))
         .explode("Line_Pos")
         .with_columns(
             pl.format("{}_{}", pl.col("Order_ID"), pl.col("Line_Pos")).alias("Order_Line_ID")
         )
         # Fix the column order and drop the helper column Num_Lines.
         .select([
             "Order_ID",
             "Order_Line_ID",
             "Order_Date",
             "Time_Of_Day",
             "Season",
             "Month",
             "Location_ID",
             "Line_Pos",
         ])
     )

     # Add random line-level columns
     num_line_rows = len(exploded_df)
     exploded_df = exploded_df.with_columns([
         pl.Series("Quantity", np.random.choice([1, 2, 3, 4, 5], num_line_rows, p=[0.4, 0.3, 0.15, 0.1, 0.05])),
         # 80% of lines carry no discount; the rest is spread evenly over 1-15.
         pl.Series("Discount_Rate", np.random.choice([0] + list(range(1, 16)), num_line_rows, p=[0.8] + [0.2 / 15] * 15)),
         pl.Series("Product_ID", np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], num_line_rows, p=[0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.02, 0.02, 0.06])),
     ])

     return exploded_df

 # Process data in chunks
 # Index of the first chunk to generate; lower-numbered chunks are skipped
 # (presumably already written by an earlier run - verify before changing).
 FIRST_CHUNK = 93
 # Ceiling division so that a final partial chunk is counted as well.
 TOTAL_CHUNKS = -(-NUM_ROWS // CHUNK_SIZE)

 print("Processing data in chunks...")
 print("number of chunks:", TOTAL_CHUNKS)

 for i in range(FIRST_CHUNK, TOTAL_CHUNKS):
     # Derive the row offset from the chunk index so the file suffix and the
     # order-number range cannot drift apart.
     start = i * CHUNK_SIZE
     end = min(start + CHUNK_SIZE, NUM_ROWS)
     chunk_df = process_chunk(start, end)
     fn = f"{OUTPUT_FILE}_{i}.parquet"
     chunk_df.write_parquet(fn)
     print(f"Chunk written to {fn}")
	import polars as pl
	import numpy as np
	from datetime import datetime, timedelta

	# Constants
	NUM_ROWS = 2_000_000_000 # Large number for demonstration; adjust for actual hardware limits
	CHUNK_SIZE = 10_000_000 # Number of rows per chunk (file)

	data_folder = '/Users/jaak/Downloads/jupydata/'

	OUTPUT_FILE = data_folder + "generated_dataset_2B"

	# Helper Functions
	def generate_random_dates(start_date, end_date, num_samples):
	    """Return ``num_samples`` random dates drawn uniformly from a closed range.

	    Args:
	        start_date: First eligible day, formatted as ``YYYY-MM-DD``.
	        end_date: Last eligible day (inclusive), formatted as ``YYYY-MM-DD``.
	        num_samples: Number of dates to generate.

	    Returns:
	        A list of ``datetime.date`` objects of length ``num_samples``.

	    Raises:
	        ValueError: If a date string does not match ``YYYY-MM-DD`` or
	            ``end_date`` is earlier than ``start_date``.
	    """
	    first_day = datetime.strptime(start_date, "%Y-%m-%d").date()
	    last_day = datetime.strptime(end_date, "%Y-%m-%d").date()
	    span_days = (last_day - first_day).days
	    if span_days < 0:
	        raise ValueError("end_date must not be earlier than start_date")

	    # One vectorized draw instead of one RNG call per sample. randint's upper
	    # bound is exclusive, hence the +1 so that end_date itself can be drawn.
	    offsets = np.random.randint(0, span_days + 1, size=num_samples)

	    # tolist() converts to plain Python ints, which timedelta accepts.
	    return [first_day + timedelta(days=offset) for offset in offsets.tolist()]

	def assign_season(month):
	    """Map a month number to its season name.

	    Args:
	        month: Month number; 12, 1, 2 are winter, 3-5 spring, 6-8 summer.

	    Returns:
	        ``"winter"``, ``"spring"`` or ``"summer"`` for the months listed
	        above, and ``"fall"`` for every other value.
	    """
	    season_months = (
	        ("winter", (12, 1, 2)),
	        ("spring", (3, 4, 5)),
	        ("summer", (6, 7, 8)),
	    )
	    for season, members in season_months:
	        if month in members:
	            return season
	    # Anything not matched above (9, 10, 11 and any other value) is fall.
	    return "fall"

	def assign_location():
	    """Draw a random location ID from four weighted ID bands.

	    A band is chosen with probability 0.3 (IDs 1-50), 0.5 (51-200),
	    0.15 (201-500) or 0.05 (501-1000); the ID is then uniform within the
	    chosen band, both ends included.

	    Returns:
	        An integer location ID between 1 and 1000.
	    """
	    location_id_ranges = [
	        (1, 50, 0.3), (51, 200, 0.5), (201, 500, 0.15), (501, 1000, 0.05)
	    ]
	    rand = np.random.rand()
	    cumulative = 0
	    for start, end, prob in location_id_ranges:
	        cumulative += prob
	        if rand < cumulative:
	            # randint's upper bound is exclusive, so +1 keeps `end` eligible.
	            return np.random.randint(start, end + 1)
	    # Safety net in case no band matched: uniform over all IDs.
	    return np.random.randint(1, 1001)

	def process_chunk(start, end):
	    """Build the order-line DataFrame for orders ``start`` to ``end - 1``.

	    Every order gets a random date, time of day, location and line count
	    (1-5). The result has one row per order line, and each line additionally
	    gets a random quantity, discount rate and product ID.

	    Args:
	        start: Number of the first order in the chunk (inclusive).
	        end: Number one past the last order in the chunk (exclusive).

	    Returns:
	        A ``polars.DataFrame`` with the columns ``Order_ID``,
	        ``Order_Line_ID``, ``Order_Date``, ``Time_Of_Day``, ``Season``,
	        ``Month``, ``Location_ID``, ``Line_Pos``, ``Quantity``,
	        ``Discount_Rate`` and ``Product_ID``, in that order.
	    """
	    num_orders = end - start

	    time_of_day_choices = ["Morning", "Afternoon", "Night"]
	    time_of_day_probs = [0.5, 0.3, 0.2]
	    num_lines_choices = [1, 2, 3, 4, 5]
	    num_lines_probs = [0.6, 0.3, 0.05, 0.01, 0.04]

	    # Order-level attributes: one row per order.
	    dates = generate_random_dates("2023-01-01", "2024-12-31", num_orders)
	    months = [date.month for date in dates]
	    base_df = pl.DataFrame({
	        "Order_ID": [f"order_{i}" for i in range(start, end)],
	        "Order_Date": dates,
	        "Month": months,
	        "Season": [assign_season(month) for month in months],
	        "Time_Of_Day": np.random.choice(time_of_day_choices, num_orders, p=time_of_day_probs),
	        "Num_Lines": np.random.choice(num_lines_choices, num_orders, p=num_lines_probs),
	        "Location_ID": [assign_location() for _ in range(num_orders)],
	    })

	    # Expand each order into one row per line inside polars. Building a
	    # Python dict per line via iter_rows() dominates both run time and memory
	    # when a chunk holds millions of orders.
	    exploded_df = (
	        base_df
	        .with_columns(pl.int_ranges(1, pl.col("Num_Lines") + 1).alias("Line_Pos"))
	        .explode("Line_Pos")
	        .with_columns(
	            pl.format("{}_{}", pl.col("Order_ID"), pl.col("Line_Pos")).alias("Order_Line_ID")
	        )
	        # Fix the column order and drop the helper column Num_Lines.
	        .select([
	            "Order_ID",
	            "Order_Line_ID",
	            "Order_Date",
	            "Time_Of_Day",
	            "Season",
	            "Month",
	            "Location_ID",
	            "Line_Pos",
	        ])
	    )

	    # Add random line-level columns
	    num_line_rows = len(exploded_df)
	    exploded_df = exploded_df.with_columns([
	        pl.Series("Quantity", np.random.choice([1, 2, 3, 4, 5], num_line_rows, p=[0.4, 0.3, 0.15, 0.1, 0.05])),
	        # 80% of lines carry no discount; the rest is spread evenly over 1-15.
	        pl.Series("Discount_Rate", np.random.choice([0] + list(range(1, 16)), num_line_rows, p=[0.8] + [0.2 / 15] * 15)),
	        pl.Series("Product_ID", np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], num_line_rows, p=[0.1, 0.1, 0.1, 0.1, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.02, 0.02, 0.06])),
	    ])

	    return exploded_df

	# Process data in chunks
	# Index of the first chunk to generate; lower-numbered chunks are skipped
	# (presumably already written by an earlier run - verify before changing).
	FIRST_CHUNK = 93
	# Ceiling division so that a final partial chunk is counted as well.
	TOTAL_CHUNKS = -(-NUM_ROWS // CHUNK_SIZE)

	print("Processing data in chunks...")
	print("number of chunks:", TOTAL_CHUNKS)

	for i in range(FIRST_CHUNK, TOTAL_CHUNKS):
	    # Derive the row offset from the chunk index so the file suffix and the
	    # order-number range cannot drift apart.
	    start = i * CHUNK_SIZE
	    end = min(start + CHUNK_SIZE, NUM_ROWS)
	    chunk_df = process_chunk(start, end)
	    fn = f"{OUTPUT_FILE}_{i}.parquet"
	    chunk_df.write_parquet(fn)
	    print(f"Chunk written to {fn}")