thehappycheese · November 17, 2023 03:29
diff --git a/split_dataframe_by_weighted_sequence.py b/split_dataframe_by_weighted_sequence.py
 import pandas as pd

 def split_dataframe_by_weighted_sequence(
         df: pd.DataFrame,
         split_column: str,
         weight_column: str,
         skew: float = 0.5
 ):
     """
     Split a dataframe into two parts at a weighted position along a sorted column.

     1. The dataframe is sorted by `split_column` (ascending).
     2. The cumulative sum of `weight_column` is calculated over the sorted rows.
     3. The split is made immediately before the first row whose cumulative
        weight is strictly greater than the total of `weight_column` times `skew`.
        That row, and every row after it, goes to the second part.

     The default skew (0.5) will split the dataframe such that each part
     contains approximately half of the total weight. With a skew of 0.6 the
     first part holds approximately 60% of the total weight and the second
     part approximately 40%.

     If no cumulative weight exceeds the threshold (for example `skew >= 1.0`),
     every row is placed in the first part and the second part is empty.
     An empty dataframe yields two empty dataframes.

     Args:
         df: The dataframe to split. It is not modified.
         split_column: Label of the column that defines the row ordering.
         weight_column: Label of the column holding each row's weight.
         skew: Fraction of the total weight targeted for the first part.

     Returns:
         A tuple `(first_part, second_part)` of dataframes, both ordered by
         `split_column`.
     """
     # Sort positionally; label-based reindexing (`df.loc[sorted_index]`) would
     # duplicate rows whenever the index contains repeated labels.
     ordered = df.sort_values(split_column)
     weights = ordered.loc[:, weight_column]
     running_total = weights.cumsum().to_numpy()

     # `argmax` raises on an empty array, so handle the empty frame explicitly.
     if running_total.size == 0:
         return ordered.iloc[:0], ordered.iloc[0:]

     crosses_threshold = running_total > weights.sum() * skew
     if crosses_threshold.any():
         # argmax returns the position of the first True value.
         split_index = int(crosses_threshold.argmax())
     else:
         # argmax of an all-False mask is 0, which would wrongly send every
         # row to the second part; nothing crossed the threshold, so all rows
         # belong to the first part.
         split_index = len(ordered)
     return ordered.iloc[:split_index], ordered.iloc[split_index:]
	import pandas as pd

	def split_dataframe_by_weighted_sequence(
			df: pd.DataFrame,
			split_column: str,
			weight_column: str,
			skew: float = 0.5
	):
		"""
		Split a dataframe into two parts at a weighted position along a sorted column.

		1. The dataframe is sorted by `split_column` (ascending).
		2. The cumulative sum of `weight_column` is calculated over the sorted rows.
		3. The split is made immediately before the first row whose cumulative
		   weight is strictly greater than the total of `weight_column` times `skew`.
		   That row, and every row after it, goes to the second part.

		The default skew (0.5) will split the dataframe such that each part
		contains approximately half of the total weight. With a skew of 0.6 the
		first part holds approximately 60% of the total weight and the second
		part approximately 40%.

		If no cumulative weight exceeds the threshold (for example `skew >= 1.0`),
		every row is placed in the first part and the second part is empty.
		An empty dataframe yields two empty dataframes.

		Args:
			df: The dataframe to split. It is not modified.
			split_column: Label of the column that defines the row ordering.
			weight_column: Label of the column holding each row's weight.
			skew: Fraction of the total weight targeted for the first part.

		Returns:
			A tuple `(first_part, second_part)` of dataframes, both ordered by
			`split_column`.
		"""
		# Sort positionally; label-based reindexing (`df.loc[sorted_index]`) would
		# duplicate rows whenever the index contains repeated labels.
		ordered = df.sort_values(split_column)
		weights = ordered.loc[:, weight_column]
		running_total = weights.cumsum().to_numpy()

		# `argmax` raises on an empty array, so handle the empty frame explicitly.
		if running_total.size == 0:
			return ordered.iloc[:0], ordered.iloc[0:]

		crosses_threshold = running_total > weights.sum() * skew
		if crosses_threshold.any():
			# argmax returns the position of the first True value.
			split_index = int(crosses_threshold.argmax())
		else:
			# argmax of an all-False mask is 0, which would wrongly send every
			# row to the second part; nothing crossed the threshold, so all rows
			# belong to the first part.
			split_index = len(ordered)
		return ordered.iloc[:split_index], ordered.iloc[split_index:]
No results found