-
-
Save alvations/cf12b095067552c252349090eec21796 to your computer and use it in GitHub Desktop.
Mean target value encoding for categorical variables using Dask
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import os.path as op | |
from time import time | |
import dask.dataframe as ddf | |
import dask.array as da | |
from dask import delayed, compute | |
from distributed import Client | |
def make_categorical_data(n_samples=int(1e7), n_features=10):
    """Build a lazy dask dataframe of random integer features and a target.

    Feature ``i`` (named ``f_000``, ``f_001``, ...) is drawn with
    ``da.random.randint`` using ``low=0`` and ``high=(i + 1) * 10``, so the
    cardinality grows with the feature index. The ``target`` column is drawn
    from a normal distribution with mean 0 and standard deviation 1.

    With the default parameters this should amount to around 1GB of data.

    Returns the column-wise concatenation of all features followed by the
    target, repartitioned into 10 partitions.
    """
    # Every column is split into the same ten chunks.
    chunk_size = n_samples // 10

    columns = []
    for idx in range(n_features):
        values = da.random.randint(low=0, high=(idx + 1) * 10,
                                   size=n_samples, chunks=chunk_size)
        columns.append(
            ddf.from_dask_array(values, columns=['f_%03d' % idx]))

    target_values = da.random.normal(loc=0, scale=1, size=n_samples,
                                     chunks=chunk_size)
    columns.append(ddf.from_dask_array(target_values, columns=['target']))

    combined = ddf.concat(columns, axis=1)
    return combined.repartition(npartitions=10)
def target_mean_transform(data, feature_colname, target_colname):
    """Encode one feature column with the mean target value of each category.

    Parameters
    ----------
    data : dataframe
        Must support column selection, ``groupby(...).mean().to_dict()`` and
        ``Series.map`` (e.g. a pandas DataFrame).
    feature_colname : str
        Name of the feature column to encode.
    target_colname : str
        Name of the column holding the target values to average.

    Returns
    -------
    The feature column where each value is replaced by the mean of the
    target over all rows sharing that value. Columns whose dtype is neither
    integer (signed or unsigned) nor object are returned unchanged.
    """
    # dtype kinds: 'i' signed integer, 'u' unsigned integer, 'O' object.
    # Unsigned integers are categorical codes just like signed ones, so they
    # must be encoded as well instead of being silently passed through.
    if data[feature_colname].dtype.kind not in ('i', 'u', 'O'):
        # Non-categorical variables are kept untransformed:
        return data[feature_colname]
    # Restrict to the two relevant columns so the group-wise mean is only
    # computed for the target.
    data = data[[feature_colname, target_colname]]
    target_means = data.groupby(feature_colname).mean()
    # {category value: mean target value}
    mapping = target_means.to_dict()[target_colname]
    return data[feature_colname].map(mapping)
def encode_with_target_mean(data, target_colname='target'):
    """Lazily target-mean encode every feature column of ``data``.

    One ``delayed`` call to ``target_mean_transform`` is scheduled per
    column other than ``target_colname``. The results, followed by the
    untouched target column, are then combined column-wise through a
    ``delayed`` call to ``ddf.concat``.

    Returns the resulting ``delayed`` object; nothing is computed here.
    """
    # Only the column names of the feature part are needed.
    feature_columns = data.drop(target_colname, axis=1).columns

    pieces = []
    for feature_colname in feature_columns:
        encoded_column = delayed(target_mean_transform)(
            data, feature_colname, target_colname)
        pieces.append(encoded_column)

    # The target column is appended last, after the encoded features.
    pieces.append(data[target_colname])
    return delayed(ddf.concat)(pieces, axis=1)
if __name__ == '__main__':
    # Script entry point: generate (or reuse) a random categorical dataset,
    # target-mean encode it, and save the encoded result as parquet.
    #
    # make sure dask uses the distributed scheduler:
    # Start the scheduler and at least one worker with:
    # $ dask-scheduler
    # $ dask-worker localhost:8786
    #
    # The client object is not used directly below; it is created so that
    # the computations run on the scheduler started above.
    c = Client('localhost:8786')

    original_folder_name = 'random_categorical_data'
    encoded_folder_name = 'random_encoded_data'

    # The random dataset is only generated when its folder does not exist.
    if not op.exists(original_folder_name):
        print("Generating random categorical data in",
              op.abspath(original_folder_name))
        os.mkdir(original_folder_name)
        data = make_categorical_data()
        ddf.to_parquet(original_folder_name, data)

    print("Using data from", op.abspath(original_folder_name))
    data = ddf.read_parquet(original_folder_name)
    print(data.head(5))

    print("Encoding categorical variables...")
    encoded = encode_with_target_mean(data, target_colname='target')
    t0 = time()
    # persist() returns a new object backed by the persisted results and
    # leaves the original untouched. The result must be rebound, otherwise
    # it is dropped and the encoding is recomputed from scratch for both the
    # preview and the parquet export below.
    encoded = encoded.persist()
    encoded_summary = compute(encoded.head(5))
    print("done in %0.3fs" % (time() - t0))
    print(encoded_summary)

    print("Saving encoded data to", op.abspath(encoded_folder_name))
    t0 = time()
    compute(delayed(ddf.to_parquet)(encoded_folder_name, encoded))
    print("done in %0.3fs" % (time() - t0))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment