Created October 2, 2016 09:58
A very efficient implementation of feature hashing for Python, built on Cython, MurmurHash3, and SciPy sparse matrices.
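The core idea: each (feature, value) pair is hashed directly to a column index in a 2**n_bits-dimensional sparse vector, and the sign of the hash flips the stored value so that colliding features cancel in expectation rather than accumulate. Below is a minimal pure-Python sketch of the indexing scheme (hashed_index is a hypothetical helper for illustration only; it uses scikit-learn's murmurhash3_32 binding rather than the murmurhash package the Cython code cimports):

from sklearn.utils.murmurhash import murmurhash3_32

def hashed_index(feat_name, value, n_bits=22):
    # hypothetical helper, mirroring the scheme in transform() below:
    # seed the value hash with the hash of the feature name, so equal
    # values of different features land in different buckets
    seed = murmurhash3_32(feat_name, positive=True)
    h = murmurhash3_32(value, seed=seed, positive=True)
    # the top bit of the raw 32-bit hash supplies the +/-1 sign used
    # to keep collisions unbiased
    sign = -1.0 if h & 0x80000000 else 1.0
    return h & ((1 << n_bits) - 1), sign

col, sign = hashed_index('city', 'nyc')  # column index in [0, 2**22) plus a sign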
_feature_hasher.pyx
# distutils: language=c++
import collections.abc
import operator
from functools import reduce

import cython
import numpy as np
import pandas as pd

from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_GET_SIZE
from libc.stdint cimport uint8_t, uint32_t
from libcpp.vector cimport vector
from murmurhash cimport mrmr
from scipy.sparse import coo_matrix

from .helpers import fast_coo2csr
cdef inline uint32_t _murmur3_32(object key, uint32_t seed=0) except? 0:
    """32-bit MurmurHash3 for bytes and unicode objects."""
    # note `except? 0` rather than `except 0`: 0 is a legitimate hash value,
    # so it may only *possibly* signal an error
    cdef bytes bkey
    if isinstance(key, unicode):
        bkey = (<unicode> key).encode('utf-8')
    elif isinstance(key, bytes):
        bkey = <bytes> key
    else:
        raise TypeError("the key %s must be either unicode or bytes" % key)
    return mrmr.hash32(PyBytes_AS_STRING(bkey), PyBytes_GET_SIZE(bkey), seed)
@cython.boundscheck(False)
@cython.wraparound(False)
cdef uint32_t[:] _murmur3_32_vec(object[:] keys, uint32_t[:] out, int seed=0):
    """Vectorized 32-bit MurmurHash3.

    :param keys: the keys to be hashed
    :param out: a pre-allocated uint32_t[:] array used to store the output
    :param seed: the MurmurHash3 seed
    :return: the out array
    """
    assert len(keys) == len(out)
    cdef Py_ssize_t i
    for i in range(len(keys)):
        out[i] = _murmur3_32(keys[i], seed)
    return out
cdef inline uint32_t _fnv1a_32(uint8_t* buf, size_t buf_size, uint32_t seed=0x811c9dc5):
    """32-bit FNV-1a hash (the default seed is the standard FNV offset basis)."""
    cdef uint8_t* buf_curr = buf
    cdef uint8_t* buf_end = buf + buf_size
    while buf_curr < buf_end:
        seed ^= buf_curr[0]
        # shift-and-add expansion of seed *= 16777619, the 32-bit FNV prime
        seed += (seed << 1) + (seed << 4) + (seed << 7) + (seed << 8) + (seed << 24)
        buf_curr += 1
    return seed
@cython.boundscheck(False)
@cython.wraparound(False)
cdef uint32_t[:] _combine_hash_arrays(list hash_arrays, uint32_t[:] out):
    """Combine hash arrays by taking the FNV-1a hash of their values.

    For instance, for the i-th entry of two arrays a and b, this will produce a final array
    out such that out[i] := fnv1a([a[i], b[i]]).

    :param hash_arrays: a list of uint32_t[:] arrays (of same size) containing hash values
    :param out: a pre-allocated uint32_t[:] array used to store the output
    :return: the out array
    """
    cdef size_t n_hash_arrays = len(hash_arrays)
    assert n_hash_arrays > 1
    cdef size_t n_elements = len(hash_arrays[0])
    assert n_elements == len(out)
    cdef uint32_t[:] hash_array
    cdef vector[uint32_t*] hash_array_ptrs
    for hash_array in hash_arrays:
        assert len(hash_array) == n_elements
        hash_array_ptrs.push_back(&hash_array[0])
    # gather the i-th hash of every array into a small buffer, then hash the
    # buffer's raw bytes with FNV-1a
    cdef vector[uint32_t] buf = vector[uint32_t](n_hash_arrays)
    cdef size_t n_buf_bytes = sizeof(uint32_t) * n_hash_arrays
    cdef size_t i, j
    for i in range(n_elements):
        for j in range(n_hash_arrays):
            buf[j] = hash_array_ptrs[j][i]
        out[i] = _fnv1a_32(<uint8_t*> &buf[0], n_buf_bytes)
    return out
def transform(X, int n_bits, set cat_feats, set cont_feats, set crossed_feats, dtype=np.float32,
              bint unbiased=True):
    """Feature hashing transformation.

    For details, check this out: http://www.machinelearning.org/archive/icml2009/papers/407.pdf.

    :param X: a pandas DataFrame or a mapping from feature name to an array-like data structure
        containing the data to be transformed
    :param n_bits: the number of bits used to determine the size of the feature space
    :param cat_feats: a set with the names of the categorical features
    :param cont_feats: a set with the names of the continuous features
    :param crossed_feats: a set with tuples indicating the crossed (i.e. interaction) features
    :param dtype: the data type of the output (should be numpy.float32 or numpy.float64)
    :param unbiased: if the sign of the hashes should be used to avoid the bias introduced by
        potential collisions
    :return: a scipy sparse CSR matrix with the result of the feature hashing transformation of X
    """
    assert isinstance(X, (pd.DataFrame, collections.abc.Mapping))
    n_samples = X.shape[0] if isinstance(X, pd.DataFrame) else len(next(iter(X.values())))
    # iterating a DataFrame or a mapping yields the feature names in both cases
    assert all(len(X[f]) == n_samples for f in X)
    n_features = len(cat_feats) + len(cont_feats) + len(crossed_feats)
    n_hashed_features = n_samples * n_features
    assert n_hashed_features > 0
    rows = np.tile(np.arange(n_samples, dtype=np.uint32), n_features)
    cols = np.empty(n_hashed_features, dtype=np.uint32)
    vals = np.ones(n_hashed_features, dtype=dtype)
    # give each feature its own slice of the cols/vals buffers; sort by repr since
    # feature names (str) and crossed features (tuple) are not mutually comparable
    cols_per_feat, vals_per_feat = {}, {}
    for i, f in enumerate(sorted(cat_feats | cont_feats | crossed_feats, key=repr)):
        cols_per_feat[f] = cols[i * n_samples:i * n_samples + n_samples]
        vals_per_feat[f] = vals[i * n_samples:i * n_samples + n_samples]
    for f in cat_feats:
        # hash each value, seeding with the hash of the feature name so that
        # equal values of different features land in different buckets
        _murmur3_32_vec(
            np.asarray(X[f], dtype=object),
            cols_per_feat[f],
            seed=_murmur3_32(f)
        )
    for f in cont_feats:
        # a continuous feature occupies a single column: its hashed name
        cols_per_feat[f][:] = _murmur3_32(f)
        vals_per_feat[f][:] = np.asarray(X[f], dtype=dtype)
    for cf in crossed_feats:
        # avoid expensive rehashing by combining the previously computed hashes
        _combine_hash_arrays([cols_per_feat[f] for f in cf], cols_per_feat[cf])
        vals_per_feat[cf][:] = reduce(operator.mul, [X[f] for f in cf if f in cont_feats], 1)
    # multiply the values by the sign of the hash to avoid bias
    if unbiased:
        vals *= np.sign(cols.astype(np.int32, copy=False))
    n_hash_buckets = 2 ** n_bits
    # map the hashes to the actual bucket indices (n_hash_buckets is a power of two)
    cols &= n_hash_buckets - 1
    return fast_coo2csr(coo_matrix(
        (vals, (rows, cols)),
        (n_samples, n_hash_buckets)
    ))
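Assuming the extension above is compiled and importable as _feature_hasher (the module name is implied by the wrapper's import below) with helpers.fast_coo2csr on the path, usage looks roughly like this sketch:

import pandas as pd
from _feature_hasher import transform

X = pd.DataFrame({
    'city': ['nyc', 'sf', 'nyc'],  # categorical
    'age': [31.0, 25.0, 40.0],     # continuous
})
X_hashed = transform(
    X, 18,
    cat_feats={'city'},
    cont_feats={'age'},
    crossed_feats={('city', 'age')},
)
print(X_hashed.shape)  # (3, 262144), i.e. (n_samples, 2 ** n_bits)

The scikit-learn-style wrapper below lives in a separate module: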
import operator
from functools import reduce

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

from ._feature_hasher import transform


class FeatureHasher(BaseEstimator, TransformerMixin):
    """Feature hashing transformer.

    Feature hashing is a technique for controlling the memory footprint of the feature
    engineering module of a machine learning system. The typical approach of one-hot encoding
    categorical variables requires a dictionary mapping strings to indices, which normally
    requires large amounts of memory.

    Feature hashing, on the other hand, only requires storing the hash function. Each feature
    is passed through the hash function to find the corresponding index in the feature vector,
    which is of predetermined size. Collisions are bound to happen; however, they are not as
    big of a deal as one might think [1].

    References
    ==========
    1. Weinberger, Kilian, et al. "Feature hashing for large scale multitask learning."
       Proceedings of the 26th Annual International Conference on Machine Learning. ACM, 2009.
    2. Wikipedia contributors. "Feature hashing." Wikipedia, The Free Encyclopedia. Wikipedia,
       The Free Encyclopedia, 15 Jan. 2015. Web. 2 Jun. 2015.
    """

    def __init__(self, n_bits=22, cat_feats=None, cont_feats=None, crossed_feats=None,
                 dtype=np.float32, unbiased=True):
        assert 0 < n_bits < 32
        # store the params under their constructor names so that
        # BaseEstimator.get_params()/set_params() keep working
        self.n_bits = n_bits
        self.cat_feats = set(cat_feats or [])
        self.cont_feats = set(cont_feats or [])
        self.crossed_feats = set(crossed_feats or [])
        self.dtype = dtype
        self.unbiased = unbiased
        assert len(self.cat_feats | self.cont_feats) > 0
        assert len(self.cat_feats & self.cont_feats) == 0
        # every feature referenced by a cross must be a known cat/cont feature
        assert reduce(
            operator.or_, map(set, self.crossed_feats), set()
        ).issubset(self.cat_feats | self.cont_feats)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return transform(
            X, self.n_bits, self.cat_feats, self.cont_feats, self.crossed_feats, self.dtype,
            self.unbiased
        )
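Given the same DataFrame X as above, the wrapper is used like any other scikit-learn transformer (a sketch under the same compiled-extension assumption; fit is a no-op, so fit_transform just hashes):

hasher = FeatureHasher(
    n_bits=18,
    cat_feats=['city'],
    cont_feats=['age'],
    crossed_feats=[('city', 'age')],
)
X_hashed = hasher.fit_transform(X)  # CSR matrix of shape (3, 2 ** 18)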
helpers.py
import numpy as np
from scipy.sparse import csr_matrix
# NOTE: these are private SciPy APIs and may move between versions
# (e.g. in newer SciPy, sputils lives at scipy.sparse._sputils)
from scipy.sparse.sputils import get_index_dtype, upcast
from scipy.sparse._sparsetools import coo_tocsr


def fast_coo2csr(X):
    """Direct COO-to-CSR conversion using SciPy's low-level coo_tocsr routine."""
    assert X.format == 'coo'
    m, n = X.shape
    idx_dtype = get_index_dtype((X.row, X.col), maxval=max(X.nnz, n))
    row = X.row.astype(idx_dtype, copy=False)
    col = X.col.astype(idx_dtype, copy=False)
    # allocate space for the CSR matrix
    indptr = np.empty(m + 1, dtype=idx_dtype)
    indices = np.empty_like(col, dtype=idx_dtype)
    data = np.empty_like(X.data, dtype=upcast(X.dtype))
    # do the conversion
    coo_tocsr(m, n, X.nnz, row, col, X.data, indptr, indices, data)
    X_csr = csr_matrix((data, indices, indptr), shape=X.shape)
    # entries hashed to the same bucket show up as duplicates; merge them
    X_csr.sum_duplicates()
    return X_csr
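A quick sanity check (a hypothetical test sketch, not part of the gist): the helper should agree with the stock coo_matrix.tocsr() conversion, including the summing of duplicate entries that hash collisions produce:

import numpy as np
from scipy.sparse import coo_matrix

rows = np.array([0, 1, 1, 2])
cols = np.array([5, 0, 0, 3])  # (1, 0) appears twice and should be summed
vals = np.array([1.0, 2.0, 3.0, 4.0])
X = coo_matrix((vals, (rows, cols)), shape=(3, 8))

assert np.allclose(fast_coo2csr(X).toarray(), X.tocsr().toarray())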