Created
February 6, 2018 16:36
-
-
Save jad2192/46b04f3297801b82d0185eb0c0d9a327 to your computer and use it in GitHub Desktop.
N-Tuples
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
class N_Tuple_Classifier_fast(object):
    '''N-tuple classifier for binarized image data (e.g. 28x28 MNIST digits).

    Each "tuple" is a random subset of pixel positions. Training samples are
    projected onto every tuple with tensor ops; a test image is scored per
    class by its pixel-overlap counts against those stored projections.
    '''

    def __init__(self, pixel_percentage=0.1, num_tuples=100, pixel_tolerance=0.3,
                 warm_start=None, n_pixels=784):
        '''pixel_percentage: fraction of the n_pixels positions each tuple keeps.
               For example the default 0.1 randomly chooses 78 of 784 pixels.
           num_tuples: number of unique tuples onto which to project.
           pixel_tolerance: the classifier requires binary 0-1 pixel values;
               this is the (post /256 scaling) intensity threshold at or below
               which a pixel is set to zero.
           warm_start: pre-set (tuple matrix, frequency table) pair if you want
               to start with a pre-trained model.
               NOTE(review): predict() reads self.projs, which a warm start
               does not supply — a warm-started model still needs fit() before
               it can predict; confirm intended warm-start workflow.
           n_pixels: pixels per image (784 for 28x28 MNIST).'''
        self.M = num_tuples
        self.pt = pixel_tolerance
        self.n_pixels = n_pixels
        self.class_ind = {}
        if warm_start is None:
            keep = int(n_pixels * pixel_percentage)  # pixels kept per tuple
            self.tuples = np.zeros((num_tuples, n_pixels), dtype='i8')
            for m in range(self.M):
                # Mark `keep` distinct random pixel positions for tuple m.
                chosen = np.random.choice(np.arange(n_pixels), size=keep,
                                          replace=False)
                self.tuples[m][chosen] = 1
            self.projs = None
        else:
            self.tuples = warm_start[0]
            self.freq_table = warm_start[1]

    def fit(self, data):
        '''Store per-tuple projections of the training set.

        data: array of shape (n_samples, n_pixels + 1) where column 0 is the
        integer class label (0-9) and the remaining columns are pixel values.'''
        start_t = d_timer()
        labs = data[:, 0]
        for k in range(10):
            # np.where returns a 1-tuple holding the index array; consumers
            # must unwrap element [0] to get the actual indices.
            self.class_ind[k] = np.where(labs == k)
        # Transform each image to a binary vector by cutting off pixel
        # intensities (scaled by /256) that fall at or below pixel_tolerance.
        data_bin = np.asanyarray((data[:, 1:] / 256) > self.pt, dtype='i8')
        # Project all the data onto all tuples simultaneously via tensor ops.
        data_tensor = np.ones((self.M, data_bin.shape[0], data_bin.shape[1]),
                              dtype='i8')
        data_tensor = np.einsum('ij,kij->kij', data_bin, data_tensor, dtype='i8')
        # Result: tensor of shape (M, N, n_pixels) = (num_tup, num_samp, n_pix)
        # where projs[m, n, :] is sample n masked by the m-th tuple set.
        self.projs = np.einsum('ij,ikj->ikj', self.tuples, data_tensor,
                               dtype='i8')
        print('Model fit, time spent: ', d_timer() - start_t, ' s')

    def predict(self, X):
        '''Predict the class label (int 0-9) of a flat pixel vector X of
        length n_pixels.'''
        s = d_timer()
        X_bin = np.asanyarray(X / 256 > self.pt, dtype='i8')
        # Projection of the test image onto each tuple: shape (M, n_pixels).
        X_proj = np.einsum('i,ji->ji', X_bin, self.tuples, dtype='i8')
        # proj_test[m, n, j] == 1 iff pixel j is on in both the test image's
        # and training sample n's projection onto tuple m.
        proj_test = np.einsum('ij,ikj->ikj', X_proj, self.projs, dtype='i8')
        ext = np.ones(proj_test.shape, dtype='i8')
        ext = np.einsum('ij,ikj->ikj', X_proj, ext, dtype='i8')
        # comb[m, n] counts positions where both projections are on: a
        # per-tuple overlap score between the test image and training sample
        # n, used to indirectly compute the counts and make a prediction.
        comb = np.asanyarray((proj_test + ext) == 2, dtype='i8').sum(axis=-1)
        prob = np.zeros(10)
        for k in range(10):
            # BUG FIX: np.where() returns a tuple, so len() of it was always
            # 1 — normalize by the actual number of class-k training samples.
            cur_ix = self.class_ind[k][0]
            if cur_ix.size:  # guard against classes absent from training data
                prob[k] = comb[:, cur_ix].sum() / cur_ix.size
        print('Prediction took: ', d_timer() - s, ' s')
        return prob.argmax()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment