import io
import numpy as np


def load_vectors(fname):
    """Load pre-trained fastText vectors (.vec format) into a dict."""
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    # The first line of a .vec file holds the vocabulary size and the dimension
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return n, d, data


# Download pre-trained vectors from fasttext.cc
fast_text_fname = 'ADD_FNAME'
n, d, ft_dict = load_vectors(fast_text_fname)


def get_ft_vec(token, ft_dict, d=300):
    """Return the embedding for `token`, or a zero vector if it is out of vocabulary."""
    try:
        v = np.array(ft_dict[token])
    except KeyError:
        v = np.zeros(d)
    return v


def compute_ft_sum(text, ft_dict, d=300):
    """Sum the token embeddings of a {token: count} dict, weighted by count."""
    ret_vec = np.zeros(d)
    for key in text.keys():
        # Multiply the word embedding by the
        # number of times the user searched the token
        vec = np.multiply(get_ft_vec(key, ft_dict, d), int(text[key]))
        # Vector addition of token embeddings
        ret_vec = ret_vec + vec
    return list(ret_vec)


# document_series: a pandas Series of {token: count} dicts, defined elsewhere
vec_mappings = document_series.map(lambda x: compute_ft_sum(x, ft_dict, d=300))
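

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original gist). `document_series` above is
# assumed to be a pandas Series whose elements are dicts mapping each token to
# its count (e.g. how many times a user searched it). The made-up tokens and
# counts below only illustrate the expected input shape and output dimension.
# ---------------------------------------------------------------------------
import pandas as pd

example_series = pd.Series([
    {'shoes': 3, 'running': 1},    # hypothetical token counts for one document
    {'laptop': 2, 'charger': 2},
])
example_vecs = example_series.map(lambda x: compute_ft_sum(x, ft_dict, d=300))
# Each element of example_vecs is a 300-dimensional summed embedding;
# tokens missing from ft_dict contribute a zero vector.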