import io
import numpy as np


def load_vectors(fname):
    """Load pre-trained fastText vectors (.vec format) into a dict."""
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    # The first line of a .vec file holds the vocabulary size and the dimension
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))
    return n, d, data


# Download pre-trained vectors from fasttext.cc
fast_text_fname = 'ADD_FNAME'
n, d, ft_dict = load_vectors(fast_text_fname)


def get_ft_vec(token, ft_dict, d=300):
    """Return the embedding for `token`, or a zero vector if it is out of vocabulary."""
    try:
        v = np.array(ft_dict[token])
    except KeyError:
        v = np.zeros(d)
    return v


def compute_ft_sum(text, ft_dict, d=300):
    """Sum the token embeddings of a {token: count} dict, weighted by count."""
    ret_vec = np.zeros(d)
    for key in text.keys():
        # Multiply the word embedding by the
        # number of times the user searched the token
        vec = np.multiply(get_ft_vec(key, ft_dict, d), int(text[key]))
        # Vector addition of token embeddings
        ret_vec = ret_vec + vec
    return list(ret_vec)


# document_series: a pandas Series of {token: count} dicts, defined elsewhere
vec_mappings = document_series.map(lambda x: compute_ft_sum(x, ft_dict, d=300))
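

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original gist). `document_series` above is
# assumed to be a pandas Series whose elements are dicts mapping each token to
# its count (e.g. how many times a user searched it). The made-up tokens and
# counts below only illustrate the expected input shape and output dimension.
# ---------------------------------------------------------------------------
import pandas as pd

example_series = pd.Series([
    {'shoes': 3, 'running': 1},    # hypothetical token counts for one document
    {'laptop': 2, 'charger': 2},
])
example_vecs = example_series.map(lambda x: compute_ft_sum(x, ft_dict, d=300))
# Each element of example_vecs is a 300-dimensional summed embedding;
# tokens missing from ft_dict contribute a zero vector.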