Demo app comparing two main approaches to the text classification task: TF-IDF and word embeddings
# Word embeddings with GloVe50
# This demo showcases a technique that is an alternative to the well-known TF-IDF.
# Helper functions:
# read_glove_vecs - reads GloVe data from glove.6B.50d.txt into word_to_vec_map
# sentence_to_avg - converts a sentence to its vector representation (50-dimensional when using glove.6B.50d
# as the word embeddings data source)
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def read_glove_vecs(glove_file):
    """Reads GloVe data from glove.6B.50d.txt into word_to_vec_map.
    @input glove_file - file name with path to GloVe data.
    @return word_to_vec_map - mapping from word to its vector representation in GloVe format."""
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
    return word_to_vec_map
def sentence_to_avg(sentence, word_to_vec_map):
    """Converts a sentence to its vector representation.
    @input sentence - string of words.
    @input word_to_vec_map - mapping from word to its GloVe representation.
    @return Average of the vectors of the sentence's in-vocabulary words."""
    # Skip out-of-vocabulary words so a rare word does not raise a KeyError.
    words = [w for w in sentence.lower().split() if w in word_to_vec_map]
    avg = np.zeros(next(iter(word_to_vec_map.values())).shape)
    for word in words:
        avg += word_to_vec_map[word]
    return avg / max(len(words), 1)
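# Quick sanity check of the averaging logic with toy 3-dimensional vectors
# (illustrative only; real glove.6B.50d vectors are 50-dimensional):
toy_map = {'hello': np.array([1.0, 0.0, 2.0]), 'world': np.array([3.0, 2.0, 0.0])}
print(sentence_to_avg('Hello world', toy_map))  # -> [2. 1. 1.]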
# Train
# This step should be executed once before using the model for the text similarity task.
# As input X we use the sample dataset below, but you can substitute the data you work with.
# The result is a matrix Y of shape (#X, 50) with the GloVe representation of each document in the X dataset.
"""Dependencies
numpy
Install them
pip install numpy
"""
import numpy as np
# Sample dataset
X = np.asarray(['I am going to the bar tonight', 'I love you', 'miss you my dear',
                'Lets go party and drinks', 'Congrats on the new job', 'Congratulations',
                'I am so happy for you', 'Why are you feeling bad', 'What is wrong with you',
                'You totally deserve this prize', 'Let us go play football',
                'Are you down for football this afternoon', 'Work hard play harder',
                'It is surprising how people can be dumb sometimes',
                'I am very disappointed', 'It is the best day in my life',
                'I think I will end up alone', 'My life is so boring', 'Good job',
                'Great so awesome'])
# Load words to GloVe mapping
word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')
# Initialize vector representation for the train dataset with dimension #docs by GloVe dimension (50)
Y = np.zeros((len(X), 50))
# Fill each row of Y with the average vector representation of the corresponding sentence
for index, sentence in enumerate(X):
    Y[index, :] = sentence_to_avg(sentence, word_to_vec_map)
print('#docs in X dataset is: {}'.format(len(X)))
print('Shape of GloVe representation for X dataset is: {}'.format(Y.shape))
print('First document in X dataset is:\n\n\t{}\n'.format(X[0]))
print('GloVe representation for it is:\n\n{}'.format(Y[0]))
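# To build intuition for why averaged GloVe vectors capture meaning, compare a few
# individual word vectors (illustrative; assumes these words are in the GloVe vocabulary):
v_happy = word_to_vec_map['happy'].reshape(1, -1)
v_glad = word_to_vec_map['glad'].reshape(1, -1)
v_football = word_to_vec_map['football'].reshape(1, -1)
print('happy vs glad: {:.3f}'.format(cosine_similarity(v_happy, v_glad)[0][0]))
print('happy vs football: {:.3f}'.format(cosine_similarity(v_happy, v_football)[0][0]))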
# Predict with the trained model
# The code snippet below shows how to find the most similar documents in the dataset for a given sentence
"""
Y - GloVe representation of train dataset filled above from train dataset (corpus of documents).
#rows equals to #docs in the corpus
#columns equals to #features in GloVe (50 in this example.)
sentence - sample sentence to find similarity for.
Steps:
1. Convert sentence to its GloVec representation using by helper function sentence_to_avg
It returns vector of coeff. of (50,) shape
2. Calc document similarity from available text corpus using by cosine_similarity
3. Show up top 3 similar documents
"""
sentence = 'I believe I can fly'
# Step 1.
y = sentence_to_avg(sentence, word_to_vec_map)
# Step 2.
x = cosine_similarity(Y, y.reshape(1, -1))  # y must be 2D for cosine_similarity
# Step 3.
for index in np.argsort(x.ravel())[-3:][::-1]:
    print(X[index])
# Comparison with traditional TFIDF
"""Dependencies
scikit_learn>=0.18
scipy>=0.18
Install them
pip install scikit_learn scipy
"""
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
Y_t = vectorizer.fit_transform(X)
y_t = vectorizer.transform([sentence])
xx = cosine_similarity(Y_t, y_t)
for index in np.argsort(xx.ravel())[-3:][::-1]:
    print(X[index])
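# A quick, illustrative comparison of the strongest match score from each approach.
# The query shares almost no vocabulary with the corpus, so the TFIDF score is
# typically much lower than the embedding-based one:
print('Best GloVe cosine similarity: {:.3f}'.format(x.max()))
print('Best TFIDF cosine similarity: {:.3f}'.format(xx.max()))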
# What next?
# After you have familiarized yourself with the word embeddings technique, you can try the next steps:
# Use another word embeddings dataset, e.g., higher-dimensional GloVe vectors or Word2Vec (see the sketch below)
# Use the demo model for your own dataset
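# A minimal sketch of swapping in higher-dimensional GloVe vectors (assumes you have
# downloaded glove.6B.100d.txt; the file name and dimension are just an example).
# Inferring the dimension from the loaded vectors avoids hardcoding 50:
word_to_vec_map_100 = read_glove_vecs('glove.6B.100d.txt')
dim = next(iter(word_to_vec_map_100.values())).shape[0]
Y_100 = np.zeros((len(X), dim))
for index, sentence in enumerate(X):
    Y_100[index, :] = sentence_to_avg(sentence, word_to_vec_map_100)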