Save aneesh-joshi/c8a451502958fa367d84bf038081ee4b to your computer and use it in GitHub Desktop.
import tensorflow as tf  # note: this gist uses the TensorFlow 1.x API (tf.placeholder, tf.Session)
import numpy as np

corpus_raw = 'He is the king . The king is royal . She is the royal queen '

# convert to lower case
corpus_raw = corpus_raw.lower()

words = []
for word in corpus_raw.split():
    if word != '.':  # because we don't want to treat . as a word
        words.append(word)

words = set(words)  # so that all duplicate words are removed

word2int = {}
int2word = {}
vocab_size = len(words)  # gives the total number of unique words

for i, word in enumerate(words):
    word2int[word] = i
    int2word[i] = word

# raw_sentences is a list of sentences.
raw_sentences = corpus_raw.split('.')
sentences = []
for sentence in raw_sentences:
    sentences.append(sentence.split())

WINDOW_SIZE = 2

data = []
for sentence in sentences:
    for word_index, word in enumerate(sentence):
        for nb_word in sentence[max(word_index - WINDOW_SIZE, 0) : min(word_index + WINDOW_SIZE, len(sentence)) + 1]:
            if nb_word != word:
                data.append([word, nb_word])

# function to convert numbers to one hot vectors
def to_one_hot(data_point_index, vocab_size):
    temp = np.zeros(vocab_size)
    temp[data_point_index] = 1
    return temp

x_train = []  # input word
y_train = []  # output word

for data_word in data:
    x_train.append(to_one_hot(word2int[data_word[0]], vocab_size))
    y_train.append(to_one_hot(word2int[data_word[1]], vocab_size))

# convert them to numpy arrays
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)

# making placeholders for x_train and y_train
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

EMBEDDING_DIM = 5  # you can choose your own number

W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM]))  # bias
hidden_representation = tf.add(tf.matmul(x, W1), b1)

W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, W2), b2))

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)  # make sure you do this!

# define the loss function:
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), reduction_indices=[1]))

# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

n_iters = 10000
# train for n_iters iterations
for _ in range(n_iters):
    sess.run(train_step, feed_dict={x: x_train, y_label: y_train})
    print('loss is : ', sess.run(cross_entropy_loss, feed_dict={x: x_train, y_label: y_train}))

vectors = sess.run(W1 + b1)

def euclidean_dist(vec1, vec2):
    return np.sqrt(np.sum((vec1 - vec2) ** 2))

def find_closest(word_index, vectors):
    min_dist = 10000  # to act like positive infinity
    min_index = -1
    query_vector = vectors[word_index]
    for index, vector in enumerate(vectors):
        if euclidean_dist(vector, query_vector) < min_dist and not np.array_equal(vector, query_vector):
            min_dist = euclidean_dist(vector, query_vector)
            min_index = index
    return min_index
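# (Not part of the original gist: a quick sanity check of find_closest on the
# learned vectors; with such a tiny corpus the neighbours are noisy, but
# 'king', 'queen' and 'royal' tend to land near each other.)
print(int2word[find_closest(word2int['king'], vectors)])
print(int2word[find_closest(word2int['queen'], vectors)])
print(int2word[find_closest(word2int['royal'], vectors)])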
from sklearn.manifold import TSNE

model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vectors = model.fit_transform(vectors)

from sklearn import preprocessing

# Normalizer uses norm='l2' by default; the second positional argument of
# fit_transform is y, so passing 'l2' there did nothing and is dropped.
normalizer = preprocessing.Normalizer()
vectors = normalizer.fit_transform(vectors)

print(vectors)

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
print(words)
for word in words:
    print(word, vectors[word2int[word]][1])
    ax.annotate(word, (vectors[word2int[word]][0], vectors[word2int[word]][1]))
plt.show()
I enjoyed it as well - I added the code above and created a Jupyter notebook for it - it's a great learning tutorial.
Very good notebook and tutorial -
Though I get the reference to 42, the print(int2word[42]) in your guide is a bit confusing. You may want to change it to something already available in the dictionary, say [5] instead of 42.
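(For context, the corpus above has only 7 unique words, so int2word holds keys 0 through 6 and int2word[42] raises a KeyError. A minimal check, using only names defined in the gist:)

print(vocab_size)   # 7 for this corpus
print(int2word[5])  # any key below vocab_size works, as the comment suggests
# print(int2word[42])  # KeyError: 42 is not in the dictionary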
On Line 35:
if nb_word != word:
I believe this will produce an incorrect set of neighbour pairs, since there are instances where a word can be its own neighbour, e.g. "I think I want to go to the park". Both "I" and "to" have neighbour pairs containing themselves.
Will it though? Since we are taking a window of size 2, won't it just take [i, think] and then [think, i]?
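A quick script settles this (a sketch mirroring the gist's windowing logic, not code from the thread): the pairs [i, think] and [think, i] are indeed generated, but with WINDOW_SIZE = 2 the second "i" also falls inside the first "i"'s window, and the string test nb_word != word silently drops that legitimate (i, i) pair, whereas comparing positions keeps it:

sentence = 'i think i want to go to the park'.split()
WINDOW_SIZE = 2

def make_pairs(skip_by_string):
    pairs = []
    for i, word in enumerate(sentence):
        lo = max(i - WINDOW_SIZE, 0)
        hi = min(i + WINDOW_SIZE, len(sentence) - 1)
        for j in range(lo, hi + 1):
            if skip_by_string and sentence[j] != word:  # the gist's check
                pairs.append((word, sentence[j]))
            elif not skip_by_string and j != i:  # compare positions instead
                pairs.append((word, sentence[j]))
    return pairs

print(('i', 'i') in make_pairs(True))   # False: the pair is dropped
print(('i', 'i') in make_pairs(False))  # True: 'i' at index 0 neighbours 'i' at index 2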
Hey! Love the tutorial. I had to add this to line 123 (before plt.show())
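The snippet itself was cut off above. A plausible reconstruction (an assumption, not necessarily the commenter's exact code): ax.annotate on its own does not trigger matplotlib's autoscaling, so with no other artists on the axes the labels can land outside the default limits. Deriving the limits from the data makes them visible:

# hypothetical fix; the commenter's actual snippet is not shown above
ax.set_xlim(vectors[:, 0].min() - 0.5, vectors[:, 0].max() + 0.5)
ax.set_ylim(vectors[:, 1].min() - 0.5, vectors[:, 1].max() + 0.5)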