Created August 6, 2017 01:44
A complete word2vec based on pytorch tutorial
# encoding=utf-8
# Project: learn-pytorch
# Author: xingjunjie    github: @gavinxing
# Create Time: 29/07/2017 11:58 AM on PyCharm
# Basic template from http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F


class CBOW(nn.Module):
    def __init__(self, context_size=2, embedding_size=100, vocab_size=None):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, vocab_size)

    def forward(self, inputs):
        # Look up the embedding of each context word, sum them into a single
        # context vector, then project onto the vocabulary.
        lookup_embeds = self.embeddings(inputs)
        embeds = lookup_embeds.sum(dim=0)
        out = self.linear1(embeds)
        out = F.log_softmax(out)
        return out


# Create your model and train. Here are some functions to help you make
# the data ready for use by your module.
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

# print(make_context_vector(data[0][0], word_to_ix))  # example


if __name__ == '__main__':
    CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
    EMBEDDING_SIZE = 10

    raw_text = """We are about to study the idea of a computational process.
    Computational processes are abstract beings that inhabit computers.
    As they evolve, processes manipulate other abstract things called data.
    The evolution of a process is directed by a pattern of rules
    called a program. People create programs to direct processes. In effect,
    we conjure the spirits of the computer with our spells.""".split()

    # By deriving a set from `raw_text`, we deduplicate the array
    vocab = set(raw_text)
    vocab_size = len(vocab)

    word_to_ix = {word: i for i, word in enumerate(vocab)}

    # Build (context, target) pairs: the four surrounding words predict the
    # word in the middle.
    data = []
    for i in range(2, len(raw_text) - 2):
        context = [raw_text[i - 2], raw_text[i - 1],
                   raw_text[i + 1], raw_text[i + 2]]
        target = raw_text[i]
        data.append((context, target))

    loss_func = nn.CrossEntropyLoss()
    net = CBOW(CONTEXT_SIZE, embedding_size=EMBEDDING_SIZE, vocab_size=vocab_size)
    optimizer = optim.SGD(net.parameters(), lr=0.01)

    for epoch in range(100):
        total_loss = 0
        for context, target in data:
            context_var = make_context_vector(context, word_to_ix)
            net.zero_grad()
            log_probs = net(context_var)
            loss = loss_func(log_probs, autograd.Variable(
                torch.LongTensor([word_to_ix[target]])
            ))
            loss.backward()
            optimizer.step()
            total_loss += loss.data
        print(total_loss)
To make it work, in CBOW.forward() comment out line 24: out = F.log_softmax(out). Also update line 74 to read loss = loss_func(log_probs.view(-1,1), autograd.Variable(.
Rather, update line 74 to read loss = loss_func(log_probs.view(1,-1), autograd.Variable(. Works for me.
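Putting both suggestions together, here is a minimal sketch of what the patched pieces might look like (variable names are kept from the gist and nothing else is assumed to change):

# Patched CBOW.forward(): return raw scores and let nn.CrossEntropyLoss
# apply log-softmax itself (it combines LogSoftmax and NLLLoss).
def forward(self, inputs):
    lookup_embeds = self.embeddings(inputs)   # (context_len, embedding_size)
    embeds = lookup_embeds.sum(dim=0)         # (embedding_size,)
    out = self.linear1(embeds)                # (vocab_size,)
    return out

# Patched loss call inside the training loop: reshape the single example to
# (1, vocab_size), since CrossEntropyLoss expects (batch, classes) scores.
loss = loss_func(log_probs.view(1, -1), autograd.Variable(
    torch.LongTensor([word_to_ix[target]])
))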
Why is context_size unused?
Running the code gives the error "RuntimeError: dimension out of range (expected to be in range of [-1, 0], but got 1)". What could be the reason?
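That RuntimeError is the shape mismatch the comments above are addressing: with the context embeddings summed over dim=0, forward() returns a 1-D tensor of size vocab_size, while nn.CrossEntropyLoss expects a 2-D (batch, classes) input. A standalone sketch of the mismatch and the reshape that resolves it (the vocab size of 10 is just an illustrative value):

import torch
import torch.nn as nn

loss_func = nn.CrossEntropyLoss()
scores = torch.randn(10)             # what forward() returns: shape (vocab_size,)
target = torch.LongTensor([3])       # index of the true middle word, shape (1,)

# loss_func(scores, target)          # fails: 1-D scores don't match a (1,) target
loss = loss_func(scores.view(1, -1), target)   # (1, vocab_size) scores work
print(loss)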