@cleemesser
Forked from karpathy/min-char-rnn.py
Created October 7, 2015 18:31

Revisions

  1. @karpathy karpathy revised this gist Aug 5, 2015. 1 changed file with 3 additions and 4 deletions.
    7 changes: 3 additions & 4 deletions min-char-rnn.py
    @@ -7,9 +7,8 @@
     # data I/O
     data = open('input.txt', 'r').read() # should be simple plain text file
     chars = list(set(data))
    -print '%d unique characters in data.' % (len(chars), )
    -vocab_size = len(chars)
    -data_size = len(data)
    +data_size, vocab_size = len(data), len(chars)
    +print 'data has %d characters, %d unique.' % (data_size, vocab_size)
     char_to_ix = { ch:i for i,ch in enumerate(chars) }
     ix_to_char = { i:ch for i,ch in enumerate(chars) }

    @@ -58,7 +57,7 @@ def lossFun(inputs, targets, hprev):
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)
       for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    -    np.clip(dparam, -1, 1, out=dparam) # clip to mitigate exploding gradients
    +    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):
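
    This revision reworks the startup printout of data/vocab size and, more importantly for training, widens the gradient-clipping bound from [-1, 1] to [-5, 5]: every element of every gradient array is limited to that range before the parameter update, a common guard against exploding gradients in vanilla RNNs. A minimal standalone sketch of what the clipping step does (Python 3 print syntax here, unlike the gist's Python 2; the toy gradient values are made up for illustration):

        import numpy as np

        # a toy "gradient" with a couple of oversized entries (illustrative values)
        dparam = np.array([0.3, -7.2, 4.9, 12.0, -0.1])

        # clip element-wise and in place, as in the revised lossFun: entries outside
        # [-5, 5] are replaced by the nearest bound, everything else is untouched
        np.clip(dparam, -5, 5, out=dparam)

        print(dparam)  # prints something like [ 0.3 -5.   4.9  5.  -0.1]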
  2. @karpathy karpathy revised this gist Aug 5, 2015. 1 changed file with 10 additions and 8 deletions.
    18 changes: 10 additions & 8 deletions min-char-rnn.py
    @@ -38,10 +38,10 @@ def lossFun(inputs, targets, hprev):
       for t in xrange(len(inputs)):
         xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
         xs[t][inputs[t]] = 1
    -    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
    -    ys[t] = np.dot(Why, hs[t]) + by
    -    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
    -    loss += -np.log(ps[t][targets[t],0]) # softmax ("cross-entropy loss")
    +    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    +    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    +    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    +    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
       # backward pass: compute gradients going backwards
       dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
       dbh, dby = np.zeros_like(bh), np.zeros_like(by)

    @@ -58,7 +58,7 @@ def lossFun(inputs, targets, hprev):
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)
       for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    -    dparam = np.clip(dparam, -1, 1) # clip to mitigate exploding gradients
    +    np.clip(dparam, -1, 1, out=dparam) # clip to mitigate exploding gradients
       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):

    @@ -103,9 +103,11 @@ def sample(h, seed_ix, n):
       if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress

       # perform parameter update with Adagrad
    -  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby]):
    -    mem += dparam*dparam
    +  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
    +                                [dWxh, dWhh, dWhy, dbh, dby],
    +                                [mWxh, mWhh, mWhy, mbh, mby]):
    +    mem += dparam * dparam
         param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

       p += seq_length # move data pointer
    -  n += 1 # iteration counter
    +  n += 1 # iteration counter
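
    Besides adding comments, this revision fixes a subtle no-op: the previous form, dparam = np.clip(dparam, -1, 1), only rebinds the loop variable to a new array and never modifies the gradient arrays that lossFun returns, so no clipping actually took place; np.clip(..., out=dparam) writes the clipped values back into the original array. A minimal sketch of the difference (Python 3 print syntax; toy values, not from the gist):

        import numpy as np

        grads = [np.array([10.0, -0.5]), np.array([-3.0, 2.0])]  # toy gradient arrays

        # old form: rebinding the loop variable leaves the arrays in `grads` untouched
        for dparam in grads:
            dparam = np.clip(dparam, -1, 1)
        print(grads[0])  # still [10.  -0.5]

        # revised form: out= clips each array in place
        for dparam in grads:
            np.clip(dparam, -1, 1, out=dparam)
        print(grads[0])  # now [ 1.  -0.5]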
  3. @karpathy karpathy revised this gist Jul 27, 2015. 1 changed file with 22 additions and 18 deletions.
    40 changes: 22 additions & 18 deletions min-char-rnn.py
    @@ -1,11 +1,11 @@
     """
    -Minimal character-level demo. Written by Andrej Karpathy (@karpathy)
    +Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
     BSD License
     """
     import numpy as np

     # data I/O
    -data = open('data.txt', 'r').read() # should be simple plain text file
    +data = open('input.txt', 'r').read() # should be simple plain text file
     chars = list(set(data))
     print '%d unique characters in data.' % (len(chars), )
     vocab_size = len(chars)

    @@ -14,10 +14,9 @@
     ix_to_char = { i:ch for i,ch in enumerate(chars) }

     # hyperparameters
    -hidden_size = 50 # size of hidden layer of neurons
    -seq_length = 20 # number of steps to unroll the RNN for
    -base_learning_rate = 0.01
    -learning_rate_decay = 0.85 # every 1000 iteration learning rate gets divided by this
    +hidden_size = 100 # size of hidden layer of neurons
    +seq_length = 25 # number of steps to unroll the RNN for
    +learning_rate = 1e-1

     # model parameters
     Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden

    @@ -29,7 +28,7 @@
     def lossFun(inputs, targets, hprev):
       """
       inputs,targets are both list of integers.
    -  hprev is Hx1 array of initial
    +  hprev is Hx1 array of initial hidden state
       returns the loss, gradients on model parameters, and last hidden state
       """
       xs, hs, ys, ps = {}, {}, {}, {}

    @@ -58,7 +57,8 @@ def lossFun(inputs, targets, hprev):
         dWxh += np.dot(dhraw, xs[t].T)
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)
    -
    +  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    +    dparam = np.clip(dparam, -1, 1) # clip to mitigate exploding gradients
       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):

    @@ -80,7 +80,10 @@ def sample(h, seed_ix, n):
       return ixes

     n, p = 0, 0
    -while n < 20000:
    +mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    +mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
    +smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
    +while True:
       # prepare inputs (we're sweeping from left to right in steps seq_length long)
       if p+seq_length+1 >= len(data) or n == 0:
         hprev = np.zeros((hidden_size,1)) # reset RNN memory

    @@ -90,18 +93,19 @@ def sample(h, seed_ix, n):

       # sample from the model now and then
       if n % 100 == 0:
    -    sample_ix = sample(hprev, inputs[0], 40)
    -    print 'sample:'
    -    print ''.join(ix_to_char[ix] for ix in sample_ix)
    +    sample_ix = sample(hprev, inputs[0], 200)
    +    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    +    print '----\n %s \n----' % (txt, )

       # forward seq_length characters through the net and fetch gradient
       loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    -  if p == 0: print 'iter %d, loss: %f' % (n, loss) # print progress each epoch
    +  smooth_loss = smooth_loss * 0.999 + loss * 0.001
    +  if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress

    -  # perform parameter update with vanilla SGD, decay learning rate
    -  learning_rate = base_learning_rate * np.power(learning_rate_decay, n/1000.0)
    -  for param, dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
    -    param += -learning_rate * dparam
    +  # perform parameter update with Adagrad
    +  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby]):
    +    mem += dparam*dparam
    +    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

       p += seq_length # move data pointer
    -  n += 1 # iteration counter
    +  n += 1 # iteration counter
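
    The headline change in this revision is the optimizer: decayed vanilla SGD is replaced by Adagrad, which keeps a running sum of squared gradients per parameter (mWxh, mWhh, mWhy, mbh, mby) and divides each step by the square root of that sum, so coordinates that have already received large gradients take smaller steps. The revision also tracks smooth_loss, an exponential moving average of the per-chunk loss, purely for nicer progress printing. A minimal sketch of the same Adagrad update applied to a single toy parameter vector (Python 3 print syntax; the gradient values are made up):

        import numpy as np

        learning_rate = 1e-1
        param = np.zeros(3)          # toy parameter vector
        mem = np.zeros_like(param)   # Adagrad memory: running sum of squared gradients

        for step in range(3):
            dparam = np.array([1.0, 0.1, -2.0])  # pretend gradient for this step
            mem += dparam * dparam               # accumulate squared gradients
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update
            print(step, param)                   # per-coordinate steps shrink over time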
  4. @karpathy karpathy revised this gist Jul 26, 2015. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions min-char-rnn.py
    @@ -51,14 +51,14 @@ def lossFun(inputs, targets, hprev):
         dy = np.copy(ps[t])
         dy[targets[t]] -= 1 # backprop into y
         dWhy += np.dot(dy, hs[t].T)
    -    dby += np.copy(dy)
    +    dby += dy
         dh = np.dot(Why.T, dy) + dhnext # backprop into h
         dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
         dbh += dhraw
         dWxh += np.dot(dhraw, xs[t].T)
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)

       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):

    @@ -92,11 +92,11 @@ def sample(h, seed_ix, n):
       if n % 100 == 0:
         sample_ix = sample(hprev, inputs[0], 40)
         print 'sample:'
    -    print ''.join([ix_to_char[ix] for ix in sample_ix])
    +    print ''.join(ix_to_char[ix] for ix in sample_ix)

       # forward seq_length characters through the net and fetch gradient
       loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    -  if p == 0: print 'iter %d, loss: %f' % (n, loss) # print progress each epoch
    +  if p == 0: print 'iter %d, loss: %f' % (n, loss) # print progress each epoch

       # perform parameter update with vanilla SGD, decay learning rate
       learning_rate = base_learning_rate * np.power(learning_rate_decay, n/1000.0)
  5. @karpathy karpathy revised this gist Jul 26, 2015. 1 changed file with 11 additions and 12 deletions.
    23 changes: 11 additions & 12 deletions min-char-rnn.py
    @@ -42,40 +42,39 @@ def lossFun(inputs, targets, hprev):
         hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
         ys[t] = np.dot(Why, hs[t]) + by
         ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
    -    loss += -np.log(ps[t][targets[t],0])
    +    loss += -np.log(ps[t][targets[t],0]) # softmax ("cross-entropy loss")
       # backward pass: compute gradients going backwards
       dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
       dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    -  dhs, dys = {}, {}
       dhnext = np.zeros_like(hs[0])
       for t in reversed(xrange(len(inputs))):
    -    dys[t] = np.copy(ps[t])
    -    dys[t][targets[t]] -= 1 # backprop into y
    -    dWhy += np.dot(dys[t], hs[t].T)
    -    dby += np.copy(dys[t])
    -    dhs[t] = np.dot(Why.T, dys[t]) + dhnext # backprop into h
    -    dhraw = (1 - hs[t] * hs[t]) * dhs[t] # backprop through tanh nonlinearity
    +    dy = np.copy(ps[t])
    +    dy[targets[t]] -= 1 # backprop into y
    +    dWhy += np.dot(dy, hs[t].T)
    +    dby += np.copy(dy)
    +    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    +    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
         dbh += dhraw
         dWxh += np.dot(dhraw, xs[t].T)
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)

       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):
       """
       sample a sequence of integers from the model
       h is memory state, seed_ix is seed letter for first time step
       """
    -  x = np.zeros((vocab_size,1))
    +  x = np.zeros((vocab_size, 1))
       x[seed_ix] = 1
       ixes = []
       for t in xrange(n):
         h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
         y = np.dot(Why, h) + by
         p = np.exp(y) / np.sum(np.exp(y))
         ix = np.random.choice(range(vocab_size), p=p.ravel())
    -    x = np.zeros((vocab_size,1))
    +    x = np.zeros((vocab_size, 1))
         x[ix] = 1
         ixes.append(ix)
       return ixes

    @@ -101,7 +100,7 @@ def sample(h, seed_ix, n):

       # perform parameter update with vanilla SGD, decay learning rate
       learning_rate = base_learning_rate * np.power(learning_rate_decay, n/1000.0)
    -  for param,dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
    +  for param, dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
         param += -learning_rate * dparam

       p += seq_length # move data pointer
  6. @karpathy karpathy created this gist Jul 26, 2015.
    108 changes: 108 additions & 0 deletions min-char-rnn.py
    @@ -0,0 +1,108 @@
    """
    Minimal character-level demo. Written by Andrej Karpathy (@karpathy)
    BSD License
    """
    import numpy as np

    # data I/O
    data = open('data.txt', 'r').read() # should be simple plain text file
    chars = list(set(data))
    print '%d unique characters in data.' % (len(chars), )
    vocab_size = len(chars)
    data_size = len(data)
    char_to_ix = { ch:i for i,ch in enumerate(chars) }
    ix_to_char = { i:ch for i,ch in enumerate(chars) }

    # hyperparameters
    hidden_size = 50 # size of hidden layer of neurons
    seq_length = 20 # number of steps to unroll the RNN for
    base_learning_rate = 0.01
    learning_rate_decay = 0.85 # every 1000 iteration learning rate gets divided by this

    # model parameters
    Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
    Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
    Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
    bh = np.zeros((hidden_size, 1)) # hidden bias
    by = np.zeros((vocab_size, 1)) # output bias

    def lossFun(inputs, targets, hprev):
      """
      inputs,targets are both list of integers.
      hprev is Hx1 array of initial
      returns the loss, gradients on model parameters, and last hidden state
      """
      xs, hs, ys, ps = {}, {}, {}, {}
      hs[-1] = np.copy(hprev)
      loss = 0
      # forward pass
      for t in xrange(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        ys[t] = np.dot(Why, hs[t]) + by
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
        loss += -np.log(ps[t][targets[t],0])
      # backward pass: compute gradients going backwards
      dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
      dbh, dby = np.zeros_like(bh), np.zeros_like(by)
      dhs, dys = {}, {}
      dhnext = np.zeros_like(hs[0])
      for t in reversed(xrange(len(inputs))):
        dys[t] = np.copy(ps[t])
        dys[t][targets[t]] -= 1 # backprop into y
        dWhy += np.dot(dys[t], hs[t].T)
        dby += np.copy(dys[t])
        dhs[t] = np.dot(Why.T, dys[t]) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dhs[t] # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)

      return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

    def sample(h, seed_ix, n):
      """
      sample a sequence of integers from the model
      h is memory state, seed_ix is seed letter for first time step
      """
      x = np.zeros((vocab_size,1))
      x[seed_ix] = 1
      ixes = []
      for t in xrange(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size,1))
        x[ix] = 1
        ixes.append(ix)
      return ixes

    n, p = 0, 0
    while n < 20000:
      # prepare inputs (we're sweeping from left to right in steps seq_length long)
      if p+seq_length+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data
      inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
      targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

      # sample from the model now and then
      if n % 100 == 0:
        sample_ix = sample(hprev, inputs[0], 40)
        print 'sample:'
        print ''.join([ix_to_char[ix] for ix in sample_ix])

      # forward seq_length characters through the net and fetch gradient
      loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
      if p == 0: print 'iter %d, loss: %f' % (n, loss) # print progress each epoch

      # perform parameter update with vanilla SGD, decay learning rate
      learning_rate = base_learning_rate * np.power(learning_rate_decay, n/1000.0)
      for param,dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
        param += -learning_rate * dparam

      p += seq_length # move data pointer
      n += 1 # iteration counter
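
    The least obvious line in this original version is dys[t][targets[t]] -= 1: for a softmax followed by a cross-entropy loss, the gradient of the loss with respect to the unnormalized scores is simply the predicted probabilities minus the one-hot target. A small numerical check of that identity, independent of the gist (Python 3 syntax; the score vector and target index are made up):

        import numpy as np

        def loss_fn(y, target):
            p = np.exp(y) / np.sum(np.exp(y))  # softmax
            return -np.log(p[target])          # cross-entropy against a one-hot target

        y = np.array([0.5, -1.2, 2.0, 0.1])    # toy unnormalized scores
        target = 2

        # analytic gradient, as used in the gist's backward pass: dy = p; dy[target] -= 1
        p = np.exp(y) / np.sum(np.exp(y))
        dy = np.copy(p)
        dy[target] -= 1

        # centered finite-difference gradient for comparison
        num = np.zeros_like(y)
        eps = 1e-5
        for i in range(len(y)):
            yp, ym = np.copy(y), np.copy(y)
            yp[i] += eps
            ym[i] -= eps
            num[i] = (loss_fn(yp, target) - loss_fn(ym, target)) / (2 * eps)

        print(np.max(np.abs(dy - num)))  # tiny (~1e-10): the two gradients agree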