@cleemesser
Forked from karpathy/min-char-rnn.py
Created October 7, 2015 18:31

Revisions

  1. @karpathy karpathy revised this gist Aug 5, 2015. 1 changed file with 3 additions and 4 deletions.
    7 changes: 3 additions & 4 deletions min-char-rnn.py
    @@ -7,9 +7,8 @@
     # data I/O
     data = open('input.txt', 'r').read() # should be simple plain text file
     chars = list(set(data))
    -print '%d unique characters in data.' % (len(chars), )
    -vocab_size = len(chars)
    -data_size = len(data)
    +data_size, vocab_size = len(data), len(chars)
    +print 'data has %d characters, %d unique.' % (data_size, vocab_size)
     char_to_ix = { ch:i for i,ch in enumerate(chars) }
     ix_to_char = { i:ch for i,ch in enumerate(chars) }

    @@ -58,7 +57,7 @@ def lossFun(inputs, targets, hprev):
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)
       for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    -    np.clip(dparam, -1, 1, out=dparam) # clip to mitigate exploding gradients
    +    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):
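
    This revision reworks the startup printout of data/vocab size and, more importantly for training, widens the gradient-clipping bound from [-1, 1] to [-5, 5]: every element of every gradient array is limited to that range before the parameter update, a common guard against exploding gradients in vanilla RNNs. A minimal standalone sketch of what the clipping step does (Python 3 print syntax here, unlike the gist's Python 2; the toy gradient values are made up for illustration):

        import numpy as np

        # a toy "gradient" with a couple of oversized entries (illustrative values)
        dparam = np.array([0.3, -7.2, 4.9, 12.0, -0.1])

        # clip element-wise and in place, as in the revised lossFun: entries outside
        # [-5, 5] are replaced by the nearest bound, everything else is untouched
        np.clip(dparam, -5, 5, out=dparam)

        print(dparam)  # prints something like [ 0.3 -5.   4.9  5.  -0.1]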
  2. @karpathy karpathy revised this gist Aug 5, 2015. 1 changed file with 10 additions and 8 deletions.
    18 changes: 10 additions & 8 deletions min-char-rnn.py
    @@ -38,10 +38,10 @@ def lossFun(inputs, targets, hprev):
       for t in xrange(len(inputs)):
         xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
         xs[t][inputs[t]] = 1
    -    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
    -    ys[t] = np.dot(Why, hs[t]) + by
    -    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
    -    loss += -np.log(ps[t][targets[t],0]) # softmax ("cross-entropy loss")
    +    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    +    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    +    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    +    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
       # backward pass: compute gradients going backwards
       dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
       dbh, dby = np.zeros_like(bh), np.zeros_like(by)

    @@ -58,7 +58,7 @@ def lossFun(inputs, targets, hprev):
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)
       for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    -    dparam = np.clip(dparam, -1, 1) # clip to mitigate exploding gradients
    +    np.clip(dparam, -1, 1, out=dparam) # clip to mitigate exploding gradients
       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):

    @@ -103,9 +103,11 @@ def sample(h, seed_ix, n):
       if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress

       # perform parameter update with Adagrad
    -  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby]):
    -    mem += dparam*dparam
    +  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
    +                                [dWxh, dWhh, dWhy, dbh, dby],
    +                                [mWxh, mWhh, mWhy, mbh, mby]):
    +    mem += dparam * dparam
         param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

       p += seq_length # move data pointer
    -  n += 1 # iteration counter
    +  n += 1 # iteration counter
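
    Besides adding comments, this revision fixes a subtle no-op: the previous form, dparam = np.clip(dparam, -1, 1), only rebinds the loop variable to a new array and never modifies the gradient arrays that lossFun returns, so no clipping actually took place; np.clip(..., out=dparam) writes the clipped values back into the original array. A minimal sketch of the difference (Python 3 print syntax; toy values, not from the gist):

        import numpy as np

        grads = [np.array([10.0, -0.5]), np.array([-3.0, 2.0])]  # toy gradient arrays

        # old form: rebinding the loop variable leaves the arrays in `grads` untouched
        for dparam in grads:
            dparam = np.clip(dparam, -1, 1)
        print(grads[0])  # still [10.  -0.5]

        # revised form: out= clips each array in place
        for dparam in grads:
            np.clip(dparam, -1, 1, out=dparam)
        print(grads[0])  # now [ 1.  -0.5]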
  3. @karpathy karpathy revised this gist Jul 27, 2015. 1 changed file with 22 additions and 18 deletions.
    40 changes: 22 additions & 18 deletions min-char-rnn.py
    @@ -1,11 +1,11 @@
     """
    -Minimal character-level demo. Written by Andrej Karpathy (@karpathy)
    +Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
     BSD License
     """
     import numpy as np

     # data I/O
    -data = open('data.txt', 'r').read() # should be simple plain text file
    +data = open('input.txt', 'r').read() # should be simple plain text file
     chars = list(set(data))
     print '%d unique characters in data.' % (len(chars), )
     vocab_size = len(chars)

    @@ -14,10 +14,9 @@
     ix_to_char = { i:ch for i,ch in enumerate(chars) }

     # hyperparameters
    -hidden_size = 50 # size of hidden layer of neurons
    -seq_length = 20 # number of steps to unroll the RNN for
    -base_learning_rate = 0.01
    -learning_rate_decay = 0.85 # every 1000 iteration learning rate gets divided by this
    +hidden_size = 100 # size of hidden layer of neurons
    +seq_length = 25 # number of steps to unroll the RNN for
    +learning_rate = 1e-1

     # model parameters
     Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden

    @@ -29,7 +28,7 @@
     def lossFun(inputs, targets, hprev):
       """
       inputs,targets are both list of integers.
    -  hprev is Hx1 array of initial
    +  hprev is Hx1 array of initial hidden state
       returns the loss, gradients on model parameters, and last hidden state
       """
       xs, hs, ys, ps = {}, {}, {}, {}

    @@ -58,7 +57,8 @@ def lossFun(inputs, targets, hprev):
         dWxh += np.dot(dhraw, xs[t].T)
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)
    -
    +  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    +    dparam = np.clip(dparam, -1, 1) # clip to mitigate exploding gradients
       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):

    @@ -80,7 +80,10 @@ def sample(h, seed_ix, n):
       return ixes

     n, p = 0, 0
    -while n < 20000:
    +mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    +mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
    +smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
    +while True:
       # prepare inputs (we're sweeping from left to right in steps seq_length long)
       if p+seq_length+1 >= len(data) or n == 0:
         hprev = np.zeros((hidden_size,1)) # reset RNN memory

    @@ -90,18 +93,19 @@ def sample(h, seed_ix, n):

       # sample from the model now and then
       if n % 100 == 0:
    -    sample_ix = sample(hprev, inputs[0], 40)
    -    print 'sample:'
    -    print ''.join(ix_to_char[ix] for ix in sample_ix)
    +    sample_ix = sample(hprev, inputs[0], 200)
    +    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    +    print '----\n %s \n----' % (txt, )

       # forward seq_length characters through the net and fetch gradient
       loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    -  if p == 0: print 'iter %d, loss: %f' % (n, loss) # print progress each epoch
    +  smooth_loss = smooth_loss * 0.999 + loss * 0.001
    +  if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress

    -  # perform parameter update with vanilla SGD, decay learning rate
    -  learning_rate = base_learning_rate * np.power(learning_rate_decay, n/1000.0)
    -  for param, dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
    -    param += -learning_rate * dparam
    +  # perform parameter update with Adagrad
    +  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby]):
    +    mem += dparam*dparam
    +    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

       p += seq_length # move data pointer
    -  n += 1 # iteration counter
    +  n += 1 # iteration counter
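
    The headline change in this revision is the optimizer: decayed vanilla SGD is replaced by Adagrad, which keeps a running sum of squared gradients per parameter (mWxh, mWhh, mWhy, mbh, mby) and divides each step by the square root of that sum, so coordinates that have already received large gradients take smaller steps. The revision also tracks smooth_loss, an exponential moving average of the per-chunk loss, purely for nicer progress printing. A minimal sketch of the same Adagrad update applied to a single toy parameter vector (Python 3 print syntax; the gradient values are made up):

        import numpy as np

        learning_rate = 1e-1
        param = np.zeros(3)          # toy parameter vector
        mem = np.zeros_like(param)   # Adagrad memory: running sum of squared gradients

        for step in range(3):
            dparam = np.array([1.0, 0.1, -2.0])  # pretend gradient for this step
            mem += dparam * dparam               # accumulate squared gradients
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update
            print(step, param)                   # per-coordinate steps shrink over time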
  4. @karpathy karpathy revised this gist Jul 26, 2015. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions min-char-rnn.py
    @@ -51,14 +51,14 @@ def lossFun(inputs, targets, hprev):
         dy = np.copy(ps[t])
         dy[targets[t]] -= 1 # backprop into y
         dWhy += np.dot(dy, hs[t].T)
    -    dby += np.copy(dy)
    +    dby += dy
         dh = np.dot(Why.T, dy) + dhnext # backprop into h
         dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
         dbh += dhraw
         dWxh += np.dot(dhraw, xs[t].T)
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)

       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):

    @@ -92,11 +92,11 @@ def sample(h, seed_ix, n):
       if n % 100 == 0:
         sample_ix = sample(hprev, inputs[0], 40)
         print 'sample:'
    -    print ''.join([ix_to_char[ix] for ix in sample_ix])
    +    print ''.join(ix_to_char[ix] for ix in sample_ix)

       # forward seq_length characters through the net and fetch gradient
       loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    -  if p == 0: print 'iter %d, loss: %f' % (n, loss) # print progress each epoch
    +  if p == 0: print 'iter %d, loss: %f' % (n, loss) # print progress each epoch

       # perform parameter update with vanilla SGD, decay learning rate
       learning_rate = base_learning_rate * np.power(learning_rate_decay, n/1000.0)
  5. @karpathy karpathy revised this gist Jul 26, 2015. 1 changed file with 11 additions and 12 deletions.
    23 changes: 11 additions & 12 deletions min-char-rnn.py
    @@ -42,40 +42,39 @@ def lossFun(inputs, targets, hprev):
         hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
         ys[t] = np.dot(Why, hs[t]) + by
         ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
    -    loss += -np.log(ps[t][targets[t],0])
    +    loss += -np.log(ps[t][targets[t],0]) # softmax ("cross-entropy loss")
       # backward pass: compute gradients going backwards
       dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
       dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    -  dhs, dys = {}, {}
       dhnext = np.zeros_like(hs[0])
       for t in reversed(xrange(len(inputs))):
    -    dys[t] = np.copy(ps[t])
    -    dys[t][targets[t]] -= 1 # backprop into y
    -    dWhy += np.dot(dys[t], hs[t].T)
    -    dby += np.copy(dys[t])
    -    dhs[t] = np.dot(Why.T, dys[t]) + dhnext # backprop into h
    -    dhraw = (1 - hs[t] * hs[t]) * dhs[t] # backprop through tanh nonlinearity
    +    dy = np.copy(ps[t])
    +    dy[targets[t]] -= 1 # backprop into y
    +    dWhy += np.dot(dy, hs[t].T)
    +    dby += np.copy(dy)
    +    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    +    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
         dbh += dhraw
         dWxh += np.dot(dhraw, xs[t].T)
         dWhh += np.dot(dhraw, hs[t-1].T)
         dhnext = np.dot(Whh.T, dhraw)

       return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

     def sample(h, seed_ix, n):
       """
       sample a sequence of integers from the model
       h is memory state, seed_ix is seed letter for first time step
       """
    -  x = np.zeros((vocab_size,1))
    +  x = np.zeros((vocab_size, 1))
       x[seed_ix] = 1
       ixes = []
       for t in xrange(n):
         h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
         y = np.dot(Why, h) + by
         p = np.exp(y) / np.sum(np.exp(y))
         ix = np.random.choice(range(vocab_size), p=p.ravel())
    -    x = np.zeros((vocab_size,1))
    +    x = np.zeros((vocab_size, 1))
         x[ix] = 1
         ixes.append(ix)
       return ixes

    @@ -101,7 +100,7 @@ def sample(h, seed_ix, n):

       # perform parameter update with vanilla SGD, decay learning rate
       learning_rate = base_learning_rate * np.power(learning_rate_decay, n/1000.0)
    -  for param,dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
    +  for param, dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
         param += -learning_rate * dparam

       p += seq_length # move data pointer
  6. @karpathy karpathy created this gist Jul 26, 2015.
    108 changes: 108 additions & 0 deletions min-char-rnn.py
    @@ -0,0 +1,108 @@
    """
    Minimal character-level demo. Written by Andrej Karpathy (@karpathy)
    BSD License
    """
    import numpy as np

    # data I/O
    data = open('data.txt', 'r').read() # should be simple plain text file
    chars = list(set(data))
    print '%d unique characters in data.' % (len(chars), )
    vocab_size = len(chars)
    data_size = len(data)
    char_to_ix = { ch:i for i,ch in enumerate(chars) }
    ix_to_char = { i:ch for i,ch in enumerate(chars) }

    # hyperparameters
    hidden_size = 50 # size of hidden layer of neurons
    seq_length = 20 # number of steps to unroll the RNN for
    base_learning_rate = 0.01
    learning_rate_decay = 0.85 # every 1000 iteration learning rate gets divided by this

    # model parameters
    Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
    Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
    Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
    bh = np.zeros((hidden_size, 1)) # hidden bias
    by = np.zeros((vocab_size, 1)) # output bias

    def lossFun(inputs, targets, hprev):
      """
      inputs,targets are both list of integers.
      hprev is Hx1 array of initial
      returns the loss, gradients on model parameters, and last hidden state
      """
      xs, hs, ys, ps = {}, {}, {}, {}
      hs[-1] = np.copy(hprev)
      loss = 0
      # forward pass
      for t in xrange(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        ys[t] = np.dot(Why, hs[t]) + by
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
        loss += -np.log(ps[t][targets[t],0])
      # backward pass: compute gradients going backwards
      dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
      dbh, dby = np.zeros_like(bh), np.zeros_like(by)
      dhs, dys = {}, {}
      dhnext = np.zeros_like(hs[0])
      for t in reversed(xrange(len(inputs))):
        dys[t] = np.copy(ps[t])
        dys[t][targets[t]] -= 1 # backprop into y
        dWhy += np.dot(dys[t], hs[t].T)
        dby += np.copy(dys[t])
        dhs[t] = np.dot(Why.T, dys[t]) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dhs[t] # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)

      return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

    def sample(h, seed_ix, n):
      """
      sample a sequence of integers from the model
      h is memory state, seed_ix is seed letter for first time step
      """
      x = np.zeros((vocab_size,1))
      x[seed_ix] = 1
      ixes = []
      for t in xrange(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size,1))
        x[ix] = 1
        ixes.append(ix)
      return ixes

    n, p = 0, 0
    while n < 20000:
      # prepare inputs (we're sweeping from left to right in steps seq_length long)
      if p+seq_length+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data
      inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
      targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

      # sample from the model now and then
      if n % 100 == 0:
        sample_ix = sample(hprev, inputs[0], 40)
        print 'sample:'
        print ''.join([ix_to_char[ix] for ix in sample_ix])

      # forward seq_length characters through the net and fetch gradient
      loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
      if p == 0: print 'iter %d, loss: %f' % (n, loss) # print progress each epoch

      # perform parameter update with vanilla SGD, decay learning rate
      learning_rate = base_learning_rate * np.power(learning_rate_decay, n/1000.0)
      for param,dparam in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby]):
        param += -learning_rate * dparam

      p += seq_length # move data pointer
      n += 1 # iteration counter
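
    The least obvious line in this original version is dys[t][targets[t]] -= 1: for a softmax followed by a cross-entropy loss, the gradient of the loss with respect to the unnormalized scores is simply the predicted probabilities minus the one-hot target. A small numerical check of that identity, independent of the gist (Python 3 syntax; the score vector and target index are made up):

        import numpy as np

        def loss_fn(y, target):
            p = np.exp(y) / np.sum(np.exp(y))  # softmax
            return -np.log(p[target])          # cross-entropy against a one-hot target

        y = np.array([0.5, -1.2, 2.0, 0.1])    # toy unnormalized scores
        target = 2

        # analytic gradient, as used in the gist's backward pass: dy = p; dy[target] -= 1
        p = np.exp(y) / np.sum(np.exp(y))
        dy = np.copy(p)
        dy[target] -= 1

        # centered finite-difference gradient for comparison
        num = np.zeros_like(y)
        eps = 1e-5
        for i in range(len(y)):
            yp, ym = np.copy(y), np.copy(y)
            yp[i] += eps
            ym[i] -= eps
            num[i] = (loss_fn(yp, target) - loss_fn(ym, target)) / (2 * eps)

        print(np.max(np.abs(dy - num)))  # tiny (~1e-10): the two gradients agree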