Created December 3, 2019 10:15
-
-
Save tdiggelm/fc86bad6850e83371fdfb72b693d293e to your computer and use it in GitHub Desktop.
Test TPU
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
! pip3 install nltk --user | |
import os | |
import tensorflow as tf | |
import numpy as np | |
print(tf.__version__) | |
import nltk | |
nltk.download('movie_reviews') | |
nltk.download('punkt') | |
from nltk.corpus import movie_reviews as mov | |
# Connect to the TPU named by the TPU_NAME environment variable and build a
# distribution strategy for it. Raises KeyError if TPU_NAME is not set.
tpu_address = os.environ['TPU_NAME']
cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
    tpu=tpu_address)
tf.config.experimental_connect_to_cluster(cluster_resolver)
tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
# NOTE(review): newer TensorFlow releases expose this as
# tf.distribute.TPUStrategy; confirm against the installed version before
# switching.
tpu_strategy = tf.distribute.experimental.TPUStrategy(cluster_resolver)
# Model hyperparameters.
vocab_size = 30000   # input dimension of the Embedding layer
embd_size = 128      # width of each token embedding
hidden_size = 256    # GRU units per direction
n_categories = 2     # number of output logits

# Build and compile the model inside the strategy scope so that the variables
# created by the layers, loss, metrics and optimizer belong to tpu_strategy.
with tpu_strategy.scope():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(vocab_size, embd_size),
        tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(hidden_size, return_sequences=True)),
        # Average the per-step GRU outputs into one vector per sequence.
        tf.keras.layers.GlobalAveragePooling1D(),
        # No activation: the loss below is configured with from_logits=True.
        tf.keras.layers.Dense(n_categories),
    ])
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    model.compile(optimizer='adam', loss=loss, metrics=metrics)
def random_split(*arrays, train_size=0.8, seed=None):
    """Randomly split aligned arrays into train and test parts.

    A single permutation of the first axis is applied to every array, so row
    ``i`` of each train (or test) part originates from the same input row.

    Args:
        *arrays: NumPy arrays that share the same first-axis length.
        train_size: Fraction of rows placed in the train part; the row count
            is ``int(train_size * n_rows)``.
        seed: Optional seed for a reproducible split. ``None`` (the default)
            draws from NumPy's global random state.

    Returns:
        A list with one ``(train, test)`` tuple per input array, in the order
        the arrays were passed.

    Raises:
        ValueError: If the arrays do not all have the same first-axis length.
    """
    n_all = arrays[0].shape[0]
    # Guard against misaligned inputs: indexing a longer array with the
    # permutation of a shorter one would silently drop rows.
    for position, arr in enumerate(arrays):
        if arr.shape[0] != n_all:
            raise ValueError(
                "all arrays must have the same first-axis length: "
                "array 0 has %d rows, array %d has %d"
                % (n_all, position, arr.shape[0]))

    n_train = int(train_size * n_all)
    if seed is None:
        ind = np.random.permutation(n_all)
    else:
        # A private generator keeps the global random state untouched.
        ind = np.random.RandomState(seed).permutation(n_all)
    ind_train = ind[:n_train]
    ind_test = ind[n_train:]
    return [(arr[ind_train], arr[ind_test]) for arr in arrays]
# Load every review as raw text, labelled with its first corpus category.
fileids = mov.fileids()
texts = [mov.raw(fid) for fid in fileids]
labels = [mov.categories(fid)[0] for fid in fileids]

# Turn the texts into integer sequences with a vocabulary capped at
# vocab_size, then pad/truncate each sequence to a fixed length of 512.
tok = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tok.fit_on_texts(texts)
x_all = tok.texts_to_sequences(texts)
x_all = tf.keras.preprocessing.sequence.pad_sequences(x_all, maxlen=512)

# Binary targets: 1 for a 'pos' review, 0 for anything else.
y_all = np.array([label == 'pos' for label in labels], dtype=np.int32)

# Hold out 20% of the reviews (random_split's default) for validation.
(x_train, x_test), (y_train, y_test) = random_split(x_all, y_all)
model.fit(x_train, y_train, batch_size=64, epochs=10,
          validation_data=(x_test, y_test))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.