Skip to content

Instantly share code, notes, and snippets.

@JanSchm
Last active June 27, 2022 17:09
Show Gist options
  • Save JanSchm/6b832194a49c456a228321472b0784bd to your computer and use it in GitHub Desktop.
# Load the ALBERT preprocessing model from TF Hub as a Keras layer.
# Use the canonical https scheme — the original "http://" handle only
# resolves via a redirect and can fail in restricted environments.
albert_tokenizer = hub.KerasLayer("https://tfhub.dev/tensorflow/albert_en_preprocess/2")
# Define Data Generator function for online learning
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that tokenizes text batches on the fly (online learning).

    Each item is ``([input_word_ids, input_mask, input_type_ids], labels)``,
    ready to feed a three-input ALBERT classification model.
    """

    def __init__(self, data, tokenizer, batch_size):
        # data: DataFrame with 'headline_description' (text) and
        #       'category_id' (label) columns — assumed from column access
        #       in __getitem__; confirm against the caller.
        # tokenizer: the TF Hub ALBERT preprocessing layer loaded above.
        self.data = data
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def __len__(self):
        # Number of batches per epoch; the last batch may be short.
        # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24,
        # so `.astype(np.int)` raises AttributeError on modern NumPy.
        # The builtin int() conversion is equivalent and version-safe.
        return int(np.ceil(len(self.data) / float(self.batch_size)))

    def __getitem__(self, idx):
        # Slice the idx-th batch of rows and tokenize its text column.
        df_batch = self.data.iloc[idx * self.batch_size : (idx + 1) * self.batch_size]
        preprocessed_tokens = self.tokenizer(tf.constant(df_batch['headline_description'].tolist()))
        y = df_batch['category_id'].values
        return [preprocessed_tokens['input_word_ids'],
                preprocessed_tokens['input_mask'],
                preprocessed_tokens['input_type_ids']], y
# Batch generators for model.fit(); df_train, df_val and BATCH_SIZE are
# defined elsewhere in the notebook/script (not visible here).
train_gen = DataGenerator(df_train, albert_tokenizer, BATCH_SIZE)
val_gen = DataGenerator(df_val, albert_tokenizer, BATCH_SIZE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment