Created
August 21, 2020 10:23
-
-
Save emillykkejensen/aa7535c29538a956d5b9c41e31f731a1 to your computer and use it in GitHub Desktop.
Multi-Label, Multi-Class Text Classification with BERT, Transformer and Keras
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################### | |
### -------- Load libraries ------- ### | |
# Load Huggingface transformers | |
from transformers import TFBertModel, BertConfig, BertTokenizerFast | |
# Then what you need from tensorflow.keras | |
from tensorflow.keras.layers import Input, Dropout, Dense | |
from tensorflow.keras.models import Model | |
from tensorflow.keras.optimizers import Adam | |
from tensorflow.keras.callbacks import EarlyStopping | |
from tensorflow.keras.initializers import TruncatedNormal | |
from tensorflow.keras.losses import CategoricalCrossentropy | |
from tensorflow.keras.metrics import CategoricalAccuracy | |
from tensorflow.keras.utils import to_categorical | |
# And pandas for data import + sklearn because you always need sklearn
import pandas as pd | |
from sklearn.model_selection import train_test_split | |
#######################################
### --------- Import data --------- ###

# Import data from csv
data = pd.read_csv('dev/Fun with BERT/complaints.csv')

# Keep only the free-text column and the two target columns
data = data[['Consumer complaint narrative', 'Product', 'Issue']]

# Remove a row if any of the three remaining columns are missing
data = data.dropna()

# Remove rows whose label is present only once (a single-member class can't
# be split). Dropping a singleton 'Product' row can leave an 'Issue' class
# with one member (and vice versa), so both filters are repeated until no
# more rows are removed. 'Issue' is filtered last because the split below
# stratifies on it and fails on a single-member class.
while True:
    rows_before = len(data)
    data = data.groupby('Product').filter(lambda group: len(group) > 1)
    data = data.groupby('Issue').filter(lambda group: len(group) > 1)
    if len(data) == rows_before:
        break

# Set your model output as categorical and save in new label col
data['Issue_label'] = pd.Categorical(data['Issue'])
data['Product_label'] = pd.Categorical(data['Product'])

# Transform your output to numeric (integer category codes)
data['Issue'] = data['Issue_label'].cat.codes
data['Product'] = data['Product_label'].cat.codes

# Split into train and test - stratify over Issue
data, data_test = train_test_split(data, test_size=0.2, stratify=data['Issue'])
#######################################
### --------- Setup BERT ---------- ###

# Checkpoint name and the maximum number of tokens per sample
model_name = 'bert-base-uncased'
max_length = 100

# Fetch the checkpoint's configuration; hidden states are not requested
# as a model output
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False

# Tokenizer that goes with the checkpoint
tokenizer = BertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_name,
    config=config,
)

# Pretrained Transformers BERT model, built from the configuration above
transformer_model = TFBertModel.from_pretrained(model_name, config=config)
#######################################
### ------- Build the model ------- ###

# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model

# Load the MainLayer
bert = transformer_model.layers[0]

# Build your model input: one fixed-length row of token ids per sample
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
# NOTE(review): no attention mask is fed to BERT, so padding tokens are
# attended to like real tokens. Enabling the two lines below also requires
# passing 'attention_mask' in the x dicts given to model.fit/model.evaluate.
# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
inputs = {'input_ids': input_ids}

# Load the Transformers BERT model as a layer in a Keras model and take the
# second element of its output as the pooled representation
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# Do not pin `training=False` here: that turns the layer into a permanent
# no-op. Left unset, Keras enables dropout during fit() and disables it
# during evaluate()/predict().
pooled_output = dropout(bert_model)

# Then build your model output: one Dense head of logits per target, sized
# by the number of categories of the corresponding label column
issue_units = len(data.Issue_label.value_counts())
product_units = len(data.Product_label.value_counts())
issue = Dense(
    units=issue_units,
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='issue',
)(pooled_output)
product = Dense(
    units=product_units,
    kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
    name='product',
)(pooled_output)
outputs = {'issue': issue, 'product': product}

# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

# Take a look at the model
model.summary()
#######################################
### ------- Train the model ------- ###

# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics (the output heads emit logits, hence from_logits=True)
loss = {
    'issue': CategoricalCrossentropy(from_logits=True),
    'product': CategoricalCrossentropy(from_logits=True),
}
metric = {
    'issue': CategoricalAccuracy('accuracy'),
    'product': CategoricalAccuracy('accuracy'),
}

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)

# Ready output data for the model.
# num_classes is given explicitly: without it to_categorical sizes the
# one-hot matrix from the largest code present in this split, which can be
# narrower than the output head when the highest class is missing (the
# split is only stratified on 'Issue').
y_issue = to_categorical(
    data['Issue'],
    num_classes=len(data['Issue_label'].cat.categories))
y_product = to_categorical(
    data['Product'],
    num_classes=len(data['Product_label'].cat.categories))

# Tokenize the input (takes some time).
# padding='max_length' pads every sample to exactly max_length tokens so the
# array always matches the model's fixed-size 'input_ids' input; padding=True
# would only pad to the longest sample in the data.
x = tokenizer(
    text=data['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

# Fit the model
history = model.fit(
    # x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue, 'product': y_product},
    validation_split=0.2,
    batch_size=64,
    epochs=10)
#######################################
### ----- Evaluate the model ------ ###

# Ready test data.
# num_classes is given explicitly so the one-hot width always equals the
# number of label categories, even when the test split happens not to
# contain the highest class code (the split is only stratified on 'Issue').
test_y_issue = to_categorical(
    data_test['Issue'],
    num_classes=len(data_test['Issue_label'].cat.categories))
test_y_product = to_categorical(
    data_test['Product'],
    num_classes=len(data_test['Product_label'].cat.categories))

# padding='max_length' pads every sample to exactly max_length tokens so the
# array always matches the model's fixed-size 'input_ids' input; padding=True
# would only pad to the longest sample in the test data.
test_x = tokenizer(
    text=data_test['Consumer complaint narrative'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=False,
    verbose=True)

# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': test_x['input_ids']},
    y={'issue': test_y_issue, 'product': test_y_product}
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I think your tutorial is great. However, like the other people who asked you to post the prediction code on the towardsdatascience website,
I could not get predictions, a confusion matrix, or a classification report working either.
Below is my code. I would appreciate any help, as I am a beginner.
predicted_raw = model.predict({'input_ids':x_test['input_ids']})
y_predicted = numpy.argmax(predicted_raw, axis = 1)
The error occurs on this line: y_predicted = numpy.argmax(predicted_raw, axis = 1). The error message says "axis 1 is out of bounds for array of dimension 1". When I change the axis to zero, the new error message is "Singleton array 0 cannot be considered a valid collection." I think the axis=0 error means that y_predicted is null; I double-checked this with an if statement.