### First, tokenize the input
import torch
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased', do_basic_tokenize=False)

text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"

# Tokenized input, with the special tokens ([CLS], [SEP]) added around and between the two sentences
indexed_tokens = tokenizer.encode(text_1, text_2, add_special_tokens=True)

### Get the hidden states computed by `BertModel`
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
segments_tensors = torch.tensor([segments_ids])
tokens_tensor = torch.tensor([indexed_tokens])

model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-cased')

with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, token_type_ids=segments_tensors)

### Predict masked tokens using `BertForMaskedLM`
# Mask the token that we will try to predict back (index 8 is the second "Jim")
masked_index = 8
indexed_tokens[masked_index] = tokenizer.convert_tokens_to_ids('[MASK]')
tokens_tensor = torch.tensor([indexed_tokens])

maskedLM_model = torch.hub.load('huggingface/pytorch-transformers', 'modelWithLMHead', 'bert-base-cased')

with torch.no_grad():
    predictions = maskedLM_model(tokens_tensor, token_type_ids=segments_tensors)

# Get the predicted token (predictions[0] holds the vocabulary scores for every position)
predicted_index = torch.argmax(predictions[0][0], dim=1)[masked_index].item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
assert predicted_token == 'Jim'

### Question answering using `BertForQuestionAnswering`
questionAnswering_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-base-cased')

# Predict the start and end position logits of the answer span
with torch.no_grad():
    start_logits, end_logits = questionAnswering_model(tokens_tensor, token_type_ids=segments_tensors)

# Or get the total loss, which is the sum of the cross-entropy losses for the start and end token positions
# (set the model to train mode first if used for training)
start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
total_loss, start_logits, end_logits = questionAnswering_model(tokens_tensor, token_type_ids=segments_tensors, start_positions=start_positions, end_positions=end_positions)

### Classify a sequence using `BertForSequenceClassification`
seqClassification_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-cased', num_labels=2)

# Predict the sequence classification logits (the model returns a tuple; the logits come first)
with torch.no_grad():
    seq_classif_logits = seqClassification_model(tokens_tensor, token_type_ids=segments_tensors)[0]

# Or get the sequence classification loss (set the model to train mode first if used for training)
labels = torch.tensor([1])
seq_classif_loss, seq_classif_logits = seqClassification_model(tokens_tensor, token_type_ids=segments_tensors, labels=labels)
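
### Going further: inspect the top candidates for the masked position
# A minimal sketch, reusing `predictions`, `masked_index` and `tokenizer`
# from the masked-LM section above, to list the five vocabulary entries
# the language-model head scores highest at the masked position.
top_scores, top_indices = torch.topk(predictions[0][0, masked_index], k=5)
for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
    print(tokenizer.convert_ids_to_tokens([idx])[0], score)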
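
### Going further: decode a predicted answer span from the logits
# A minimal sketch under the same setup as the question-answering section
# above. Note that the QA head loaded via torch.hub for 'bert-base-cased'
# is newly initialised, so the decoded span is arbitrary until the model
# has been fine-tuned on a QA dataset such as SQuAD.
answer_start = torch.argmax(start_logits, dim=1).item()  # most likely start position
answer_end = torch.argmax(end_logits, dim=1).item()      # most likely end position
# The slice is empty if the predicted end precedes the predicted start
answer_tokens = tokenizer.convert_ids_to_tokens(indexed_tokens[answer_start:answer_end + 1])
print(' '.join(answer_tokens))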
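
### Going further: turn the classification logits into probabilities
# A minimal sketch: a softmax over the label dimension of `seq_classif_logits`
# from the sequence-classification section above. The classification head is
# also newly initialised, so these probabilities are meaningless until the
# model has been fine-tuned on a labelled dataset.
import torch.nn.functional as F
probs = F.softmax(seq_classif_logits, dim=-1)         # shape (1, num_labels)
predicted_label = torch.argmax(probs, dim=-1).item()  # index of the most probable label
print(probs, predicted_label)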