In this practical, we are going to work with BERT! More specifically, we are going to perform sentiment analysis of movie reviews using a transformer model, have a look under its hood, and try to explain the model predictions using SHAP.
Our BERT-variant of choice is DistilBERT, a light-weight transformer whose performance is comparable to Google's BERT base model. From the authors:
[W]e leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster.
In Part 1, we will use an off-the-shelf sentiment analysis pipeline from the Hugging Face transformers module to classify two movie reviews.
In Part 2, we will disassemble the sentiment analysis pipeline by performing the same analysis as in Part 1 step by step.
In Part 3, we will open the black box and explore which tokens were most important for DistilBERT's sentiment classification. We do this using Shapley Additive Explanations (SHAP).
In Part 4, we will fine-tune DistilBERT on the IMDB movie review dataset.
Fine-tuning a transformer model is quite resource-intensive! Switch your runtime type to GPU T4 under Runtime > Change runtime type.
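If you want to confirm from Python that the GPU runtime is active, you can run a quick check (a minimal sketch; torch comes preinstalled in Colab):
import torch # PyTorch is preinstalled in Google Colab
print("GPU available:", torch.cuda.is_available()) # True if a CUDA device is visible
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0)) # e.g. the name of the T4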
Running this practical requires a more recent version of the accelerate package than the one installed by default in Google Colab. Run the code below to upgrade accelerate.
!pip install -q -U accelerate # update accelerate
Now restart your runtime under Runtime > Restart runtime (or press Ctrl + M followed by .) and click Yes in the pop-up message.
All set? 🙂
Since sentiment analysis is a popular application, there are off-the-shelf pipelines which we can use to quickly classify documents by sentiment. One such pipeline is part of the Hugging Face transformers module.
🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. [ ... ] The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. See the task summary for examples of use. [ ... ] The `pipeline()` is the most powerful object encapsulating all other pipelines.
We install the transformers module from which we import pipeline.
!pip install -q transformers
!pip install -q Xformers
from transformers import pipeline
#for reproducibility
from transformers import set_seed
import random
import numpy as np
seed = 137
set_seed(seed)
random.seed(seed)
np.random.seed(seed)
Pre-trained BERT models are available for many different natural language processing tasks based on the General Language Understanding Evaluation (GLUE) benchmark resources.
To showcase how to use the sentiment analysis pipeline, we will compare two relatively complex IMDB reviews of Mark Mylod's 2022 movie The Menu. Load the following two reviews:
review1 = "The Menu isn't the first to satirise the rich and their incompetence and isn't saying anything new \
but that definitely doesn't prevent it from being a great satire that pokes fun at everything it can in ways that \
are often consistently funny, playful and extremely stylish. Ralph Fiennes gives a terrific performance full of awkward\
unease that only enhances his commanding screen presence. Anya Taylor-Joy is a perfect audience surrogate amongst a sea\
of deliberately unlikeable characters of which the best is Nicholas Hoult whose almost too good at making his character\
hilariously pathetic. Mark Mylod's direction is excellent, the film has more than enough visual style to match the \
pretentiousness of its characters and is really good at building tension. The music by Colin Stetson is fantastic, \
striking a unusual balance between beautiful and unnerving."
review2 = "This looked like an interesting film based on the trailer and the first half of it was just that. \
The tension and suspense was building nicely. There were little dribs and drabs and hints of what might be coming \
without being too obvious. The acting from everyone in the film was good. Even supporting characters with only a few \
lines. Were well realized I remember thinking that I couldn't wait to see where it was all going. Sadly it didn't \
really go anywhere. It all unwound in the second half. The acting was still on but the writing failed. That's the most \
i can say without giving up any spoilers. And that was extra disappointing because the first half was so good. This \
Menu did not deliver the meal as advertised."
You can skim the reviews.
print(review1)
The Menu isn't the first to satirise the rich and their incompetence and isn't saying anything new but that definitely doesn't prevent it from being a great satire that pokes fun at everything it can in ways that are often consistently funny, playful and extremely stylish. Ralph Fiennes gives a terrific performance full of awkwardunease that only enhances his commanding screen presence. Anya Taylor-Joy is a perfect audience surrogate amongst a seaof deliberately unlikeable characters of which the best is Nicholas Hoult whose almost too good at making his characterhilariously pathetic. Mark Mylod's direction is excellent, the film has more than enough visual style to match the pretentiousness of its characters and is really good at building tension. The music by Colin Stetson is fantastic, striking a unusual balance between beautiful and unnerving.
print(review2)
This looked like an interesting film based on the trailer and the first half of it was just that. The tension and suspense was building nicely. There were little dribs and drabs and hints of what might be coming without being too obvious. The acting from everyone in the film was good. Even supporting characters with only a few lines. Were well realized I remember thinking that I couldn't wait to see where it was all going. Sadly it didn't really go anywhere. It all unwound in the second half. The acting was still on but the writing failed. That's the most i can say without giving up any spoilers. And that was extra disappointing because the first half was so good. This Menu did not deliver the meal as advertised.
What is your guess of the sentiment of these reviews? On a scale of 1 to 10, what rating do you think the respective authors gave the movie?
1. Set up and fit a sentiment analysis pipeline to predict the sentiment of the two reviews. Define the model as 'distilbert-base-uncased-finetuned-sst-2-english'.
Our BERT model will be DistilBERT base uncased. Uncased means that the model disregards casing (upper or lower case). In particular, we use a DistilBERT version which has been fine-tuned for binary sentiment classification using the Stanford Sentiment Treebank (SST-2; Socher et al., 2013) corpus.
sentiment_pipeline = pipeline("sentiment-analysis", model = 'distilbert-base-uncased-finetuned-sst-2-english')
sentiment_pipeline(review1) # predict sentiment
[{'label': 'POSITIVE', 'score': 0.9983012080192566}]
sentiment_pipeline(review2) # predict sentiment
[{'label': 'NEGATIVE', 'score': 0.9622442722320557}]
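Note that the pipeline also accepts a list of texts, so both reviews can be scored in a single call (a small sketch; the results match the two outputs above):
sentiment_pipeline([review1, review2]) # returns one {'label', 'score'} dict per review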
Now we are going to show you how to build your own sentiment analysis pipeline from scratch. In practice, you can use the existing one as we just did above, but it is helpful to understand the steps involved in setting up a transformer-based pipeline for other applications you might work on.
We perform the same sentiment analysis on the same two reviews - this time step-by-step.
2. Define the tokenizer and model. For the tokenizer, use the pretrained DistilBERT tokenizer, and for the model use distilbert-base-uncased-finetuned-sst-2-english.
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
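As a quick illustration of what "uncased" means, the tokenizer lowercases text before splitting it, so casing does not change the tokens (a minimal sketch using the tokenizer we just defined):
print(tokenizer.tokenize("The Menu")) # ['the', 'menu']
print(tokenizer.tokenize("THE MENU")) # ['the', 'menu'] -- identical after lowercasing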
3. Tokenize the review1 and review2 objects. Pad and truncate the sequences, and return PyTorch (pt) tensors. Save the output object as encoding.
encoding = tokenizer([review1, review2], padding = True, truncation = True, return_tensors = 'pt') # tokenize the reviews
BERT and several other transformer models use tokenizers based on WordPiece, a subword tokenization algorithm. The main advantage of a subword tokenizer is that it interpolates between word-based and character-based tokenization. Common words get a slot in the vocabulary, but the tokenizer can fall back to word pieces and individual characters for unknown words.
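For example, 'satirise' from the first review is not in the vocabulary as a single unit and gets split into word pieces (continuation pieces are prefixed with ##), while a common word like 'funny' keeps its own slot (a small sketch using the tokenizer defined above):
print(tokenizer.tokenize("satirise")) # ['sat', '##iri', '##se'] -- split into subword pieces
print(tokenizer.tokenize("funny"))    # ['funny'] -- common word with its own vocabulary slot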
Since batched inputs (our reviews) are of different lengths, they cannot be converted to fixed-size tensors to be fed to the model.
There are two main strategies for solving this problem -- padding and truncation.
In order to create rectangular tensors from batches of varying lengths, padding adds a special padding token to ensure shorter sequences will have the same length as either the longest sequence in a batch or the maximum length accepted by the model. Truncation works in the other direction by truncating long sequences.
`padding = True`: pad to the longest sequence in the batch (no padding is applied if you only provide a single sequence). `truncation = True`: truncate to a maximum length specified by the max_length argument or the maximum length accepted by the model if no max_length is provided (max_length=None).
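You can verify the effect of padding on our batch: the input_ids tensor is rectangular, with both rows padded to the length of the longer review (a small sketch using the encoding object defined above):
print(encoding['input_ids'].shape)   # (2, length in tokens of the longest review)
print(encoding['input_ids'][1][-5:]) # trailing [PAD] tokens (id 0) if the second review is shorter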
4. Inspect the encoding object by printing the first review's input ids.
print(encoding['input_ids'][0]) # first review's input_ids
tensor([ 101, 1996, 12183, 3475, 1005, 1056, 1996, 2034, 2000, 2938, 15735, 3366, 1996, 4138, 1998, 2037, 4297, 25377, 12870, 5897, 1998, 3475, 1005, 1056, 3038, 2505, 2047, 2021, 2008, 5791, 2987, 1005, 1056, 4652, 2009, 2013, 2108, 1037, 2307, 18312, 2008, 26202, 2015, 4569, 2012, 2673, 2009, 2064, 1999, 3971, 2008, 2024, 2411, 10862, 6057, 1010, 18378, 1998, 5186, 2358, 8516, 4509, 1012, 6798, 10882, 24336, 2015, 3957, 1037, 27547, 2836, 2440, 1997, 9596, 9816, 11022, 2008, 2069, 11598, 2015, 2010, 7991, 3898, 3739, 1012, 21728, 4202, 1011, 6569, 2003, 1037, 3819, 4378, 7505, 21799, 5921, 1037, 2712, 11253, 9969, 4406, 3085, 3494, 1997, 2029, 1996, 2190, 2003, 6141, 7570, 11314, 3005, 2471, 2205, 2204, 2012, 2437, 2010, 2839, 26415, 9488, 27191, 17203, 1012, 2928, 2026, 4135, 2094, 1005, 1055, 3257, 2003, 6581, 1010, 1996, 2143, 2038, 2062, 2084, 2438, 5107, 2806, 2000, 2674, 1996, 3653, 6528, 20771, 2791, 1997, 2049, 3494, 1998, 2003, 2428, 2204, 2012, 2311, 6980, 1012, 1996, 2189, 2011, 6972, 26261, 25656, 2003, 10392, 1010, 8478, 1037, 5866, 5703, 2090, 3376, 1998, 4895, 3678, 6455, 1012, 102])
We see that BERT assigns a unique id to each token (input_ids).
5. Convert the first review's input ids to tokens using convert_ids_to_tokens to see how the text got tokenized.
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])) # first review's tokens
['[CLS]', 'the', 'menu', 'isn', "'", 't', 'the', 'first', 'to', 'sat', '##iri', '##se', 'the', 'rich', 'and', 'their', 'inc', '##omp', '##ete', '##nce', 'and', 'isn', "'", 't', 'saying', 'anything', 'new', 'but', 'that', 'definitely', 'doesn', "'", 't', 'prevent', 'it', 'from', 'being', 'a', 'great', 'satire', 'that', 'poke', '##s', 'fun', 'at', 'everything', 'it', 'can', 'in', 'ways', 'that', 'are', 'often', 'consistently', 'funny', ',', 'playful', 'and', 'extremely', 'st', '##yl', '##ish', '.', 'ralph', 'fi', '##enne', '##s', 'gives', 'a', 'terrific', 'performance', 'full', 'of', 'awkward', '##une', '##ase', 'that', 'only', 'enhance', '##s', 'his', 'commanding', 'screen', 'presence', '.', 'anya', 'taylor', '-', 'joy', 'is', 'a', 'perfect', 'audience', 'sur', '##rogate', 'amongst', 'a', 'sea', '##of', 'deliberately', 'unlike', '##able', 'characters', 'of', 'which', 'the', 'best', 'is', 'nicholas', 'ho', '##ult', 'whose', 'almost', 'too', 'good', 'at', 'making', 'his', 'character', '##hila', '##rio', '##usly', 'pathetic', '.', 'mark', 'my', '##lo', '##d', "'", 's', 'direction', 'is', 'excellent', ',', 'the', 'film', 'has', 'more', 'than', 'enough', 'visual', 'style', 'to', 'match', 'the', 'pre', '##ten', '##tious', '##ness', 'of', 'its', 'characters', 'and', 'is', 'really', 'good', 'at', 'building', 'tension', '.', 'the', 'music', 'by', 'colin', 'ste', '##tson', 'is', 'fantastic', ',', 'striking', 'a', 'unusual', 'balance', 'between', 'beautiful', 'and', 'un', '##ner', '##ving', '.', '[SEP]']
Note that BERT-based models also operate with special tokens:
| Token | Token ID | Meaning |
|---|---|---|
| [CLS] | 101 | Beginning of input |
| [SEP] | 102 | End of input or sentence |
| [MASK] | 103 | Masked tokens the model should predict |
| [PAD] | 0 | Padding |
| [UNK] | 100 | Unknown token not in training data |
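You can look these special tokens up directly on the tokenizer (a small sketch; the ids match the table above):
print(tokenizer.special_tokens_map) # special tokens used by this tokenizer
print(tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]', '[MASK]', '[PAD]', '[UNK]'])) # [101, 102, 103, 0, 100]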
6. Predict the sentiment of the two reviews. In order to do this, import torch, and define the output object using the model, input ids and attention mask.
Now we are ready to do some sentiment prediction. We import torch and call our model with input_ids and attention_mask. The attention mask is a binary tensor indicating the positions of the padded indices so that the model does not attend to them.
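You can inspect the mask directly: positions holding real tokens are 1 and padded positions are 0 (a small sketch using the encoding from above):
print(encoding['attention_mask'][0]) # first (longest) review: no padding, all ones
print(encoding['attention_mask'][1]) # second review: trailing zeros where [PAD] tokens were added (if any)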
# prediction of sentiment
import torch
output = model(input_ids = encoding['input_ids'], attention_mask = encoding['attention_mask'])
print("Predicted logits:\n\n", output['logits']) # logits
Predicted logits: tensor([[-3.1107, 3.2654], [ 1.8161, -1.4221]], grad_fn=<AddmmBackward0>)
print("Predicted probabilities:\n\n", torch.nn.functional.softmax(output['logits'], dim=-1)) # from logits to probabilities
Predicted probabilities: tensor([[0.0017, 0.9983], [0.9622, 0.0378]], grad_fn=<SoftmaxBackward0>)
prediction = torch.argmax(output['logits'], 1) # from logits to binary class
print("Predicted classes:\n", prediction)
Predicted classes: tensor([1, 0])
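The mapping from class index to label is stored in the model configuration, so we can recover the same labels the pipeline reported (a small sketch):
print(model.config.id2label) # {0: 'NEGATIVE', 1: 'POSITIVE'}
print([model.config.id2label[int(i)] for i in prediction]) # ['POSITIVE', 'NEGATIVE']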
How do the output sentiments and probabilities compare to the off-the-shelf sentiment classification pipeline we used in Part 1?
Now that we have classified our two reviews, we might want to explain DistilBERT's predictions using Shapley Additive Explanations (SHAP).
7. Install the shap module, import shap.Explainer, and feed it the sentiment_pipeline model. Pass the two movie reviews as input for the explainer.
Note. Computing the Shapley values for DistilBERT on our two reviews should take about 5 minutes, but it can be very computationally intensive in most real-life applications.
!pip install -q shap
import shap
explainer = shap.Explainer(sentiment_pipeline)
shap_values = explainer([review1, review2])
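Before plotting, you can peek at the returned Explanation object: per review it stores the tokens (.data), one SHAP value per token and output class (.values), and the expected model output used as a baseline (.base_values). A small sketch:
print(shap_values[0].data[:10])    # first few tokens of the first review
print(shap_values[0].values.shape) # (number of tokens, number of output classes)
print(shap_values[0].base_values)  # baseline (expected) model output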
8. A nice thing about the shap module is that it comes with a built-in visualizer. Use shap.plots.text to visualize the SHAP values for the first and the second movie review.
shap.plots.text(shap_values[0]) # first review