In this practical, we are going to work with transformers! As in the first practical, we will use the Drug Review Dataset from drugs.com, which is publicly available at the UCI Machine Learning Repository.
More specifically, we are going to perform sentiment classification of drug reviews using a transformer model, have a look under its hood, try to explain the model predictions using Shapley Additive Explanations (SHAP), a method of explainable AI, and fine-tune the model.
Our BERT variant of choice is DistilBERT, a lightweight transformer whose performance is comparable to Google's BERT base model.
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import transformers
import pandas as pd
import numpy as np
import random
# set the seeds so we might be able to get the same results!
seed = 137
random.seed(seed)
np.random.seed(seed)
transformers.set_seed(seed)
Overview
- We will use an off-the-shelf sentiment analysis pipeline from the Hugging Face transformers module to classify two reviews.
- We will disassemble the sentiment analysis pipeline by performing the same analysis as in Part 1 step-by-step.
- We will open the black box and explore which tokens were most important for DistilBERT's sentiment classification. We do this using Shapley Additive Explanations (SHAP).
- We will fine-tune DistilBERT on a sample of the Drug Review Dataset.
Load Data
Let's first load the data and run the preprocessing code from the first lab. Below is more information on the dataset:
- The Drug Review Dataset provides patient reviews on specific drugs along with related conditions and a 10-star patient rating reflecting the overall patient satisfaction.
- The data was obtained by crawling online pharmaceutical review sites.
- The dataset has shape (161297, 7), i.e. 161,297 entries and 7 features, including the review.
The features are 'drugName', the name of the drug; 'condition', the condition the patient is suffering from; 'review', the patient's review; 'rating', the 10-star patient rating for the drug; 'date', the date of the entry; and 'usefulCount', the number of users who found the review useful.
# load the data
df_train = pd.read_csv("data/drugsComTrain_raw.tsv",sep='\t')
df_test = pd.read_csv("data/drugsComTest_raw.tsv",sep='\t')
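As a quick sanity check (a minimal sketch, using nothing beyond what the description above states), we can confirm the shape and column names of the loaded training data:
# quick sanity check of the shape and columns described above
print(df_train.shape)            # expected (161297, 7)
print(df_train.columns.tolist())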
# Prepare the label for training
df_train['label'] = 'neutral'
df_train.loc[df_train['rating'] >= 6, 'label'] = 'positive'
df_train.loc[df_train['rating'] < 6, 'label'] = 'negative'
# Prepare the label for test
df_test['label'] = 'neutral'
df_test.loc[df_test['rating'] >= 6, 'label'] = 'positive'
df_test.loc[df_test['rating'] < 6, 'label'] = 'negative'
Off-the-shelf Sentiment Analysis Pipeline
Since sentiment analysis is a popular application, there are off-the-shelf pipelines which we can use to quickly classify documents by sentiment. One such pipeline is part of the Hugging Face transformers module.
In this module, there are various pre-trained BERT models available for many different NLP tasks, based on the General Language Understanding Evaluation (GLUE) benchmark resources.
To showcase how to use the sentiment analysis pipeline, we will compare two random drug reviews from our dataset:
review1 = df_train.loc[1, 'review']
review2 = df_train.loc[7456, 'review']
You can skim the reviews.
print(review1)
"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. We have tried many different medications and so far this is the most effective."
print(review2)
"I received the Implanon after giving birth to my first son. I thought this would be great because I forget to take the pill. Well it is great. I am not pregnant. However, I am having the WORST side effect of them all. I AM LOOSING MY HAIR. I have got to get it out. I walked in thinking that this would not happen to me, side effects never effect me. Good luck to the rest of you."
What is your guess of the sentiment of these reviews? On a scale of 1-10, what rating do you think the respective authors gave the drug?
print("The rating for review1 is", df_train.loc[1, 'rating'])
print("The label for review1 is", df_train.loc[1, 'label'])
print("The rating for review2 is", df_train.loc[7456, 'rating'])
print("The label for review2 is", df_train.loc[7456, 'label'])
The rating for review1 is 8.0
The label for review1 is positive
The rating for review2 is 3.0
The label for review2 is negative
1. Set up and fit a sentiment analysis pipeline to predict the sentiment of the two reviews. Define the model as 'distilbert-base-uncased-finetuned-sst-2-english'
Our BERT model will be DistilBERT base uncased. Uncased means that the model disregards casing (upper or lower case). In particular, we use a DistilBERT version which has been fine-tuned for binary sentiment classification on the Stanford Sentiment Treebank (SST-2; Socher et al., 2013) corpus.
sentiment_pipeline = transformers.pipeline("sentiment-analysis", model = 'distilbert-base-uncased-finetuned-sst-2-english')
sentiment_pipeline(review1) # predict sentiment
[{'label': 'POSITIVE', 'score': 0.9062138795852661}]
sentiment_pipeline(review2) # predict sentiment
[{'label': 'NEGATIVE', 'score': 0.9842180013656616}]
For each review, we see the output label and the associated probability.
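The pipeline returns a list of dictionaries, so we can also pull out the label and score programmatically; a minimal sketch:
# run the pipeline on both reviews at once and unpack the results
for review, result in zip([review1, review2], sentiment_pipeline([review1, review2])):
    print(f"{result['label']} (p = {result['score']:.3f}): {review[:60]}...")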
Do you agree with the model's predictions?
Here is the ground truth:
Review 1 is a positive review with a rating of 8/10.
Review 2 is a negative review with a rating of 3/10.
Does this match your human prediction?
Now we are going to show you how to build your own sentiment analysis pipeline from scratch. In practice, you can use the existing one as we just did above, but it is helpful to understand the steps involved in setting up a transformer-based pipeline for other applications you might work on.
Sentiment Analysis Pipeline - Deconstructed
We perform the same sentiment analysis on the same two reviews - this time step-by-step.
2. Define the tokenizer and the model. For the tokenizer, use the pretrained DistilBERT tokenizer, and for the model use distilbert-base-uncased-finetuned-sst-2-english.
tokenizer = transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = transformers.DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
3. Tokenize the review1 and review2 objects. Pad and truncate the sequences, and return PyTorch (pt) tensors. Save the output object as encoding.
encoding = tokenizer([review1, review2], padding = True, truncation = True, return_tensors = 'pt') # tokenize the reviews
BERT and several other transformer models use tokenizers based on WordPiece, a subword tokenization algorithm. The main advantage of a subword tokenizer is that it interpolates between word-based and character-based tokenization. Common words get a slot in the vocabulary, but the tokenizer can fall back to word pieces and individual characters for unknown words.
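To see this fallback in action, we can tokenize a couple of words in isolation (a small illustration; the exact pieces depend on the tokenizer's vocabulary):
# an out-of-vocabulary drug name is split into word pieces, a common word is not
print(tokenizer.tokenize("Intuniv"))     # e.g. ['int', '##uni', '##v']
print(tokenizer.tokenize("medication"))  # typically a single token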
Since batched inputs (our reviews) are of different lengths, they cannot be converted to fixed-size tensors to be fed to the model.
There are two main strategies for solving this problem: padding and truncation.
In order to create rectangular tensors from batches of varying lengths, padding adds a special padding token so that shorter sequences have the same length as either the longest sequence in the batch or the maximum length accepted by the model. Truncation works in the other direction by cutting off long sequences.
- padding = True: pad to the longest sequence in the batch (no padding is applied if you only provide a single sequence).
- truncation = True: truncate to the maximum length specified by the max_length argument, or to the maximum length accepted by the model if no max_length is provided (max_length=None).
The quick check after this list illustrates the effect on our two encoded reviews.
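This is a minimal sketch; it only assumes that the second review is the shorter of the two:
# both reviews are padded to the length of the longer one;
# zeros in the attention mask mark the padded positions
print(encoding['input_ids'].shape)    # (2, length of the longest review)
print(encoding['attention_mask'][1])  # trailing zeros for the shorter second review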
4. Inspect the encoding object by printing the first review's input ids.
print(encoding['input_ids'][0]) # first review's input_ids
tensor([ 101, 1000, 2026, 2365, 2003, 8576, 2083, 2010, 2959, 2733, 1997, 20014, 19496, 2615, 1012, 2057, 2150, 4986, 2043, 2002, 2211, 2023, 2197, 2733, 1010, 2043, 2002, 2318, 2635, 1996, 3284, 13004, 2002, 2097, 2022, 2006, 1012, 2005, 2048, 2420, 1010, 2002, 2071, 6684, 2131, 2041, 1997, 2793, 1010, 2001, 2200, 27987, 2100, 1010, 1998, 7771, 2005, 3053, 1022, 2847, 2006, 1037, 3298, 2188, 2013, 2082, 10885, 1006, 2200, 5866, 2005, 2032, 1012, 1007, 1045, 2170, 2010, 3460, 2006, 6928, 2851, 1998, 2016, 2056, 2000, 6293, 2009, 2041, 1037, 2261, 2420, 1012, 2156, 2129, 2002, 2106, 2012, 2082, 1010, 1998, 2007, 2893, 2039, 1999, 1996, 2851, 1012, 1996, 2197, 2048, 2420, 2031, 2042, 3291, 2489, 1012, 2002, 2003, 2172, 2062, 5993, 3085, 2084, 2412, 1012, 2002, 2003, 2625, 6832, 1006, 1037, 2204, 2518, 1007, 1010, 2625, 27987, 2100, 1012, 2002, 2003, 10397, 2035, 1996, 2477, 2002, 2323, 1012, 3452, 2010, 5248, 2003, 2488, 1012, 2057, 2031, 2699, 2116, 2367, 20992, 1998, 2061, 2521, 2023, 2003, 1996, 2087, 4621, 1012, 1000, 102])
We see that BERT assigns a unique id to each token (input_ids).
5. Convert the first review's input ids to tokens using convert_ids_to_tokens to see how the text was tokenized.
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])) # first review's tokens
['[CLS]', '"', 'my', 'son', 'is', 'halfway', 'through', 'his', 'fourth', 'week', 'of', 'int', '##uni', '##v', '.', 'we', 'became', 'concerned', 'when', 'he', 'began', 'this', 'last', 'week', ',', 'when', 'he', 'started', 'taking', 'the', 'highest', 'dose', 'he', 'will', 'be', 'on', '.', 'for', 'two', 'days', ',', 'he', 'could', 'hardly', 'get', 'out', 'of', 'bed', ',', 'was', 'very', 'crank', '##y', ',', 'and', 'slept', 'for', 'nearly', '8', 'hours', 'on', 'a', 'drive', 'home', 'from', 'school', 'vacation', '(', 'very', 'unusual', 'for', 'him', '.', ')', 'i', 'called', 'his', 'doctor', 'on', 'monday', 'morning', 'and', 'she', 'said', 'to', 'stick', 'it', 'out', 'a', 'few', 'days', '.', 'see', 'how', 'he', 'did', 'at', 'school', ',', 'and', 'with', 'getting', 'up', 'in', 'the', 'morning', '.', 'the', 'last', 'two', 'days', 'have', 'been', 'problem', 'free', '.', 'he', 'is', 'much', 'more', 'agree', '##able', 'than', 'ever', '.', 'he', 'is', 'less', 'emotional', '(', 'a', 'good', 'thing', ')', ',', 'less', 'crank', '##y', '.', 'he', 'is', 'remembering', 'all', 'the', 'things', 'he', 'should', '.', 'overall', 'his', 'behavior', 'is', 'better', '.', 'we', 'have', 'tried', 'many', 'different', 'medications', 'and', 'so', 'far', 'this', 'is', 'the', 'most', 'effective', '.', '"', '[SEP]']
Note that BERT-based models also operate with special tokens:
Token | Token ID | Meaning
---|---|---
[CLS] | 101 | Beginning of input
[SEP] | 102 | End of input or sentence
[MASK] | 103 | Masked tokens the model should predict
[PAD] | 0 | Padding
[UNK] | 100 | Unknown token not in training data
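These ids can also be read directly off the tokenizer object, which is a convenient way to double-check the table above:
# the special tokens and their ids are stored on the tokenizer
print(tokenizer.cls_token, tokenizer.cls_token_id)    # [CLS] 101
print(tokenizer.sep_token, tokenizer.sep_token_id)    # [SEP] 102
print(tokenizer.mask_token, tokenizer.mask_token_id)  # [MASK] 103
print(tokenizer.pad_token, tokenizer.pad_token_id)    # [PAD] 0
print(tokenizer.unk_token, tokenizer.unk_token_id)    # [UNK] 100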
6. Predict the sentiment of the two reviews. In order to do this, import torch and define the output object using the model, input ids and attention mask.
Now we are ready to do some sentiment prediction. We import torch and call our model with input_ids and attention_mask. The attention mask is a binary tensor indicating the positions of the padded indices so that the model does not attend to them.
# prediction of sentiment
import torch
output = model(input_ids = encoding['input_ids'], attention_mask = encoding['attention_mask'])
print("Predicted logits:\n\n", output['logits']) # logits
Predicted logits: tensor([[-1.0547, 1.2136], [ 2.2059, -1.9270]], grad_fn=<AddmmBackward0>)
print("Predicted probabilities:\n\n", torch.nn.functional.softmax(output['logits'], dim=-1)) # from logits to probabilities
Predicted probabilities: tensor([[0.0938, 0.9062], [0.9842, 0.0158]], grad_fn=<SoftmaxBackward0>)
prediction = torch.argmax(output['logits'], 1) # from logits to a binary class
print("Predicted classes:\n", prediction)
Predicted classes: tensor([1, 0])
How do the output sentiments and probabilities compare to the off-the-shelf sentiment classification pipeline we used in the previous section?
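The mapping from class index to label name is stored in the model configuration, so we can reproduce the pipeline-style output from the pieces we just computed; a minimal sketch:
# map predicted class indices back to label names using the model config
id2label = model.config.id2label  # {0: 'NEGATIVE', 1: 'POSITIVE'}
probs = torch.nn.functional.softmax(output['logits'], dim=-1)
for i, pred in enumerate(prediction):
    print(f"Review {i+1}: {id2label[pred.item()]} (p = {probs[i, pred].item():.4f})")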
Feature Importance with SHAP
Now that we have classified our two reviews, we might want to explain DistilBERT's predictions using Shapley Additive Explanations (SHAP).
7. Install the shap module, import shap.Explainer and feed it the sentiment_pipeline model. Pass the two drug reviews as input to the explainer.
Note. The computation of Shapley values for DistilBERT on our two reviews should take a couple of minutes, but it can be very computationally intensive in most real-life applications.
# !pip install -q shap
import shap
explainer = shap.Explainer(sentiment_pipeline)
shap_values = explainer([review1, review2])
PartitionExplainer explainer: 3it [01:40, 50.47s/it]
8. A nice thing about the shap module is that it comes with a built-in visualizer. Use shap.plots.text to visualize the SHAP values for the first and the second drug review.
shap.plots.text(shap_values[0]) # first review
shap.plots.text(shap_values[1]) # second review
Features highlighted in red are increasing the predicted probability, while features highlighted in blue are lowering the predicted probability.
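Besides the plots, the Explanation object also exposes the raw values, so we can rank the tokens of a review numerically. This is a minimal sketch and assumes that shap_values stores per-token values for both output classes, with output names 'NEGATIVE' and 'POSITIVE':
# rank the first review's tokens by the magnitude of their SHAP value for the POSITIVE class
pos_idx = list(shap_values.output_names).index('POSITIVE')
tokens = shap_values[0].data
values = shap_values[0].values[:, pos_idx]
for token, value in sorted(zip(tokens, values), key=lambda tv: abs(tv[1]), reverse=True)[:10]:
    print(f"{token!r}: {value:+.3f}")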
Fine-tuning BERT using the Drug Review Dataset
Now let's fine-tune the off-the-shelf sentiment analysis model on the Drug Review Dataset.
Since the DistilBERT model we are using was fine-tuned on the Stanford Sentiment Treebank, we further fine-tune it on the Drug Review Dataset. In practice, this might not be necessary for this particular application, but it is good to see how it can be done.
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
9. Using the code below, inspect the dataset and sample up to 10% of the train and test splits, because fine-tuning on the entire dataset would be too resource-intensive to run in this practical.
df_train.head()
 | Unnamed: 0 | drugName | condition | review | rating | date | usefulCount | label
---|---|---|---|---|---|---|---|---
0 | 206461 | Valsartan | Left Ventricular Dysfunction | "It has no side effect, I take it in combinati... | 9.0 | May 20, 2012 | 27 | positive |
1 | 95260 | Guanfacine | ADHD | "My son is halfway through his fourth week of ... | 8.0 | April 27, 2010 | 192 | positive |
2 | 92703 | Lybrel | Birth Control | "I used to take another oral contraceptive, wh... | 5.0 | December 14, 2009 | 17 | negative |
3 | 138000 | Ortho Evra | Birth Control | "This is my first time using any form of birth... | 8.0 | November 3, 2015 | 10 | positive |
4 | 35696 | Buprenorphine / naloxone | Opiate Dependence | "Suboxone has completely turned my life around... | 9.0 | November 27, 2016 | 37 | positive |
df_train['label'].hist()
(Output: a histogram of the label distribution.)
df_train = df_train[['label','review']]
df_test = df_test[['label','review']]
# target_map = { 'positive': 1, 'negative': 0, 'neutral': 2}
target_map = { 'positive': 1, 'negative': 0}
df_train['target'] = df_train['label'].map(target_map)
df_test['target'] = df_test['label'].map(target_map)
# Save the data to new csv files. The datasets library expects the data in a specific format,
# which we will load below using load_dataset.
df1 = df_train[['review','target']]
df1.columns = ['sentence','label']
df1.to_csv('train.csv', index = False)
df1 = df_test[['review','target']]
df1.columns = ['sentence','label']
df1.to_csv('test.csv', index = False)
raw_dataset = load_dataset('csv',
data_files = { 'train': 'train.csv',
'test': 'test.csv'})
raw_dataset
DatasetDict({ train: Dataset({ features: ['sentence', 'label'], num_rows: 161297 }) test: Dataset({ features: ['sentence', 'label'], num_rows: 53766 }) })
# Because fine-tuning on the dataset would be too resource-intensive to run in this practical,
# we will work with a randomly sampled 2% of the original train and test dataset size.
drug_data = raw_dataset
drug_data['train'] = drug_data['train'].shuffle(seed=42).select(range(int(0.02*len(raw_dataset['train']))))
drug_data['test'] = drug_data['test'].shuffle(seed=42).select(range(int(0.02*len(raw_dataset['test']))))
drug_data
DatasetDict({ train: Dataset({ features: ['sentence', 'label'], num_rows: 3225 }) test: Dataset({ features: ['sentence', 'label'], num_rows: 1075 }) })
10. Create a preprocessing function to tokenize the text and truncate sequences so they are no longer than DistilBERT's maximum input length. To apply the preprocessing function over the entire dataset, use the Datasets map function. You can speed up map by setting batched=True to process multiple elements of the dataset at once.
# Import AutoTokenizer and create tokenizer object
from transformers import AutoTokenizer
#checkpoint = 'bert-base-cased'
checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_fn(batch):
    return tokenizer(batch['sentence'], truncation = True)
tokenized_data = drug_data.map(tokenize_fn, batched=True)
12. Load the DistilBERT model we used earlier using AutoModelForSequenceClassification and look at a summary of the model.
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 2)
Let's install torchinfo, which is a Python library for getting information about PyTorch models and tensors. It provides a convenient way to inspect the architecture of a PyTorch model, including the shapes and sizes of the tensors passed between layers, as well as the number of parameters and memory usage of each layer.
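If torchinfo is not yet available in your environment, it can be installed the same way we handled shap earlier (assuming a pip-based setup):
# !pip install -q torchinfo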
from torchinfo import summary
summary(model)
================================================================================
Layer (type:depth-idx)                                   Param #
================================================================================
DistilBertForSequenceClassification                      --
├─DistilBertModel: 1-1                                   --
│    └─Embeddings: 2-1                                   --
│    │    └─Embedding: 3-1                               23,440,896
│    │    └─Embedding: 3-2                               393,216
│    │    └─LayerNorm: 3-3                               1,536
│    │    └─Dropout: 3-4                                 --
│    └─Transformer: 2-2                                  --
│    │    └─ModuleList: 3-5                              42,527,232
├─Linear: 1-2                                            590,592
├─Linear: 1-3                                            1,538
├─Dropout: 1-4                                           --
================================================================================
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
================================================================================
13. Use the code below to set up the training arguments and define the metrics.
training_args = TrainingArguments(output_dir='training_dir',
evaluation_strategy='epoch',
save_strategy='epoch',
num_train_epochs=2,
per_device_train_batch_size=10,
per_device_eval_batch_size=10)
def compute_metrics(logits_and_labels):
logits, labels = logits_and_labels
predictions = np.argmax(logits, axis=-1)
acc = np.mean(predictions == labels)
f1 = f1_score(labels, predictions, average = 'micro')
return {'accuracy': acc, 'f1_score': f1}
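Before training, we can sanity-check compute_metrics on made-up logits. Note that for single-label classification, micro-averaged F1 equals accuracy, which is why the two metric columns in the training output below are identical. A small check with dummy values:
# dummy logits: rows are examples, columns are class scores
dummy_logits = np.array([[0.1, 2.0],    # predicted class 1
                         [1.5, -0.3],   # predicted class 0
                         [0.2, 0.9]])   # predicted class 1
dummy_labels = np.array([1, 0, 0])      # the last prediction is wrong
print(compute_metrics((dummy_logits, dummy_labels)))  # accuracy = f1_score = 2/3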
14. Fine-tune the model using the Trainer class, passing it the training arguments defined above.
trainer = Trainer(model,
training_args,
train_dataset = tokenized_data["train"],
eval_dataset = tokenized_data["test"],
tokenizer=tokenizer,
compute_metrics=compute_metrics)
trainer.train()
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Epoch | Training Loss | Validation Loss | Accuracy | F1 Score |
---|---|---|---|---|
1 | No log | 0.342572 | 0.849302 | 0.849302 |
2 | 0.356900 | 0.561416 | 0.852093 | 0.852093 |
TrainOutput(global_step=646, training_loss=0.3219169345064429, metrics={'train_runtime': 613.5648, 'train_samples_per_second': 10.512, 'train_steps_per_second': 1.053, 'total_flos': 337819032738540.0, 'train_loss': 0.3219169345064429, 'epoch': 2.0})
# the code here can be uncommented to clear the memory if you are using your machine for the practical
# import torch
# torch.cuda.empty_cache()
# import gc
# del df1, df_train, df_test
# gc.collect()
# torch.cuda.memory_summary(device=None, abbreviated=False)
15. Load the model you just fine-tuned into a pipeline and classify a sentence of choice.
from transformers import pipeline
fine_tuned_model = pipeline('text-classification',
model = 'training_dir/checkpoint-82')
fine_tuned_model(review1)
[{'label': 'POSITIVE', 'score': 0.9956554174423218}]
fine_tuned_model(review2)
[{'label': 'POSITIVE', 'score': 0.9846466183662415}]
fine_tuned_model("It was awful.")
[{'label': 'NEGATIVE', 'score': 0.9894874095916748}]
fine_tuned_model("The drug has many painful side effects.")
[{'label': 'NEGATIVE', 'score': 0.9243993759155273}]
predictions = fine_tuned_model(raw_dataset['test']['sentence'])
# predictions
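With these predictions we can score the fine-tuned model on the full test split using the metrics imported at the top of the practical. A minimal sketch, assuming each prediction dictionary contains a 'label' of either 'POSITIVE' or 'NEGATIVE':
# convert pipeline labels back to 0/1 and compare with the ground-truth targets
y_pred = [1 if p['label'] == 'POSITIVE' else 0 for p in predictions]
y_true = raw_dataset['test']['label']
print("Accuracy:", accuracy_score(y_true, y_pred))
print("F1 score:", f1_score(y_true, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_true, y_pred))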
Further reading
- How to fine-tune a model: https://huggingface.co/docs/transformers/training
- Fine-Tuning Transformers with custom dataset: https://medium.com/@lokaregns/fine-tuning-transformers-with-custom-dataset-classification-task-f261579ae068