In this practical, we will apply RNN and CNN deep learning architectures to text sequence data. We will use the Drug Review Dataset from drugs.com, which is publicly available at the UCI Machine Learning Repository. More information on the dataset:
- The Drug Review Dataset provides patient reviews on specific drugs along with related conditions and a 10-star patient rating reflecting the overall patient satisfaction.
- The data was obtained by crawling online pharmaceutical review sites.
- The dataset has shape (161297, 7), i.e. 161,297 data points (entries) with 7 features, including the review.
- The features are 'drugName' (the name of the drug), 'condition' (the condition the patient is suffering from), 'review' (the patient's review), 'rating' (the 10-star patient rating for the drug), 'date' (the date of the entry), and 'usefulCount' (the number of users who found the review useful).
Let's first load the following libraries. Make sure you have them installed!
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import layers, utils
from scikeras.wrappers import KerasClassifier
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import tensorflow as tf
import seaborn as sns
import pandas as pd
import numpy as np
import random
# set the seeds so we might be able to get the same results!
seed = 137
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
Data exploration and visualization¶
1. Load the train and test sets from the data folder.
df_train = pd.read_csv("data/drugsComTrain_raw.tsv",sep='\t')
df_train.head()
|   | Unnamed: 0 | drugName | condition | review | rating | date | usefulCount |
|---|---|---|---|---|---|---|---|
| 0 | 206461 | Valsartan | Left Ventricular Dysfunction | "It has no side effect, I take it in combinati... | 9.0 | May 20, 2012 | 27 |
| 1 | 95260 | Guanfacine | ADHD | "My son is halfway through his fourth week of ... | 8.0 | April 27, 2010 | 192 |
| 2 | 92703 | Lybrel | Birth Control | "I used to take another oral contraceptive, wh... | 5.0 | December 14, 2009 | 17 |
| 3 | 138000 | Ortho Evra | Birth Control | "This is my first time using any form of birth... | 8.0 | November 3, 2015 | 10 |
| 4 | 35696 | Buprenorphine / naloxone | Opiate Dependence | "Suboxone has completely turned my life around... | 9.0 | November 27, 2016 | 37 |
df_test = pd.read_csv("data/drugsComTest_raw.tsv",sep='\t')
df_test.head()
|   | Unnamed: 0 | drugName | condition | review | rating | date | usefulCount |
|---|---|---|---|---|---|---|---|
| 0 | 163740 | Mirtazapine | Depression | "I've tried a few antidepressants over th... | 10.0 | February 28, 2012 | 22 |
| 1 | 206473 | Mesalamine | Crohn's Disease, Maintenance | "My son has Crohn's disease and has done ... | 8.0 | May 17, 2009 | 17 |
| 2 | 159672 | Bactrim | Urinary Tract Infection | "Quick reduction of symptoms" | 9.0 | September 29, 2017 | 3 |
| 3 | 39293 | Contrave | Weight Loss | "Contrave combines drugs that were used for al... | 9.0 | March 5, 2017 | 35 |
| 4 | 97768 | Cyclafem 1 / 35 | Birth Control | "I have been on this birth control for one cyc... | 9.0 | October 22, 2015 | 4 |
2. Check the number of reviews in train and test sets and the number of drug names.
df_train.shape
(161297, 7)
df_test.shape
(53766, 7)
len(df_train['drugName'].unique().tolist())
3436
len(df_test['drugName'].unique().tolist())
2637
3. Let's explore the data with the following word cloud and bar chart plots.
# word cloud of the most popular drug names
# join the whole column into one string; str(Series) would only keep its truncated preview
wordcloud = WordCloud(max_font_size = 25, max_words = 50, background_color = "white").generate(" ".join(df_train['drugName'].astype(str)))
plt.figure()
plt.imshow(wordcloud, interpolation = "bilinear")
plt.axis("off")
plt.show()
# This barplot shows the top 20 drugs with the 10/10 rating
# Setting the Parameter
sns.set(font_scale = 1.2, style = 'darkgrid')
plt.rcParams['figure.figsize'] = [15, 8]
rating = dict(df_train.loc[df_train.rating == 10, "drugName"].value_counts())
drugname = list(rating.keys())
drug_rating = list(rating.values())
sns_rating = sns.barplot(x = drugname[0:20], y = drug_rating[0:20])
sns_rating.set_title('Top 20 drugs with 10/10 rating')
sns_rating.set_ylabel("Number of Ratings")
sns_rating.set_xlabel("Drug Names")
plt.setp(sns_rating.get_xticklabels(), rotation=90);
4. Convert the rating into three labels: positive, negative and neutral for the sentiment classification task.
df_train['label'] = 'neutral'
df_train.loc[df_train['rating'] >= 6, 'label'] = 'positive'
df_train.loc[df_train['rating'] <= 4, 'label'] = 'negative'
df_train.head()
|   | Unnamed: 0 | drugName | condition | review | rating | date | usefulCount | label |
|---|---|---|---|---|---|---|---|---|
| 0 | 206461 | Valsartan | Left Ventricular Dysfunction | "It has no side effect, I take it in combinati... | 9.0 | May 20, 2012 | 27 | positive |
| 1 | 95260 | Guanfacine | ADHD | "My son is halfway through his fourth week of ... | 8.0 | April 27, 2010 | 192 | positive |
| 2 | 92703 | Lybrel | Birth Control | "I used to take another oral contraceptive, wh... | 5.0 | December 14, 2009 | 17 | neutral |
| 3 | 138000 | Ortho Evra | Birth Control | "This is my first time using any form of birth... | 8.0 | November 3, 2015 | 10 | positive |
| 4 | 35696 | Buprenorphine / naloxone | Opiate Dependence | "Suboxone has completely turned my life around... | 9.0 | November 27, 2016 | 37 | positive |
df_test['label'] = 'neutral'
df_test.loc[df_test['rating'] >= 6, 'label'] = 'positive'
df_test.loc[df_test['rating'] <= 4, 'label'] = 'negative'
df_test.head()
|   | Unnamed: 0 | drugName | condition | review | rating | date | usefulCount | label |
|---|---|---|---|---|---|---|---|---|
| 0 | 163740 | Mirtazapine | Depression | "I've tried a few antidepressants over th... | 10.0 | February 28, 2012 | 22 | positive |
| 1 | 206473 | Mesalamine | Crohn's Disease, Maintenance | "My son has Crohn's disease and has done ... | 8.0 | May 17, 2009 | 17 | positive |
| 2 | 159672 | Bactrim | Urinary Tract Infection | "Quick reduction of symptoms" | 9.0 | September 29, 2017 | 3 | positive |
| 3 | 39293 | Contrave | Weight Loss | "Contrave combines drugs that were used for al... | 9.0 | March 5, 2017 | 35 | positive |
| 4 | 97768 | Cyclafem 1 / 35 | Birth Control | "I have been on this birth control for one cyc... | 9.0 | October 22, 2015 | 4 | positive |
5. More preprocessing.
First run the following lines to make your data ready for the models (we used this code in the previous practical):
# tokenizer from keras
tokenizer = Tokenizer(num_words = 20000)
tokenizer.fit_on_texts(df_train.review.values)
X_train = tokenizer.texts_to_sequences(df_train.review.values)
X_test = tokenizer.texts_to_sequences(df_test.review.values)
vocab_size = len(tokenizer.word_index) + 1 # Adding 1 because of reserved 0 index for sequence padding
# pad sequence
maxlen = 100
X_train = pad_sequences(X_train, padding = 'post', maxlen = maxlen)
X_test = pad_sequences(X_test, padding = 'post', maxlen = maxlen)
# One-hot encoding the labels
lb = LabelEncoder()
y = lb.fit_transform(df_train.label.values)
y_train = utils.to_categorical(y)
y = lb.transform(df_test.label.values)
y_test = utils.to_categorical(y)
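Let's quickly check that the preprocessing did what we expect by decoding one padded sequence back into words. A minimal sketch, reusing the tokenizer fitted above (id 0 is the reserved padding index, and words outside the 20,000-word vocabulary were dropped by the tokenizer):
# decode the first padded training review back into words;
# the padding id 0 has no entry in index_word, so show it as <PAD>
print(" ".join(tokenizer.index_word.get(i, "<PAD>") for i in X_train[0]))
print("X_train shape:", X_train.shape)  # (number of reviews, maxlen)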
Let's use the code below, a function for plotting the training history of our neural network models:
plt.style.use('ggplot')
def plot_history(history, val=0):
    acc = history.history['accuracy']
    if val == 1:
        val_acc = history.history['val_accuracy']  # we can add a validation set in our fit function with nn
    loss = history.history['loss']
    if val == 1:
        val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training accuracy')
    if val == 1:
        plt.plot(x, val_acc, 'r', label='Validation accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.title('Accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    if val == 1:
        plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.title('Loss')
    plt.legend()
Recurrent neural networks¶
A recurrent neural network (RNN) is a natural generalization of feed-forward neural networks to sequence data such as text. In contrast to a feed-forward neural network, it accepts a new input at every time step and carries a hidden state forward from step to step. Long short-term memory (LSTM) networks are a variant of RNNs: the LSTM introduces gating mechanisms to decide what should be remembered and what should be forgotten when learning from text documents.
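To make the tensor shapes concrete before building the model, here is a minimal sketch with toy dimensions (illustrative numbers, not the ones used below): an LSTM layer consumes a (batch, timesteps, features) tensor and by default returns only the final hidden state of each sequence.
demo_input = tf.random.normal((2, 7, 16))  # batch of 2 sequences, 7 time steps, 16 features per step
print(layers.LSTM(4)(demo_input).shape)    # (2, 4): one final state per sequence
print(layers.LSTM(4, return_sequences = True)(demo_input).shape)  # (2, 7, 4): a state at every step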
6. Build a neural network model with an LSTM layer of 100 units. As before, the first layer should be an embedding layer, then the LSTM layer, a Dense layer, and the output Dense layer for the 3 sentiment classes. Compile the model and print its summary.
embedding_dim = 100
model_rnn = Sequential()
model_rnn.add(layers.Embedding(vocab_size, embedding_dim, input_length = maxlen))
model_rnn.add(layers.LSTM(100, dropout = 0.2, recurrent_dropout = 0.2))
model_rnn.add(layers.Dense(10, activation = 'relu'))
model_rnn.add(layers.Dense(3, activation = 'softmax'))
model_rnn.compile(optimizer = 'adam',
                  loss = 'categorical_crossentropy',
                  metrics = ['accuracy'])
model_rnn.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 embedding (Embedding)       (None, 100, 100)          5143000
 lstm (LSTM)                 (None, 100)               80400
 dense (Dense)               (None, 10)                1010
 dense_1 (Dense)             (None, 3)                 33
=================================================================
Total params: 5224443 (19.93 MB)
Trainable params: 5224443 (19.93 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
The first layer is the Embedding layer, which uses vectors of length 100 to represent each word. The next layer is the LSTM layer with 100 memory units (smart neurons!). Finally, because this is a three-class classification problem, we use a Dense output layer with 3 neurons and a softmax activation function to produce a probability for each of the three sentiment classes.
7. Fit the model for 5 epochs.
history_rnn = model_rnn.fit(X_train, y_train,
                            epochs = 5,
                            verbose = True,
                            validation_split = 0.1,
                            batch_size = 64)
Epoch 1/5
2269/2269 [==============================] - 322s 141ms/step - loss: 0.6573 - accuracy: 0.7354 - val_loss: 0.6459 - val_accuracy: 0.7620
Epoch 2/5
2269/2269 [==============================] - 318s 140ms/step - loss: 0.5227 - accuracy: 0.8063 - val_loss: 0.4845 - val_accuracy: 0.8244
Epoch 3/5
2269/2269 [==============================] - 340s 150ms/step - loss: 0.4252 - accuracy: 0.8478 - val_loss: 0.4525 - val_accuracy: 0.8359
Epoch 4/5
2269/2269 [==============================] - 351s 155ms/step - loss: 0.3698 - accuracy: 0.8691 - val_loss: 0.4266 - val_accuracy: 0.8474
Epoch 5/5
2269/2269 [==============================] - 348s 153ms/step - loss: 0.3216 - accuracy: 0.8869 - val_loss: 0.4210 - val_accuracy: 0.8520
8. Evaluate the accuracy of your model on the test set and plot the history of the fit.
loss, accuracy = model_rnn.evaluate(X_test, y_test, verbose = True)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history_rnn, val = 1)
1681/1681 [==============================] - 23s 13ms/step - loss: 0.4144 - accuracy: 0.8542
Testing Accuracy: 0.8542
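To inspect an individual prediction, the softmax output can be mapped back to a string label with the LabelEncoder fitted earlier. A small sketch using the first test review:
probs = model_rnn.predict(X_test[:1])  # shape (1, 3): one probability per sentiment class
print(probs)
print(lb.inverse_transform(np.argmax(probs, axis = 1)))  # e.g. ['positive']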
Convolutional neural networks¶
Convolutional neural networks, also called convnets, are one of the most exciting developments in machine learning in recent years. They have revolutionized image classification and computer vision by extracting features from images and using them in neural networks. The properties that made them useful in image processing also make them handy for sequence processing. When you work with sequential data such as text, you use one-dimensional convolutions, but the idea and the application stay the same.
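As a quick illustration of the one-dimensional case: a Conv1D layer slides a window along the time axis, and without padding the output is shorter by kernel_size - 1 positions. A minimal sketch with toy dimensions:
demo_input = tf.random.normal((2, 100, 16))  # batch of 2 sequences, 100 steps, 16 features per step
demo_out = layers.Conv1D(filters = 8, kernel_size = 5, activation = 'relu')(demo_input)
print(demo_out.shape)                               # (2, 96, 8): 100 - 5 + 1 window positions
print(layers.GlobalMaxPooling1D()(demo_out).shape)  # (2, 8): maximum over the time axis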
9. Build a neural network model with a convolutional layer (Conv1D) of 128 filters and a window size of 5. As before, the first layer should be an embedding layer, then the CNN layers, a Dense layer, and the output Dense layer for the 3 sentiment classes. Do you also need a pooling layer? Compile the model and print its summary.
model_cnn = Sequential()
model_cnn.add(layers.Embedding(vocab_size, embedding_dim, input_length = maxlen))
model_cnn.add(layers.Conv1D(128, 5, activation = 'relu'))
model_cnn.add(layers.GlobalMaxPooling1D())
model_cnn.add(layers.Dense(10, activation = 'relu'))
model_cnn.add(layers.Dense(3, activation = 'softmax'))
model_cnn.compile(optimizer = 'adam',
                  loss = 'categorical_crossentropy',  # categorical, since the labels are one-hot encoded over 3 classes
                  metrics = ['accuracy'])
model_cnn.summary()
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_1 (Embedding) (None, 100, 100) 5143000 conv1d (Conv1D) (None, 96, 128) 64128 global_max_pooling1d (Glob (None, 128) 0 alMaxPooling1D) dense_2 (Dense) (None, 10) 1290 dense_3 (Dense) (None, 3) 33 ================================================================= Total params: 5208451 (19.87 MB) Trainable params: 5208451 (19.87 MB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________
10. Fit the model for 5 epochs, and evaluate the accuracy of your model on the test data. Plot the history of the fit.
history_cnn = model_cnn.fit(X_train, y_train,
                            epochs = 5,
                            verbose = True,
                            validation_split = 0.1,
                            batch_size = 64)
Epoch 1/5
2269/2269 [==============================] - 96s 42ms/step - loss: 0.3115 - accuracy: 0.8016 - val_loss: 0.2689 - val_accuracy: 0.8365
Epoch 2/5
2269/2269 [==============================] - 92s 40ms/step - loss: 0.2120 - accuracy: 0.8792 - val_loss: 0.2328 - val_accuracy: 0.8667
Epoch 3/5
2269/2269 [==============================] - 91s 40ms/step - loss: 0.1443 - accuracy: 0.9185 - val_loss: 0.2198 - val_accuracy: 0.8791
Epoch 4/5
2269/2269 [==============================] - 92s 40ms/step - loss: 0.0925 - accuracy: 0.9464 - val_loss: 0.2446 - val_accuracy: 0.8877
Epoch 5/5
2269/2269 [==============================] - 91s 40ms/step - loss: 0.0567 - accuracy: 0.9708 - val_loss: 0.2642 - val_accuracy: 0.8923
loss, accuracy = model_cnn.evaluate(X_test, y_test, verbose = True)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history_cnn, val = 1)
1681/1681 [==============================] - 5s 3ms/step - loss: 0.2556 - accuracy: 0.8944
Testing Accuracy: 0.8944
Bidirectional recurrent neural networks¶
Bidirectional recurrent neural networks process the input sequence in both the forward and backward directions. This time we will build our architecture on GRU (Gated Recurrent Unit) cells. Introduced by Cho et al. in 2014, the GRU aims to solve the vanishing gradient problem that comes with a standard recurrent neural network. GRUs are a variation on the LSTM: they were designed to be simpler and faster, and in most cases they produce equally good results, so there is no clear winner between the two.
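The Bidirectional wrapper runs one copy of the cell forward and one backward over the sequence and concatenates their final states, so the output dimension doubles. A minimal sketch with toy dimensions (this is also why the bidirectional layer below outputs 600 = 2 x 300 values):
demo_input = tf.random.normal((2, 7, 16))
print(layers.GRU(4)(demo_input).shape)                        # (2, 4)
print(layers.Bidirectional(layers.GRU(4))(demo_input).shape)  # (2, 8): forward + backward states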
11. Repeat the analysis with a Bidirectional recurrent neural network using GRU.
model_brnn = Sequential()
model_brnn.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model_brnn.add(layers.Bidirectional(layers.GRU(300)))
model_brnn.add(layers.Dense(10, activation = 'relu'))
model_brnn.add(layers.Dense(3, activation = 'softmax'))
model_brnn.compile(optimizer = 'adam',
                   loss = 'categorical_crossentropy',
                   metrics = ['accuracy'])
model_brnn.summary()
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= embedding_2 (Embedding) (None, 100, 100) 5143000 bidirectional (Bidirection (None, 600) 723600 al) dense_4 (Dense) (None, 10) 6010 dense_5 (Dense) (None, 3) 33 ================================================================= Total params: 5872643 (22.40 MB) Trainable params: 5872643 (22.40 MB) Non-trainable params: 0 (0.00 Byte) _________________________________________________________________
history_brnn = model_brnn.fit(X_train, y_train,
                              epochs = 5,
                              verbose = True,
                              validation_split = 0.1,
                              batch_size = 64)
Epoch 1/5
2269/2269 [==============================] - 502s 220ms/step - loss: 0.5030 - accuracy: 0.8113 - val_loss: 0.4523 - val_accuracy: 0.8305
Epoch 2/5
2269/2269 [==============================] - 489s 216ms/step - loss: 0.3699 - accuracy: 0.8674 - val_loss: 0.4084 - val_accuracy: 0.8507
Epoch 3/5
2269/2269 [==============================] - 495s 218ms/step - loss: 0.2903 - accuracy: 0.8946 - val_loss: 0.3969 - val_accuracy: 0.8633
Epoch 4/5
2269/2269 [==============================] - 490s 216ms/step - loss: 0.2200 - accuracy: 0.9208 - val_loss: 0.3946 - val_accuracy: 0.8668
Epoch 5/5
2269/2269 [==============================] - 488s 215ms/step - loss: 0.1621 - accuracy: 0.9424 - val_loss: 0.4444 - val_accuracy: 0.8717
loss, accuracy = model_brnn.evaluate(X_test, y_test, verbose = True)
print("Testing Accuracy: {:.4f}".format(accuracy))
plot_history(history_brnn, val = 1)
1681/1681 [==============================] - 68s 41ms/step - loss: 0.4249 - accuracy: 0.8758
Testing Accuracy: 0.8758
Hyperparameter optimization (optional)¶
One crucial step of deep learning and working with neural networks is hyperparameter optimization. Hyperparameters are parameters chosen by the algorithm designer, and tuning them is very important! One popular method for hyperparameter optimization is grid search: it takes lists of parameter values and runs the model with every combination it can form. It is the most thorough way, but also the most computationally heavy. Another common method, random search, which you'll see in action here, simply tries random combinations of parameters.
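The difference between the two strategies is easy to see by enumerating candidate settings directly. A small sketch using scikit-learn's ParameterGrid and ParameterSampler utilities with an illustrative toy grid:
from sklearn.model_selection import ParameterGrid, ParameterSampler
demo_grid = {'num_filters': [32, 64], 'kernel_size': [3, 5, 7]}
print(list(ParameterGrid(demo_grid)))  # grid search: all 2 x 3 = 6 combinations
print(list(ParameterSampler(demo_grid, n_iter = 2, random_state = seed)))  # random search: 2 random combinations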
12. Write a function for creating your CNN-based model which has the number of filters, kernel size, and embedding size as input arguments. Name your function `create_model`. For the rest, follow the architecture of your previous CNN model.
def create_model(num_filters, kernel_size, embedding_dim):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length = 100))
    model.add(layers.Conv1D(num_filters, kernel_size, activation = 'relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation = 'relu'))
    model.add(layers.Dense(3, activation = 'softmax'))
    model.compile(optimizer = 'adam',
                  loss = 'categorical_crossentropy',  # categorical, since the labels are one-hot encoded over 3 classes
                  metrics = ['accuracy'])
    return model
13. A dictionary in Python is a collection of key-value pairs (insertion-ordered since Python 3.7). Use the dict structure to define your hyperparameters for the CNN model. You can include the number of filters and the kernel size as examples.
param_grid = dict(num_filters = [32, 64],
                  kernel_size = [3, 5])
When constructing the search you must provide this dictionary of hyperparameters to evaluate via the `param_distributions` argument (for grid search it is called `param_grid`). It maps each model parameter name to an array of values to try.
14. Use the `KerasClassifier` from `scikeras` to create your model with the `create_model` function, 5 epochs, and a `batch_size` of 64.
# Wrap the Keras model for use with scikit-learn.
# Hyperparameters to be tuned need to be added as arguments to KerasClassifier from scikeras (https://adriangb.com/scikeras/stable/migration.html#default-arguments-in-build-fn-model)
model = KerasClassifier(model = create_model,
                        epochs = 5,
                        batch_size = 64,
                        num_filters = 32,    # hyperparameter 1
                        kernel_size = 3,     # hyperparameter 2
                        embedding_dim = 50,  # hyperparameter 3
                        verbose = True)
15. Time to call the `RandomizedSearchCV` function. Use your model, your selected grid of hyperparameters, and 5-fold cross-validation.
grid = RandomizedSearchCV(estimator = model,
                          param_distributions = param_grid,
                          cv = 5,
                          n_jobs = -1,
                          verbose = 1,
                          n_iter = 2)
As you see in this function, there are more input arguments you can set, including `n_jobs` and `n_iter`. By default, the random search (or grid search) will only use one thread; by setting the `n_jobs` argument of the `RandomizedSearchCV` (or `GridSearchCV`) constructor to -1, the process will use all cores on your machine.
It is also worth mentioning that, by default, accuracy is the score that is optimized, but other scores can be specified via the `scoring` argument of the constructor.
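For example, a variant of the search that optimizes macro-averaged F1 instead of accuracy could look like the sketch below ('f1_macro' is a standard scikit-learn scorer name; this variant is illustrative and not run here):
grid_f1 = RandomizedSearchCV(estimator = model,
                             param_distributions = param_grid,
                             scoring = 'f1_macro',  # optimize macro F1 instead of accuracy
                             cv = 5,
                             n_iter = 2)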
16. Fit your grid on `X_train` and `y_train`.
grid_result = grid.fit(X_train, y_train)
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Epoch 1/5
2521/2521 [==============================] - 49s 19ms/step - loss: 0.3137 - accuracy: 0.8123
Epoch 2/5
2521/2521 [==============================] - 48s 19ms/step - loss: 0.2236 - accuracy: 0.8721
Epoch 3/5
2521/2521 [==============================] - 48s 19ms/step - loss: 0.1717 - accuracy: 0.9023
Epoch 4/5
2521/2521 [==============================] - 50s 20ms/step - loss: 0.1302 - accuracy: 0.9228
Epoch 5/5
2521/2521 [==============================] - 47s 19ms/step - loss: 0.0984 - accuracy: 0.9414
17. Find the best score and the best values for the hyperparameters.
print(grid_result.best_score_)
print(grid_result.best_params_)
0.8723844470072674
{'num_filters': 64, 'kernel_size': 3}
The `best_score_` attribute provides access to the best score observed during the optimization procedure, and the `best_params_` attribute shows the combination of parameters that achieved the best result.
18. Evaluate the performance on the test set.
test_accuracy = grid.score(X_test, y_test)
test_accuracy
841/841 [==============================] - 2s 2ms/step
0.8807610757727932
Now you can use the best hyperparameter values to build your final model.
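For instance, a minimal sketch of that final step, reusing `create_model` with the best values found above (`embedding_dim = 50` matches the default passed to `KerasClassifier` earlier; this final fit is not executed here):
best = grid_result.best_params_
final_model = create_model(num_filters = best['num_filters'],
                           kernel_size = best['kernel_size'],
                           embedding_dim = 50)
final_model.fit(X_train, y_train, epochs = 5, batch_size = 64, verbose = True)
final_model.evaluate(X_test, y_test)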