Practical 6: Deep Learning for Multiclass Text Classification¶

Ayoub Bagheri¶


Applied Text Mining - Utrecht Summer School¶

In this practical, we will apply various deep learning models for multiclass text classification. We will work with the famous 20 Newsgroups dataset from the sklearn library and apply deep learning models using the keras library.

The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. It was originally collected by Ken Lang, and it has become a popular data set for experiments in text applications of machine learning techniques.

Also, we will use the keras library, which is a deep learning and neural networks API developed by François Chollet's team, capable of running on top of TensorFlow (Google), Theano, or CNTK (Microsoft).

Today we will use the following libraries. Take care to have them installed!

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder

from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import layers, utils

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Let's get started!¶

1. Load the train and test subsets of the 20 Newsgroups data set from sklearn datasets. Remove the headers, footers and quotes from the news articles when loading the data sets. Use number 321 for random_state.

In order to get faster execution times for this practical we will work on a partial data set with only 5 categories out of the 20 available in the data set: 'rec.sport.hockey', 'talk.politics.mideast', 'soc.religion.christian', 'comp.graphics', and 'sci.med'.

In [ ]:
categories = ['rec.sport.hockey', 'talk.politics.mideast', 'soc.religion.christian', 'comp.graphics', 'sci.med']
In [ ]:
twenty_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                                  categories=categories, shuffle=True, random_state=321)
# type(twenty_train)
In [ ]:
twenty_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                 categories=categories, shuffle=True, random_state=321)

2. Find out about the number of news articles in the train and test sets.

In [ ]:
twenty_train.target_names
Out[ ]:
['comp.graphics',
 'rec.sport.hockey',
 'sci.med',
 'soc.religion.christian',
 'talk.politics.mideast']
In [ ]:
twenty_train.filenames.shape
Out[ ]:
(2941,)
In [ ]:
twenty_test.filenames.shape
Out[ ]:
(1958,)
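
If you also want the number of articles per newsgroup, a quick optional sketch (using the objects loaded above):

# Number of documents per category in the train and test sets
for name, n_train, n_test in zip(twenty_train.target_names,
                                 np.bincount(twenty_train.target),
                                 np.bincount(twenty_test.target)):
    print(name, n_train, n_test)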

3. Convert the train and test sets to dataframes.

In [ ]:
df_train = pd.DataFrame(list(zip(twenty_train.data, twenty_train.target)), columns=['text', 'label'])
df_train.head()
Out[ ]:
text label
0 \nDr. cheghadr bA namakand! They just wait un... 4
1 \n\n\n\n\n:) No...I was one of the lucky ones.... 2
2 \n\n[After a small refresh Hasan got on the tr... 4
3 Before getting excited and implying that I am ... 4
4 I have posted disp135.zip to alt.binaries.pict... 0
In [ ]:
df_test = pd.DataFrame(list(zip(twenty_test.data, twenty_test.target)), columns=['text', 'label'])
df_test.head()
Out[ ]:
text label
0 hi all, Ive applied for the class of 93 at qui... 2
1 :In article <enea1-270493135255@enea.apple.com... 2
2 \nI don't know the answer the to this one, alt... 0
3 \n\nWe here at IBM have the same problem with ... 0
4 \nI was at an Adobe seminar/conference/propaga... 0

Train a neural network with a document-term matrix¶

4. In order to feed predictive deep learning models with text data, you first need to turn the text into vectors of numerical values suitable for statistical analysis. Use the binary representation with TfidfVectorizer and create document-term matrices for train and test (name them X_train and X_test).

In [ ]:
# A function for transforming train or test into tfidf features
def tfidf_features(txt, flag):
    if flag == "train":
        x = tfidf.fit_transform(txt)
    else:
        x = tfidf.transform(txt)
    x = x.astype('float16')
    return x

tfidf = TfidfVectorizer(binary=True)
X_train = tfidf_features(df_train.text.values, flag="train")
X_test = tfidf_features(df_test.text.values, flag="test")

# With CountVectorizer and without the function
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()
# X_train = count_vect.fit_transform(df_train.text.values)
# X_test = count_vect.transform(df_test.text.values)
In [ ]:
X_train.nnz / float(X_train.shape[0])
Out[ ]:
111.5678340700442

The extracted vectors are very sparse, with an average of 111 non-zero components per sample in a more than 37,000-dimensional space (roughly 0.3% non-zero features).

In [ ]:
X_test.nnz / float(X_train.shape[0])
Out[ ]:
75.78748724923496
In [ ]:
# tfidf.vocabulary_
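
To verify the quoted density figure yourself, a small sketch (assuming X_train is the sparse tf-idf matrix created above):

# Fraction of non-zero entries in the document-term matrix; should be roughly 0.3%
density = X_train.nnz / (X_train.shape[0] * X_train.shape[1])
print('{:.2%} of the features are non-zero'.format(density))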

5. Use the LabelEncoder to create y_train and y_test from df_train.label.values and df_test.label.values, respectively.

In [ ]:
# Encode the newsgroup labels as categorical integer values
# and convert them to one-hot vectors (to be fed to the neural network models)
lb = LabelEncoder()
y = lb.fit_transform(df_train.label.values)
y_train = utils.np_utils.to_categorical(y)
In [ ]:
y_train
Out[ ]:
array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)
In [ ]:
y_train.shape
Out[ ]:
(2941, 5)
In [ ]:
y = lb.transform(df_test.label.values)
y_test = utils.np_utils.to_categorical(y)

6. Use the sequential API in keras and create a one-hidden-layer neural network. The first layer will be the input layer with the number of features in your X_train, followed by a single hidden layer and an output layer. Set the number of neurons in the hidden layer to 10 and use relu as the activation function. For the output layer you can use a softmax activation function.

The sequential API (https://www.tensorflow.org/guide/keras/sequential_model) allows you to create models layer by layer. It is limited in that it does not allow you to create models that share layers or have multiple inputs or outputs.

The functional API (https://www.tensorflow.org/guide/keras/functional) gives you a lot more flexibility, as you can define models where layers connect to more than just the previous and next layers; you can connect a layer to (literally) any other layer. As a result, creating complex networks such as Siamese neural networks and residual neural networks becomes possible.

In [ ]:
model = Sequential()
input_dim = X_train.shape[1]  # Number of features
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))
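
For comparison, the same architecture written with the functional API might look like the sketch below (not used further in this practical):

from keras import Input, Model

inputs = Input(shape=(input_dim,))        # input layer with one node per tf-idf feature
hidden = layers.Dense(10, activation='relu')(inputs)
outputs = layers.Dense(5, activation='softmax')(hidden)
functional_model = Model(inputs=inputs, outputs=outputs)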

7. The compile function defines the loss function, the optimizer and the evaluation metrics. Call this function for your neural network model with loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']. Check the summary of the model.

Task | Output type | Last-layer activation | Loss function | Metric(s)
Regression | Numerical | Linear | meanSquaredError (MSE), meanAbsoluteError (MAE) | Same as loss
Classification | Binary | Sigmoid | binary_crossentropy | Accuracy, precision, recall, sensitivity, TPR, FPR, ROC, AUC
Classification | Single label, multiple classes | Softmax | categorical_crossentropy | Accuracy, confusion matrix
Classification | Multiple labels, multiple classes | Sigmoid | binary_crossentropy | Accuracy, precision, recall, sensitivity, TPR, FPR, ROC, AUC
In [ ]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 10)                371490    
                                                                 
 dense_1 (Dense)             (None, 5)                 55        
                                                                 
=================================================================
Total params: 371,545
Trainable params: 371,545
Non-trainable params: 0
_________________________________________________________________

8. Time to train your model! Train your model for 20 epochs. What does batch_size represent?

In [ ]:
history = model.fit(X_train, y_train, epochs=20, batch_size=512)
# model.save_weights("model.h5")
# print("Saved model to disk")
Epoch 1/20
6/6 [==============================] - 1s 25ms/step - loss: 0.6860 - accuracy: 0.2207
Epoch 2/20
6/6 [==============================] - 0s 20ms/step - loss: 0.6661 - accuracy: 0.2370
Epoch 3/20
6/6 [==============================] - 0s 22ms/step - loss: 0.6463 - accuracy: 0.2333
Epoch 4/20
6/6 [==============================] - 0s 23ms/step - loss: 0.6269 - accuracy: 0.2380
Epoch 5/20
6/6 [==============================] - 0s 22ms/step - loss: 0.6084 - accuracy: 0.2543
Epoch 6/20
6/6 [==============================] - 0s 22ms/step - loss: 0.5908 - accuracy: 0.2649
Epoch 7/20
6/6 [==============================] - 0s 23ms/step - loss: 0.5743 - accuracy: 0.2809
Epoch 8/20
6/6 [==============================] - 0s 21ms/step - loss: 0.5589 - accuracy: 0.3002
Epoch 9/20
6/6 [==============================] - 0s 20ms/step - loss: 0.5440 - accuracy: 0.3142
Epoch 10/20
6/6 [==============================] - 0s 22ms/step - loss: 0.5297 - accuracy: 0.3332
Epoch 11/20
6/6 [==============================] - 0s 22ms/step - loss: 0.5161 - accuracy: 0.3502
Epoch 12/20
6/6 [==============================] - 0s 21ms/step - loss: 0.5031 - accuracy: 0.3699
Epoch 13/20
6/6 [==============================] - 0s 19ms/step - loss: 0.4908 - accuracy: 0.4349
Epoch 14/20
6/6 [==============================] - 0s 21ms/step - loss: 0.4790 - accuracy: 0.5236
Epoch 15/20
6/6 [==============================] - 0s 24ms/step - loss: 0.4677 - accuracy: 0.5971
Epoch 16/20
6/6 [==============================] - 0s 21ms/step - loss: 0.4568 - accuracy: 0.6593
Epoch 17/20
6/6 [==============================] - 0s 19ms/step - loss: 0.4464 - accuracy: 0.7198
Epoch 18/20
6/6 [==============================] - 0s 21ms/step - loss: 0.4363 - accuracy: 0.7766
Epoch 19/20
6/6 [==============================] - 0s 26ms/step - loss: 0.4266 - accuracy: 0.8208
Epoch 20/20
6/6 [==============================] - 0s 31ms/step - loss: 0.4171 - accuracy: 0.8592
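
The batch_size is the number of training samples the network processes before it updates its weights. With 2,941 training documents and a batch size of 512, each epoch consists of ceil(2941 / 512) = 6 weight updates, which matches the 6/6 shown in the log above. A quick check:

import math
print(math.ceil(X_train.shape[0] / 512))  # 6 batches (weight updates) per epoch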

Note that if you rerun the fit() method, you will start off with the computed weights from the previous training. Make sure to call clear_session() before you start training the model again:


from keras.backend import clear_session
clear_session()


9. Plot the accuracy and loss of your trained model.

In [ ]:
print(history.history.keys())
plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()
dict_keys(['loss', 'accuracy'])
In [ ]:
# Here we converted the code to a function so we can use it later as well
plt.style.use('ggplot')

def plot_history(history, val=0):
    acc = history.history['accuracy']
    if val == 1:
        val_acc = history.history['val_accuracy'] # we can add a validation set in our fit function with nn
    loss = history.history['loss']
    if val == 1:
        val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training accuracy')
    if val == 1:
        plt.plot(x, val_acc, 'r', label='Validation accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.title('Accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    if val == 1:
        plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.title('Loss')
    plt.legend()
In [ ]:
plot_history(history)

10. Evaluate the accuracy of your trained model on the test set. Compare it with the accuracy on the training set.

In [ ]:
loss, accuracy = model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(loss,accuracy))
62/62 [==============================] - 0s 2ms/step - loss: 0.4427 - accuracy: 0.6977
Test set
  Loss: 0.443
  Accuracy: 0.698

You can already see that the model is overfitting: it reached about 86% accuracy on the training set but only about 70% on the test set. When training a model, you can use separate validation and test sets. What you would usually do is take the model with the highest validation accuracy and then evaluate that model on the test set.
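
If you want to follow that workflow here, one possible sketch is to hold out part of the training data as a validation set and keep the weights that score best on it (the split size, patience, and callback settings below are assumptions, not part of the original practical):

from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

# Hold out 20% of the training data for validation
# (rebuild the model or call clear_session() first so training starts from scratch)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=321)

early_stop = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
history = model.fit(X_tr, y_tr,
                    epochs=20, batch_size=512,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stop])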

The embedding layer¶

Here we want to create a sequential model with an embedding layer as the input layer, followed by dense layers. To do this, we first need to apply the Tokenizer from keras and convert the text data into sequences, which can then be passed into the embedding layer.

11. Use the tokenizer from Keras with 20,000 words and create X_train and X_test sequences.

In [ ]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(df_train.text.values)
In [ ]:
X_train = tokenizer.texts_to_sequences(df_train.text.values)
X_test = tokenizer.texts_to_sequences(df_test.text.values)
In [ ]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index for sequence padding
vocab_size
Out[ ]:
38111

Note that a document-term matrix uses vectors of word counts, and each vector has the same length (the size of the total corpus vocabulary). With the keras tokenizer, the resulting vectors are as long as each text, and the numbers do not denote counts; they are the indices of the words in the dictionary tokenizer.word_index.

In [ ]:
for word in ['the', 'all', 'happy', 'sad']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))
the: 1
all: 35
happy: 1043
sad: 3422
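
To see how these indices make up a sequence, a quick sketch on a toy sentence that uses only the words printed above:

# Each word is replaced by its index in tokenizer.word_index
print(tokenizer.texts_to_sequences(['the all happy sad']))  # expected: [[1, 35, 1043, 3422]]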

12. Use the pad_sequences() function to pad each text sequence with zeros, so that each vector has the same length of 100 words.

In [ ]:
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
In [ ]:
print(X_train[0, :])
[  555 12221    23    75  1507   379    23    16 15253     3   621    63
  5371 10293    73  2745     5   246   686     2 10294     2  2340     4
  3927     6   332  4110  1290     3 12222   131    37     2     1    72
  7235    34   177    21     7  3928     4    33    16    35    86 12223
  3226     4    23   241   753     3 12222     3    47    38  7953    42
     4    93    17   364     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]

Typically it does not matter whether you prepend or append the zeros. In the vector above, the first values are the vocabulary indices of the words in the document, and the rest are zeros from the sequence padding, because this document is shorter than 100 words.
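
If you prefer to pad at the start instead, only the padding argument changes (a sketch; the practical continues with post-padding):

# Put the zeros before the word indices instead of after them
X_train_pre = pad_sequences(tokenizer.texts_to_sequences(df_train.text.values),
                            padding='pre', maxlen=maxlen)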

13. Now it is time to create a neural network model using an embedding layer as input. Take the output of the embedding layer (embedding_dim = 50) and plug it into a Dense layer with 10 neurons, and the relu activation function. In order to do this, you have to add a Flatten layer in between that prepares the sequential input for the Dense layer. Note that in the Embedding layer, input_dim is the size of the vocabulary, output_dim is the size of the embedding vector, and input_length is the length of the text sequence.

In [ ]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(5, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding (Embedding)       (None, 100, 50)           1905550   
                                                                 
 flatten (Flatten)           (None, 5000)              0         
                                                                 
 dense_2 (Dense)             (None, 10)                50010     
                                                                 
 dense_3 (Dense)             (None, 5)                 55        
                                                                 
=================================================================
Total params: 1,955,615
Trainable params: 1,955,615
Non-trainable params: 0
_________________________________________________________________

You can now see that we have 1,905,550 new parameters to train. This number comes from vocab_size (38,111) times embedding_dim (50). The weights of the embedding layer are randomly initialized and then adjusted through backpropagation during training. This model takes the words, in the order in which they appear in the documents, as its input vectors.
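
A quick sanity check of the parameter counts in the summary (pure arithmetic, using the sizes reported above):

print(vocab_size * embedding_dim)        # embedding weights: 38111 * 50 = 1,905,550
print(maxlen * embedding_dim * 10 + 10)  # flattened 100*50 values into 10 neurons, plus 10 biases = 50,010
print(10 * 5 + 5)                        # 10 hidden outputs into 5 output neurons, plus 5 biases = 55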

In [ ]:
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=64)
In [ ]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
Training Accuracy: 0.9772
Testing Accuracy:  0.7594
In [ ]:
plot_history(history, val=1)

Pretrained word embeddings¶

14. Pretrained word embeddings are embeddings learned on one task that are reused for solving another, similar task. These embeddings are trained on large data sets, saved, and then used for solving other tasks. Here, we are going to use the GloVe embeddings, which are precomputed word embeddings trained on a large corpus of text. For this purpose, we wrote the following function to read the pretrained word embeddings and extract the vectors that correspond to the words in our vocabulary. Download one of the GloVe embeddings (e.g. glove.6B.50d.txt) and create the embedding matrix using the provided function. (Link to download: https://nlp.stanford.edu/projects/glove/)

In [36]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix
In [40]:
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt',
                                           tokenizer.word_index, embedding_dim = 50)
In [41]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size
Out[41]:
0.7059641573299048
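
So roughly 70% of our vocabulary is covered by the pretrained vectors. If you are curious which frequent words received no GloVe vector, a possible sketch (rows of embedding_matrix that stayed all-zero):

# Low indices correspond to frequent words; list a few without a pretrained vector
missing = [word for word, idx in tokenizer.word_index.items()
           if idx < 200 and not embedding_matrix[idx].any()]
print(missing[:10])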

15. Build your previous neural network model again, but this time initialize the embedding layer with the weights from the pretrained word embeddings. First set the trainable argument to False so that the embedding layer does not update the word vectors during training, and then set it back to True. How does the performance change?

In [42]:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=False))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(5, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_1 (Embedding)     (None, 100, 50)           1905550   
                                                                 
 global_max_pooling1d (Globa  (None, 50)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense_4 (Dense)             (None, 10)                510       
                                                                 
 dense_5 (Dense)             (None, 5)                 55        
                                                                 
=================================================================
Total params: 1,906,115
Trainable params: 565
Non-trainable params: 1,905,550
_________________________________________________________________
In [43]:
history = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history, val=1)
Training Accuracy: 0.7926
Testing Accuracy:  0.7829

Since the word embeddings are not trained further, the performance is expected to be lower. But let's now see how the model performs if we allow the embedding to be trained, using trainable=True:

In [44]:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(5, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_2 (Embedding)     (None, 100, 50)           1905550   
                                                                 
 global_max_pooling1d_1 (Glo  (None, 50)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 10)                510       
                                                                 
 dense_7 (Dense)             (None, 5)                 55        
                                                                 
=================================================================
Total params: 1,906,115
Trainable params: 1,906,115
Non-trainable params: 0
_________________________________________________________________

Because this is a multiclass classification problem, log loss is used as the loss function (categorical_crossentropy in keras), together with the efficient Adam optimization algorithm. The model is fit for 20 epochs with a batch size of 10 documents.

In [45]:
history = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=False,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history, val=1)
Training Accuracy: 0.9796
Testing Accuracy:  0.8386
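
As an optional last step (not part of the original outputs), you could map the model's output scores back to newsgroup names, for example:

# Predicted category name for the first five test documents
pred = np.argmax(model.predict(X_test), axis=1)
print([twenty_train.target_names[i] for i in pred[:5]])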