sklearn
1. Upload the "book_reviews.csv" file from your machine, following the Colab documentation. This file contains 10,000 English-language book reviews from Goodreads, with genre, age and star-rating labels. Uploading may take a minute or so.
from google.colab import files
uploaded = files.upload()
Saving book_reviews.csv to book_reviews.csv
2. Load the .csv file into a Pandas dataframe. This makes it easy to access and filter the data (see the short sketch after the preview below).
import pandas as pd
data = pd.read_csv('book_reviews.csv')
# print the first five rows of the dataframe
data.head()
| | rating_no | Unnamed: 1 | id | age_category | book_genre | rating_no.1 | tokenised_text | n_tokens |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 284434 | review_244526687 | Adult | Popular fiction - general | 1.0 | like adult book concept simply ya spoiler exam... | 30 |
| 1 | 1.0 | 30788 | review_528067373 | Adult | Literary fiction | 1.0 | okay read college maybe little biased rating l... | 21 |
| 2 | 1.0 | 84989 | review_3210428778 | Adult | Literary fiction | 1.0 | remember read book club hating probably chance... | 18 |
| 3 | 1.0 | 61511 | review_112612281 | Adult | Literary fiction | 1.0 | yeah star cause know like make like plus depre... | 13 |
| 4 | 1.0 | 112948 | review_380001099 | Adult | Literary fiction | 1.0 | assign book brit lit class read email teacher ... | 22 |
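Because the reviews now live in a dataframe, it is easy to check how the labels are distributed or to filter down to a single genre. A minimal sketch (not part of the original notebook; it only assumes the column names shown in the preview above):
# how many reviews does each genre have?
print(data['book_genre'].value_counts())
# keep only the literary fiction reviews
literary = data[data['book_genre'] == 'Literary fiction']
print(len(literary), 'literary fiction reviews')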
3. Now you can construct the document-term matrix. The CountVectorizer class counts how often each word occurs in each document. Optionally, you can also pass ngram_range as a parameter, to see whether combinations of multiple words are better predictors. Define the output of the fit_transform function on 'tokenised_text' as your feature matrix X, and the genre labels ('book_genre') as the variable y you're trying to predict.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# vectorizer = CountVectorizer(ngram_range=(1,2))
X = vectorizer.fit_transform(data['tokenised_text'])
y = data['book_genre']
To inspect the words in the document-term matrix, you can call get_feature_names_out() on the vectorizer.
words = vectorizer.get_feature_names_out()
print(words[:20])
['aa' 'aaaaaaa' 'aaaaaaaahhhhh' 'aaaaah' 'aaaaand' 'aaaahhhhh' 'aaack' 'aaah' 'aaarrrgggh' 'aagggh' 'aaj' 'ab' 'aback' 'abacus' 'abandon' 'abandone' 'abandoned' 'abandonment' 'abasement' 'abasment']
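To get a feel for the size of the document-term matrix, you can also print its shape and the most frequent terms. This is a small sketch (not from the original notebook) that assumes X and vectorizer are the objects defined above:
import numpy as np
# rows are reviews, columns are vocabulary terms
print(X.shape)
# sum the counts per column to find the ten most frequent words
counts = np.asarray(X.sum(axis=0)).ravel()
top = counts.argsort()[::-1][:10]
print(vectorizer.get_feature_names_out()[top])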
Alternatively, you could also use a TfidfVectorizer: this class counts how often a word occurs in a document and weighs that against how often the word occurs in the whole corpus. This down-weights words that are frequent but not very meaningful. You can play around with different vectorizers to see how they affect your results.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['tokenised_text'])
y = data['book_genre']
4. Now you can define a baseline model: use the DummyClassifier to always predict the most frequent genre in the dataset.
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, y)
dummy_clf.score(X, y)
0.4991
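This baseline accuracy is simply the share of the most frequent genre in the dataset, since the dummy classifier always predicts that genre. You can verify this with a quick check (a sketch, not part of the original notebook):
# the largest proportion here should match the dummy score above
print(data['book_genre'].value_counts(normalize=True).head())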
5. After defining your document-term matrix, you can split the data into train and test sets. Note that random_state is fixed so that the split is the same for everyone in the group; otherwise different random selections would cause slightly different results.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
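If the genre labels are imbalanced, you could additionally pass stratify=y so that each genre appears in the same proportion in the train and test sets. A minimal variation on the call above (not part of the original notebook; note that stratification raises an error for any class with fewer than two examples):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)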
6. Now pick one of the following classifiers:
import numpy as np
# fix the random seed so that everyone gets the same results
np.random.seed(42)
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with three different values of k
knn = KNeighborsClassifier(n_neighbors=3)
model = knn.fit(X_train, y_train)
knn = KNeighborsClassifier(n_neighbors=10)
model2 = knn.fit(X_train, y_train)
knn = KNeighborsClassifier(n_neighbors=100)
model3 = knn.fit(X_train, y_train)
print('accuracy with 3 neighbours:', model.score(X_test, y_test),
'\naccuracy with 10 neighbours:', model2.score(X_test, y_test),
'\naccuracy with 100 neighbours:', model3.score(X_test, y_test))
accuracy with 3 neighbours: 0.5806060606060606
accuracy with 10 neighbours: 0.6466666666666666
accuracy with 100 neighbours: 0.6378787878787879
from sklearn.naive_bayes import MultinomialNB
# multinomial naive Bayes with two different smoothing values
nb = MultinomialNB(alpha=1)
model = nb.fit(X_train, y_train)
nb = MultinomialNB(alpha=10)
model2 = nb.fit(X_train, y_train)
print('accuracy with alpha=1:', model.score(X_test, y_test),
'\naccuracy with alpha=10:', model2.score(X_test, y_test))
accuracy with alpha=1: 0.5163636363636364
accuracy with alpha=10: 0.5051515151515151
from sklearn.svm import LinearSVC
# linear support vector machine; a smaller C means stronger regularization
svm = LinearSVC(C=1.0)
model = svm.fit(X_train, y_train)
svm = LinearSVC(C=0.1)
model2 = svm.fit(X_train, y_train)
print('accuracy with default regularization:', model.score(X_test, y_test),
'\naccuracy with more regularization:', model2.score(X_test, y_test))
accuracy with default regularization: 0.7145454545454546
accuracy with more regularization: 0.6809090909090909
from sklearn.tree import DecisionTreeClassifier
# decision tree, shallow versus unlimited depth
tree = DecisionTreeClassifier(max_depth=5)
model = tree.fit(X_train, y_train)
tree = DecisionTreeClassifier(max_depth=None)
model2 = tree.fit(X_train, y_train)
print('accuracy with maximum tree depth 5:', model.score(X_test, y_test),
'\naccuracy with unlimited tree depth:', model2.score(X_test, y_test))
accuracy with maximum tree depth 5: 0.5896969696969697
accuracy with unlimited tree depth: 0.5281818181818182
from sklearn.ensemble import RandomForestClassifier
# random forest with a small and a larger number of trees
rfc = RandomForestClassifier(n_estimators=3)
model = rfc.fit(X_train, y_train)
rfc = RandomForestClassifier(n_estimators=20)
model2 = rfc.fit(X_train, y_train)
print('accuracy with 3 trees:', model.score(X_test, y_test),
'\naccuracy with 20 trees:', model2.score(X_test, y_test))
accuracy with 3 trees: 0.5384848484848485
accuracy with 20 trees: 0.6236363636363637
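Accuracy alone hides which genres a model confuses with each other. As a sketch (not part of the original exercise), you can print a per-genre breakdown for whichever fitted model you want to inspect, here assumed to be model2:
from sklearn.metrics import classification_report
# per-genre precision, recall and F1 on the test set
y_pred = model2.predict(X_test)
print(classification_report(y_test, y_pred))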
7. Find the parameters which lead to the best results. You can also automate this with GridSearchCV, as shown below.
from sklearn.model_selection import GridSearchCV
# set the search space for the grid search: here, try 2 and 20 nearest neighbours
parameters = {'n_neighbors': [2,20]}
knn = KNeighborsClassifier()
search = GridSearchCV(knn, parameters)
search.fit(X_train, y_train)
# accuracy of the best estimator found by the grid search, evaluated on the test set
print(search.score(X_test, y_test))
# get_params() shows the settings of the GridSearchCV object itself;
# search.best_params_ tells you which n_neighbors value won
search.get_params()
/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_split.py:700: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5. warnings.warn(
0.656969696969697
{'cv': None, 'error_score': nan, 'estimator__algorithm': 'auto', 'estimator__leaf_size': 30, 'estimator__metric': 'minkowski', 'estimator__metric_params': None, 'estimator__n_jobs': None, 'estimator__n_neighbors': 5, 'estimator__p': 2, 'estimator__weights': 'uniform', 'estimator': KNeighborsClassifier(), 'n_jobs': None, 'param_grid': {'n_neighbors': [2, 20]}, 'pre_dispatch': '2*n_jobs', 'refit': True, 'return_train_score': False, 'scoring': None, 'verbose': 0}
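The winning parameter value itself is stored in search.best_params_, and you can widen the grid to every value in a range rather than just the two endpoints. A sketch under those assumptions (re-running it will of course produce different numbers from the ones above):
# search every k from 2 to 20 instead of only the endpoints
parameters = {'n_neighbors': list(range(2, 21))}
search = GridSearchCV(KNeighborsClassifier(), parameters)
search.fit(X_train, y_train)
print(search.best_params_)   # the n_neighbors value that scored best
print(search.best_score_)    # its mean cross-validation accuracy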
8. Try combining multiple classifiers, for instance with a VotingClassifier. Can you get a better result?
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(estimators=[('knn', knn), ('nb', nb), ('svm', svm), ('tree', tree)])
vc.fit(X_train, y_train)
vc.score(X_test, y_test)
0.6196969696969697
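Since the linear SVM scored best on its own, one idea (a sketch, not part of the original notebook; the weights below are an arbitrary choice) is to give it a heavier vote:
# weight the SVM's vote twice as heavily as the others (hard voting by default)
vc = VotingClassifier(estimators=[('knn', knn), ('nb', nb), ('svm', svm), ('tree', tree)],
                      weights=[1, 1, 2, 1])
vc.fit(X_train, y_train)
print(vc.score(X_test, y_test))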