In this practical, we will apply several clustering methods (k-means, hierarchical, and density-based clustering) to news articles in order to group them into clusters. We are going to use the following packages:
library(tm)
library(tidytext)
library(dplyr)
library(proxy)
library(ggplot2)
library(tidyr)
library(dbscan)
# Read the news data set into the workspace and show its first rows
load("data/news_dataset.rda")
head(df_final)

# Heads-up: building the `DocumentTermMatrix` in the next chunk fails on the
# raw data with the error "invalid multibyte string 1512".
# There are two ways around it:
#   1) drop the whole observation
#        df_final <- df_final[-1512, ]
#   2) remove only the string "\xa315.8m" from that specific text
# Option 2 is preferred here, so the document itself is kept.
df_final[1512, 2] <- gsub("\xa315.8m", "", df_final[1512, 2])
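Create a document-term matrix from the news text column (`Content`) of the dataframe. Complete the
following preprocessing steps: convert to lower case, remove numbers, remove punctuation, and remove stop words.
docs <- VCorpus(VectorSource(df_final$Content))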
# Turn the corpus into a document-term matrix. The control list switches on
# lower-casing and the removal of numbers, punctuation and stop words.
dtm <- DocumentTermMatrix(
  docs,
  control = list(
    tolower = TRUE,
    removeNumbers = TRUE,
    removePunctuation = TRUE,
    stopwords = TRUE
  )
)
# Drop A LOT of features: base R copes poorly with very high-dimensional data.
# With sparse = 0.93 the matrix keeps 359 terms; tune this value to the
# memory you have available. The result is converted to a plain matrix.
dtm_cut <- as.matrix(removeSparseTerms(dtm, sparse = 0.93))

# On a very powerful machine you could keep every term instead of dtm_cut:
# dtm <- as.matrix(dtm)

# Optionally inspect word clouds as well:
# library(wordcloud)
# wordcloud(colnames(dtm), dtm[5, ], max.words = 50)

# Pairwise cosine distances between documents; the "cosine" method comes from
# the proxy package's dist()
dist_matrix <- proxy::dist(dtm_cut, method = "cosine")

# If computing the distance matrix takes too long on your computer, load the
# precomputed dist_matrix that is made available to you. It was stored with:
# save(dist_matrix, file = "dist_matrix.RData")
# and can be restored with:
# load("dist_matrix.RData")
Run k-means clustering with 3 clusters using the function `kmeans`. What does the output look like? Also check the
cluster centers.
set.seed(321)
# Fit k-means with three centers on the reduced document-term matrix and
# inspect the structure of the object it returns
text_kmeans_clust3 <- kmeans(x = dtm_cut, centers = 3)
str(text_kmeans_clust3)
## List of 9
## $ cluster : Named int [1:2225] 1 1 1 1 1 1 1 1 1 1 ...
## ..- attr(*, "names")= chr [1:2225] "1" "2" "3" "4" ...
## $ centers : num [1:3, 1:359] 0.127 0.193 0 0.287 0.208 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:359] "â£bn" "â£m" "able" "according" ...
## $ totss : num 353497
## $ withinss : num [1:3] 162523 142640 17042
## $ tot.withinss: num 322205
## $ betweenss : num 31292
## $ size : int [1:3] 1620 600 5
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"

# To print the complete result instead, run:
# text_kmeans_clust3

# Summarise the cluster centers as a tidy table
broom::tidy(text_kmeans_clust3)
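The output of `kmeans` is a list with several pieces of information. The most important are:
- `cluster`: a vector of integers indicating the cluster to which each document is allocated;
- `centers`: a matrix of cluster centers;
- `totss`: the total sum of squares;
- `withinss`: the within-cluster sum of squares, one value per cluster;
- `tot.withinss`: the total within-cluster sum of squares;
- `betweenss`: the between-cluster sum of squares;
- `size`: the number of documents in each cluster.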
Principal Component Analysis (PCA) is a dimensionality-reduction method that is often used on large data sets: it transforms a large set of variables into a smaller one that still contains most of the information in the original set. It is a useful technique for exploratory data analysis, allowing you to better visualize the variation present in a dataset with many variables. Here we obtain a two-dimensional representation of the documents by applying the closely related classical multidimensional scaling (`cmdscale`, also known as principal coordinates analysis) to the cosine distance matrix, and we use these coordinates to plot the clusters.
set.seed(321)
# Project the documents onto two dimensions. Note that cmdscale() performs
# classical multidimensional scaling (principal coordinates analysis) on the
# cosine distance matrix; it is not a PCA of the raw term counts.
points <- cmdscale(dist_matrix, k = 2)
kmeans_clusters <- text_kmeans_clust3$cluster
# Color every document by its k-means cluster. Axes and axis labels are
# suppressed. The former `mai`/`mar` arguments were removed: margins can only
# be set through par(), so plot() ignored them.
plot(points,
     main = "K-Means clustering with 3 clusters",
     col = as.factor(kmeans_clusters),
     xaxt = "n", yaxt = "n",
     xlab = "", ylab = "")
# k-means with four centers; the seed makes the random initialisation
# reproducible
set.seed(321)
text_kmeans_clust4 <- kmeans(dtm_cut, centers = 4)
# Call tidy() through the broom namespace, as done for the 3-cluster model,
# so the kmeans method is found even if broom has not been loaded yet
broom::tidy(text_kmeans_clust4)
kmeans_clusters <- text_kmeans_clust4$cluster
# Scatter plot of the 2-D coordinates colored by cluster. The former
# `mai`/`mar` arguments were removed: margins can only be set through par(),
# so plot() ignored them.
plot(points,
     main = "K-Means clustering with 4 clusters",
     col = as.factor(kmeans_clusters),
     xaxt = "n", yaxt = "n",
     xlab = "", ylab = "")
# k-means with five centers; the seed makes the random initialisation
# reproducible
set.seed(321)
text_kmeans_clust5 <- kmeans(dtm_cut, centers = 5)
# Call tidy() through the broom namespace, as done for the 3-cluster model,
# so the kmeans method is found even if broom has not been loaded yet
broom::tidy(text_kmeans_clust5)
kmeans_clusters <- text_kmeans_clust5$cluster
# Scatter plot of the 2-D coordinates colored by cluster. The former
# `mai`/`mar` arguments were removed: margins can only be set through par(),
# so plot() ignored them.
plot(points,
     main = "K-Means clustering with 5 clusters",
     col = as.factor(kmeans_clusters),
     xaxt = "n", yaxt = "n",
     xlab = "", ylab = "")
# k-means with six centers; the seed makes the random initialisation
# reproducible
set.seed(321)
text_kmeans_clust6 <- kmeans(dtm_cut, centers = 6)
# Call tidy() through the broom namespace, as done for the 3-cluster model,
# so the kmeans method is found even if broom has not been loaded yet
broom::tidy(text_kmeans_clust6)
kmeans_clusters <- text_kmeans_clust6$cluster
# Scatter plot of the 2-D coordinates colored by cluster. The former
# `mai`/`mar` arguments were removed: margins can only be set through par(),
# so plot() ignored them.
plot(points,
     main = "K-Means clustering with 6 clusters",
     col = as.factor(kmeans_clusters),
     xaxt = "n", yaxt = "n",
     xlab = "", ylab = "")
# Agglomerative clustering on the cosine distances, complete linkage.
# Draw the dendrogram with all leaves hanging from the baseline and frame
# the five-cluster solution.
set.seed(321)
hierarcom_clustering <- hclust(d = dist_matrix, method = "complete")
plot(hierarcom_clustering, hang = -1, cex = 0.9)
rect.hclust(tree = hierarcom_clustering, k = 5)

# Same procedure with Ward's minimum-variance linkage ("ward.D2")
set.seed(321)
hierarWard_clustering <- hclust(d = dist_matrix, method = "ward.D2")
plot(hierarWard_clustering, hang = -1, cex = 0.9)
rect.hclust(tree = hierarWard_clustering, k = 5)
set.seed(321)
# Cut the complete-linkage tree into five groups and color the 2-D
# coordinates by group membership
hierar_clusters_com <- cutree(hierarcom_clustering, k = 5)
# The former `mai`/`mar` arguments were removed: margins can only be set
# through par(), so plot() ignored them.
plot(points,
     main = "Hierarchical clustering complete linkage",
     col = as.factor(hierar_clusters_com),
     xaxt = "n", yaxt = "n",
     xlab = "", ylab = "")
set.seed(321)
# Cut the Ward-linkage tree into five groups and color the 2-D coordinates
# by group membership
hierar_clusters_ward <- cutree(hierarWard_clustering, k = 5)
# Title corrected: this plot shows the Ward solution, not complete linkage.
# The former `mai`/`mar` arguments were removed: margins can only be set
# through par(), so plot() ignored them.
plot(points,
     main = "Hierarchical clustering Ward linkage",
     col = as.factor(hierar_clusters_ward),
     xaxt = "n", yaxt = "n",
     xlab = "", ylab = "")
set.seed(321)
# Density-based clustering (HDBSCAN) on the cosine distance matrix; minPts is
# the minimum cluster size
dbscan_clustering <- hdbscan(dist_matrix, minPts = 10)
# Cluster label 0 marks noise points in the dbscan package
dbscan_clusters <- dbscan_clustering$cluster
# as.factor() turns the labels into consecutive color indices, so noise
# points (label 0) are drawn in the first palette color. The former
# `mai`/`mar` arguments were removed: margins can only be set through par(),
# so plot() ignored them.
plot(points,
     main = "Density-based clustering",
     col = as.factor(dbscan_clusters),
     xaxt = "n", yaxt = "n",
     xlab = "", ylab = "")
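In this practical, we learned about:
- preprocessing text into a document-term matrix and computing cosine distances between documents;
- k-means clustering;
- hierarchical clustering with complete and Ward linkage;
- density-based clustering with HDBSCAN;
- visualizing clusters in two dimensions with multidimensional scaling.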
End of Practical