In this practical, we are first going to get acquainted with Python in Google Colab, and then we will do some text preprocessing! Are you looking for Python documentation to refresh your knowledge of programming? If so, you can check https://docs.python.org/3/reference/
Google Colaboratory, or "Colab" for short, allows you to write and execute Python in your browser.
Colab notebooks are Jupyter notebooks that are hosted by Colab. You can find more detailed introductions to Colab here, but we will also cover the basics.
Here we are going to introduce Python and Google Colab a bit. If you are familiar with Python, start with question 11.
1. Open Colab and create a new empty notebook to work with Python 3!
Go to https://colab.research.google.com/ and login with your account. Then click on "File $\rightarrow$ New notebook".
If you want to insert a new code chunk below the cell you are currently in, press Alt + Enter.
If you want to stop your code from running in Colab, press Ctrl + M I or simply click the stop button.
To select all the code of a particular cell, press Ctrl + A; press Ctrl + X to cut the entire cell code. Now the cell is empty and can be deleted by using Ctrl + M D or by pressing the delete button. You can paste your code in a new code chunk and adjust it.
NB: On Macbooks, use Cmd instead of Ctrl in shortcuts.
2. Text is also known as a string variable, or as an array of characters. Create a variable `a` with the text value of "Hello @Text Mining World! I'm here to learn everything, right?", and then print it!
a = "Hello @Text Mining World! I'm here to learn everything, right?"
a
"Hello @Text Mining World! I'm here to learn everything, right?"
3. Since this is an array, print the first and last character of your variable.
print(a[0]) # if you do not use the print function, only the last expression in the cell will be displayed
print(a[61])
l = len(a)
print("Length of your string is: ", l)
print(a[l-1])
H
?
Length of your string is:  62
?
4. Use the `!pip install` command and install the packages: `numpy`, `nltk`, `gensim`, and `spacy`.
Generally, you only need to install a package once on your computer and can then simply load it; in Colab, however, you may need to reinstall a package whenever you reconnect to a runtime.
!pip install -q numpy
!pip install -q nltk
!pip install -q gensim
!pip install -q spacy
5. Import (load) the `nltk` package and use the function `lower()` to convert the characters in string `a` to their lowercase form and save it into a new variable `b`.
import nltk
b = a.lower()
b
"hello @text mining world! i'm here to learn everything, right?"
NB: `nltk` comes with many corpora, toy grammars, trained models, etc. A complete list is posted at: http://nltk.org/nltk_data/. To install the data, after installing `nltk`, you could use the `nltk.download()` data downloader. We will make use of this in Question 8.
6. Use the `string` package to print the list of punctuations.
Punctuations can separate characters, words, phrases, or sentences. In some applications they are very important to the task at hand; in others, they are redundant and should be removed!
import string
print(string.punctuation)
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
7. Use the punctuation list to remove the punctuations from the lowercase form of our example string `a`. Name your variable `c`.
# Remember there are many ways to remove punctuations! This is only one of them:
c = "".join([char for char in b if char not in string.punctuation])
print(c)
hello text mining world im here to learn everything right
8. Use the `word_tokenize()` function from `nltk` and tokenize string `b`. Compare that with the tokenization of string `c`.
from nltk.tokenize import word_tokenize
print(word_tokenize(b))
print(word_tokenize(c))
# You might need to download Punkt Tokenizer Models
# In this case, run the code nltk.download('punkt')
['hello', '@', 'text', 'mining', 'world', '!', 'i', "'m", 'here', 'to', 'learn', 'everything', ',', 'right', '?']
['hello', 'text', 'mining', 'world', 'im', 'here', 'to', 'learn', 'everything', 'right']
We see that the main difference is in the punctuations; however, we also see that some words are now combined together (e.g., "i'm" became "im") in the tokenization of string `c`.
9. Use the function `RegexpTokenizer()` from `nltk` to tokenize the string `b` whilst removing punctuations. This way you will avoid unnecessary concatenations.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize(b)
['hello', 'text', 'mining', 'world', 'i', 'm', 'here', 'to', 'learn', 'everything', 'right']
With this tokenizer, you get similar output as with tokenizing the string `c`.
10. Use the function `sent_tokenize()` from the `nltk` package and split the string `b` into sentences. Compare that with the sentence tokenization of string `c`.
from nltk.tokenize import sent_tokenize
print(sent_tokenize(b))
print(sent_tokenize(c))
['hello @text mining world!', "i'm here to learn everything, right?"]
['hello text mining world im here to learn everything right']
An obvious question is why sentence tokenization is needed when we already have word tokenization. Imagine you need to count the average number of words per sentence. How would you calculate it? For such a task, you need both the NLTK sentence tokenizer and the NLTK word tokenizer to calculate the ratio. Such numeric output can serve as an important feature for machine learning.
Pre-processing a dataset is similar to pre-processing simple text strings. First, we need to get some data. For this, we can use our own dataset, scrape data from the web, or use social media APIs. There are also websites with publicly available datasets.
Here, we want to analyze and pre-process the Taylor Swift song lyrics data from all her albums. The dataset can be downloaded from the course website or alternatively from Kaggle.
Upload `taylor_swift_lyrics.csv` to Google Colab. You can do this by clicking on the Files button on the far left side of Colab and dragging and dropping the file there, or by clicking the upload button. Alternatively, you can mount Google Drive and upload the dataset there.
11. Read the `taylor_swift_lyrics.csv` dataset. Check the dataframe using the `head()` and `tail()` functions.
import pandas as pd
ts_lyrics = pd.read_csv("taylor_swift_lyrics.csv")
ts_lyrics.head()
 | Artist | Album | Title | Lyrics |
---|---|---|---|---|
0 | Taylor Swift | Taylor Swift | Tim McGraw | He said the way my blue eyes shinx\nPut those ... |
1 | Taylor Swift | Taylor Swift | Picture to Burn | State the obvious, I didn't get my perfect fan... |
2 | Taylor Swift | Taylor Swift | Teardrops on my Guitar | Drew looks at me,\nI fake a smile so he won't ... |
3 | Taylor Swift | Taylor Swift | A Place in This World | I don't know what I want, so don't ask me\n'Ca... |
4 | Taylor Swift | Taylor Swift | Cold As You | You have a way of coming easily to me\nAnd whe... |
ts_lyrics.tail()
 | Artist | Album | Title | Lyrics |
---|---|---|---|---|
127 | Taylor Swift | folklore | mad woman | What did you think I'd say to that?\nDoes a sc... |
128 | Taylor Swift | folklore | epiphany | Keep your helmet\nKeep your life, son\nJust a ... |
129 | Taylor Swift | folklore | betty | Betty, I won't make assumptions about why you ... |
130 | Taylor Swift | folklore | peace | Our coming of age has come and gone\nSuddenly ... |
131 | Taylor Swift | folklore | hoax | My only one\nMy smoking gun\nMy eclipsed sun\n... |
ts_lyrics.iloc[0]
Artist                                  Taylor Swift
Album                                   Taylor Swift
Title                                     Tim McGraw
Lyrics    He said the way my blue eyes shinx\nPut those ...
Name: 0, dtype: object
ts_lyrics.head(1)
 | Artist | Album | Title | Lyrics |
---|---|---|---|---|
0 | Taylor Swift | Taylor Swift | Tim McGraw | He said the way my blue eyes shinx\nPut those ... |
12. Add a new column to the dataframe and name it `Preprocessed Lyrics`, then fill the column with the preprocessed text including the steps in this and the following questions. First replace the `\n` sequences with a space character.
import re
def remove_linebreaks(text):
    """custom function to remove the line breaks"""
    return re.sub(r'\n', ' ', text)
ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Lyrics"].apply(lambda text: remove_linebreaks(text))
ts_lyrics.head()
 | Artist | Album | Title | Lyrics | Preprocessed Lyrics |
---|---|---|---|---|---|
0 | Taylor Swift | Taylor Swift | Tim McGraw | He said the way my blue eyes shinx\nPut those ... | He said the way my blue eyes shinx Put those G... |
1 | Taylor Swift | Taylor Swift | Picture to Burn | State the obvious, I didn't get my perfect fan... | State the obvious, I didn't get my perfect fan... |
2 | Taylor Swift | Taylor Swift | Teardrops on my Guitar | Drew looks at me,\nI fake a smile so he won't ... | Drew looks at me, I fake a smile so he won't s... |
3 | Taylor Swift | Taylor Swift | A Place in This World | I don't know what I want, so don't ask me\n'Ca... | I don't know what I want, so don't ask me 'Cau... |
4 | Taylor Swift | Taylor Swift | Cold As You | You have a way of coming easily to me\nAnd whe... | You have a way of coming easily to me And when... |
13. Write another custom function to remove the punctuations. You can use the previous method or make use of `str.maketrans()` together with `translate()`.
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', string.punctuation))
ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].apply(lambda text: remove_punctuation(text))
ts_lyrics.head()
 | Artist | Album | Title | Lyrics | Preprocessed Lyrics |
---|---|---|---|---|---|
0 | Taylor Swift | Taylor Swift | Tim McGraw | He said the way my blue eyes shinx\nPut those ... | He said the way my blue eyes shinx Put those G... |
1 | Taylor Swift | Taylor Swift | Picture to Burn | State the obvious, I didn't get my perfect fan... | State the obvious I didnt get my perfect fanta... |
2 | Taylor Swift | Taylor Swift | Teardrops on my Guitar | Drew looks at me,\nI fake a smile so he won't ... | Drew looks at me I fake a smile so he wont see... |
3 | Taylor Swift | Taylor Swift | A Place in This World | I don't know what I want, so don't ask me\n'Ca... | I dont know what I want so dont ask me Cause I... |
4 | Taylor Swift | Taylor Swift | Cold As You | You have a way of coming easily to me\nAnd whe... | You have a way of coming easily to me And when... |
14. Change all the characters to their lowercase form. Think about why and when we need this step in our analysis.
ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].str.lower()
ts_lyrics.head()
 | Artist | Album | Title | Lyrics | Preprocessed Lyrics |
---|---|---|---|---|---|
0 | Taylor Swift | Taylor Swift | Tim McGraw | He said the way my blue eyes shinx\nPut those ... | he said the way my blue eyes shinx put those g... |
1 | Taylor Swift | Taylor Swift | Picture to Burn | State the obvious, I didn't get my perfect fan... | state the obvious i didnt get my perfect fanta... |
2 | Taylor Swift | Taylor Swift | Teardrops on my Guitar | Drew looks at me,\nI fake a smile so he won't ... | drew looks at me i fake a smile so he wont see... |
3 | Taylor Swift | Taylor Swift | A Place in This World | I don't know what I want, so don't ask me\n'Ca... | i dont know what i want so dont ask me cause i... |
4 | Taylor Swift | Taylor Swift | Cold As You | You have a way of coming easily to me\nAnd whe... | you have a way of coming easily to me and when... |
15. List the 20 most frequent terms in this dataframe.
from collections import Counter
# To get all lyrics in one text, you can concatenate all of them using the " ".join(list) syntax,
# which joins all elements in a list separating them by whitespace.
text = " ".join(lyric for lyric in ts_lyrics["Preprocessed Lyrics"])
# split() returns a list of all the words in the string
split_it = text.split()
# Pass the split_it list to an instance of the Counter class
# (use a new variable name so the Counter class itself is not shadowed)
word_counts = Counter(split_it)
# most_common(k) returns the k most frequently encountered values and their respective counts.
most_occur = word_counts.most_common(20)
print(most_occur)
[('i', 2377), ('you', 2319), ('the', 1623), ('and', 1403), ('me', 885), ('to', 843), ('a', 787), ('in', 686), ('it', 674), ('my', 642), ('of', 492), ('your', 475), ('we', 441), ('that', 436), ('all', 436), ('but', 428), ('like', 406), ('im', 404), ('this', 394), ('know', 380)]
You see that these are mainly stop words. Before removing them, let's plot a wordcloud of our data.
16. Plot a wordcloud with max 50 words using the `WordCloud()` function from the `wordcloud` package. Use the command `?WordCloud` to check the help for this function.
!pip install -q wordcloud
from wordcloud import WordCloud
?WordCloud
import matplotlib.pyplot as plt
wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white").generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
17. Use the English stop word list from the `nltk` package to remove the stop words. Check the stop words and update them with your optional list of words, for example: "im", "youre", "id", "dont", "cant", "didnt", "ive", "ill", "hasnt". Show the 20 most frequent terms and plot the wordcloud of 50 words again.
from nltk.corpus import stopwords
# run the code nltk.download('stopwords') if needed
stop_words = set(stopwords.words('english'))
print(stop_words)
{'herself', 'am', 'what', 'didn', 'they', 'yours', 'such', 'hers', 'between', 'being', 'does', 'be', 'after', "you're", 'how', 'will', 'during', "hadn't", 'his', "wouldn't", "you've", 'couldn', 'don', 'wouldn', 'ma', 'there', 'isn', 'mightn', 'd', 'again', 'her', 'own', 'doing', "isn't", 'was', 'itself', 'as', 'himself', 'were', 'once', "she's", 'same', 'then', 'at', 'too', 'mustn', 'both', "you'd", 'in', 'while', 'by', 'him', 'with', 'before', "mightn't", 'no', 'below', "haven't", "doesn't", 'haven', 'doesn', 'your', 'all', 'can', 'yourself', 'has', 'only', 'into', 'but', 'm', 'because', "should've", 'against', 'you', 'off', 'ourselves', 'i', 't', 'we', 'to', "shouldn't", 'wasn', 'few', 'my', "won't", 'hadn', 'needn', 'and', 'should', 'than', 'had', 'our', 'is', 's', 'll', 'it', 'its', 'of', 'nor', 'some', 'any', 'their', "shan't", 'each', 'those', "wasn't", "hasn't", 'if', 'more', 'here', 'about', "that'll", 'this', 'do', 'on', 'shouldn', 'he', "it's", 'for', 'yourselves', 'up', 'until', 'other', 'me', 'which', 'ain', "aren't", 'have', "needn't", 'from', 'the', 'aren', 'won', "mustn't", 'very', 'ours', 'a', 'not', 'weren', 'o', 'are', 'hasn', 'over', 'why', "couldn't", 'who', "weren't", 'shan', 'these', "didn't", 'whom', 'myself', 'did', 'now', "don't", 'where', 'out', 'having', 'when', 'so', 'above', 'under', 'been', 'through', 'themselves', 'she', 'just', 're', 've', 'further', 'theirs', 'y', 'or', 'that', 'most', "you'll", 'an', 'down', 'them'}
stop_words.update(["im", "youre", "id", "dont", "cant", "didnt", "ive", "ill", "hasnt"])
# stop_words.discard('word') # this is when you want to remove a word from the list
print(stop_words)
{'herself', 'am', 'what', 'didn', 'they', 'yours', 'such', 'hers', 'between', 'being', 'does', 'be', 'after', "you're", 'how', 'will', 'during', "hadn't", 'his', "wouldn't", "you've", 'couldn', 'don', 'wouldn', 'ma', 'there', 'isn', 'mightn', 'd', 'again', 'her', 'own', 'doing', "isn't", 'was', 'itself', 'as', 'himself', 'were', 'once', "she's", 'same', 'then', 'at', 'too', 'mustn', 'both', "you'd", 'in', 'while', 'by', 'him', 'with', 'before', "mightn't", 'no', 'below', "haven't", "doesn't", 'haven', 'doesn', 'your', 'all', 'can', 'youre', 'yourself', 'has', 'only', 'into', 'but', 'm', 'because', "should've", 'against', 'you', 'off', 'ourselves', 'i', 't', 'we', 'to', "shouldn't", 'wasn', 'few', 'my', "won't", 'hadn', 'needn', 'and', 'should', 'than', 'had', 'our', 'is', 's', 'll', 'it', 'its', 'of', 'nor', 'some', 'any', 'their', "shan't", 'each', 'those', "wasn't", "hasn't", 'if', 'more', 'im', 'ive', 'here', 'about', "that'll", 'didnt', 'this', 'do', 'on', 'shouldn', 'he', "it's", 'ill', 'for', 'yourselves', 'up', 'until', 'other', 'me', 'which', 'ain', "aren't", 'have', 'dont', "needn't", 'from', 'hasnt', 'the', 'aren', 'won', "mustn't", 'very', 'cant', 'ours', 'a', 'not', 'weren', 'o', 'are', 'hasn', 'over', 'why', "couldn't", 'who', "weren't", 'shan', 'these', "didn't", 'whom', 'id', 'myself', 'did', 'now', "don't", 'where', 'out', 'having', 'when', 'so', 'above', 'under', 'been', 'through', 'themselves', 'she', 'just', 're', 've', 'further', 'theirs', 'y', 'or', 'that', 'most', "you'll", 'an', 'down', 'them'}
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in stop_words])
ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].apply(lambda text: remove_stopwords(text))
ts_lyrics.head()
 | Artist | Album | Title | Lyrics | Preprocessed Lyrics |
---|---|---|---|---|---|
0 | Taylor Swift | Taylor Swift | Tim McGraw | He said the way my blue eyes shinx\nPut those ... | said way blue eyes shinx put georgia stars sha... |
1 | Taylor Swift | Taylor Swift | Picture to Burn | State the obvious, I didn't get my perfect fan... | state obvious get perfect fantasy realize love... |
2 | Taylor Swift | Taylor Swift | Teardrops on my Guitar | Drew looks at me,\nI fake a smile so he won't ... | drew looks fake smile wont see want need every... |
3 | Taylor Swift | Taylor Swift | A Place in This World | I don't know what I want, so don't ask me\n'Ca... | know want ask cause still trying figure know w... |
4 | Taylor Swift | Taylor Swift | Cold As You | You have a way of coming easily to me\nAnd whe... | way coming easily take take best start fight c... |
from collections import Counter
# To get all lyrics in one text, you can concatenate all of them using the " ".join(list) syntax,
# which joins all elements in a list separating them by whitespace.
text = " ".join(lyric for lyric in ts_lyrics["Preprocessed Lyrics"])
# split() returns a list of all the words in the string
split_it = text.split()
# Pass the split_it list to an instance of the Counter class
# (again, avoid shadowing the Counter class itself)
word_counts = Counter(split_it)
# most_common(k) returns the k most frequently encountered values and their respective counts.
most_occur = word_counts.most_common(20)
print(most_occur)
[('like', 406), ('know', 380), ('oh', 322), ('never', 294), ('love', 246), ('back', 240), ('time', 224), ('cause', 213), ('one', 177), ('say', 176), ('see', 170), ('got', 159), ('wanna', 158), ('think', 153), ('baby', 153), ('come', 150), ('go', 149), ('want', 142), ('ever', 134), ('could', 133)]
wordcloud = WordCloud(max_words=50, background_color="white").generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
18. We can apply stemming or lemmatization on our text data. Apply a lemmatizer from `nltk` and save the results.
from nltk.stem import WordNetLemmatizer
# run the code nltk.download('wordnet') if needed
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].apply(lambda text: lemmatize_words(text))
ts_lyrics.head()
 | Artist | Album | Title | Lyrics | Preprocessed Lyrics |
---|---|---|---|---|---|
0 | Taylor Swift | Taylor Swift | Tim McGraw | He said the way my blue eyes shinx\nPut those ... | said way blue eye shinx put georgia star shame... |
1 | Taylor Swift | Taylor Swift | Picture to Burn | State the obvious, I didn't get my perfect fan... | state obvious get perfect fantasy realize love... |
2 | Taylor Swift | Taylor Swift | Teardrops on my Guitar | Drew looks at me,\nI fake a smile so he won't ... | drew look fake smile wont see want need everyt... |
3 | Taylor Swift | Taylor Swift | A Place in This World | I don't know what I want, so don't ask me\n'Ca... | know want ask cause still trying figure know w... |
4 | Taylor Swift | Taylor Swift | Cold As You | You have a way of coming easily to me\nAnd whe... | way coming easily take take best start fight c... |
And here is the code for stemming:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])
# since we applied the lemmatization, we don't apply stemming; though you can try it!
# ts_lyrics["Preprocessed Lyrics"] = ts_lyrics["Preprocessed Lyrics"].apply(lambda text: stem_words(text))
# ts_lyrics.head()
The `PorterStemmer()` is for the English language. If we are working with other languages, we can use other stemmers, such as the `SnowballStemmer()`, which supports:
from nltk.stem.snowball import SnowballStemmer
SnowballStemmer.languages
('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')
19. Use `CountVectorizer()` from the `sklearn` package and build a bag of words model on `Preprocessed Lyrics` based on term frequency. Check the shape of the output matrix.
from sklearn.feature_extraction.text import CountVectorizer # for bag of words feature extraction
# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
vectorizer1 = CountVectorizer(max_features = 3000)
# fit_transform() does two functions: First, it fits the model and learns the vocabulary;
# second, it transforms our data into feature vectors.
# The input to fit_transform should be a list of strings.
dtm = vectorizer1.fit_transform(ts_lyrics["Preprocessed Lyrics"])
print(dtm.shape)
(132, 2597)
dtm
<132x2597 sparse matrix of type '<class 'numpy.int64'>' with 10530 stored elements in Compressed Sparse Row format>
# we can convert it to a dataframe
dtm_df = dtm.toarray()
dtm_df = pd.DataFrame(dtm_df)
dtm_df.head()
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 2587 | 2588 | 2589 | 2590 | 2591 | 2592 | 2593 | 2594 | 2595 | 2596 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 2597 columns
20. Inspect the first 100 terms in the vocabulary.
# Take a look at the words in the vocabulary
vocab = vectorizer1.get_feature_names_out()
print(vocab[:100])
['16' '16th' '45' '4am' 'aah' 'abigail' 'absent' 'absurd' 'accent' 'accident' 'accused' 'ace' 'achilles' 'aching' 'acing' 'across' 'act' 'acted' 'actress' 'actually' 'add' 'adjusting' 'admit' 'adore' 'adventure' 'affair' 'afraid' 'afterglow' 'afternoon' 'age' 'ago' 'ah' 'ahah' 'ahahah' 'ahead' 'ahh' 'aim' 'aint' 'air' 'airplane' 'aisle' 'album' 'aligned' 'alive' 'alls' 'almost' 'alone' 'along' 'alpha' 'already' 'alright' 'altar' 'always' 'ambition' 'amen' 'american' 'americana' 'amnesia' 'amount' 'andi' 'ane' 'angel' 'angry' 'another' 'answer' 'anthem' 'anther' 'anticipation' 'anybody' 'anymore' 'anyone' 'anything' 'anyway' 'anywhere' 'apart' 'apartment' 'apology' 'applause' 'archer' 'architect' 'arent' 'argue' 'arm' 'armor' 'around' 'arrowhead' 'ash' 'aside' 'ask' 'asked' 'asking' 'asleep' 'assume' 'assumption' 'ate' 'ateam' 'attached' 'attack' 'attitude']
21. Using `TfidfVectorizer()`, you can create a model based on tfidf. Apply this vectorizer to your text data. Does the shape of the output matrix differ from `dtm`?
from sklearn.feature_extraction.text import TfidfVectorizer
# Create TfidfVectorizer object
vectorizer2 = TfidfVectorizer()
# Generate matrix of word vectors
tfidf_matrix = vectorizer2.fit_transform(ts_lyrics["Preprocessed Lyrics"])
# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)
(132, 2597)
22. Use the `TfidfVectorizer()` to create an n-gram based model with n = 1 and 2. Use the `ngram_range` argument to determine the lower and upper boundary of the range of n-values for different n-grams to be extracted. (Tip: use `?TfidfVectorizer`.)
# Create TfidfVectorizer object
vectorizer3 = TfidfVectorizer(ngram_range=(1, 2))
# Generate matrix of word vectors
tfidf_matrix3 = vectorizer3.fit_transform(ts_lyrics["Preprocessed Lyrics"])
# Print the shape of tfidf_matrix
print(tfidf_matrix3.shape)
(132, 15016)
23. We want to compare the lyrics of the Friends theme song with the lyrics of Taylor Swift's songs and find the most similar one. Use the string below. First, apply the pre-processing steps and then transform the text into count and tfidf vectors. Do the bag of words models agree on the most similar song to the Friends theme song?
friends_theme_lyrics = "So no one told you life was going to be this way. Your job's a joke, you're broke, you're love life's DOA. It's like you're always stuck in second gear, When it hasn\'t been your day, your week, your month, or even your year. But, I\'ll be there for you, when the rain starts to pour. I\'ll be there for you, like I\'ve been there before. I\'ll be there for you, cause you\'re there for me too."
friends_theme_lyrics
"So no one told you life was going to be this way. Your job's a joke, you're broke, you're love life's DOA. It's like you're always stuck in second gear, When it hasn't been your day, your week, your month, or even your year. But, I'll be there for you, when the rain starts to pour. I'll be there for you, like I've been there before. I'll be there for you, cause you're there for me too."
friends_theme_lyrics = remove_punctuation(friends_theme_lyrics)
friends_theme_lyrics = friends_theme_lyrics.lower()
friends_theme_lyrics = remove_stopwords(friends_theme_lyrics)
friends_theme_lyrics = lemmatize_words(friends_theme_lyrics)
friends_theme_lyrics
'one told life going way job joke broke love life doa like always stuck second gear day week month even year rain start pour like cause'
friends_theme_lyrics_tf = vectorizer1.transform([friends_theme_lyrics])
print(friends_theme_lyrics_tf.shape)
print(dtm.shape)
(1, 2597)
(132, 2597)
from sklearn.metrics.pairwise import cosine_similarity
# compute and print the cosine similarity matrix
cosine_sim_dtm = cosine_similarity(dtm, friends_theme_lyrics_tf)
print(cosine_sim_dtm)
[[0.07295675] [0.05749499] [0.05668202] [0.099573 ] [0.09816136] [0.00975761] [0.16365771] [0.11501093] [0.02125256] [0.15038123] [0.07792865] [0.09200874] [0.17766726] [0.0360492 ] [0.0786839 ] [0.1062023 ] [0.23829304] [0.08566568] [0.15519271] [0.1352231 ] [0.03202563] [0.19158319] [0.09043166] [0.19051587] [0.10639904] [0.12562973] [0.13526614] [0.1340465 ] [0.10332549] [0.14529915] [0.08091962] [0.0428993 ] [0.05358677] [0.11510231] [0.03928371] [0.05463417] [0.0946985 ] [0.0745356 ] [0.24685715] [0.07198268] [0.09507654] [0.11511347] [0.11136921] [0.10401235] [0.1946593 ] [0.15567091] [0.21439196] [0.13088543] [0.11021668] [0.09369712] [0.11888042] [0.06581261] [0.00903711] [0.21465394] [0.22794562] [0.04007421] [0.06975801] [0.05602768] [0.01563873] [0.10146346] [0.13488377] [0.1500909 ] [0.0521599 ] [0.16455472] [0.20490974] [0.17563692] [0.13237606] [0.02857238] [0.03055662] [0.17989569] [0.0790393 ] [0.00461099] [0.09170196] [0.02086808] [0.03288424] [0.11242975] [0.044955 ] [0.02726372] [0.18975469] [0.06574775] [0.08736843] [0.07787518] [0.24627294] [0.04908068] [0.13145637] [0.05978084] [0.14187609] [0.15555556] [0.04961695] [0.05384297] [0.09147674] [0.03362627] [0.09035781] [0.05615828] [0.07207214] [0.0340633 ] [0.16397832] [0.05627802] [0.09116057] [0.06624405] [0.07803834] [0.06365683] [0.04996305] [0.09610043] [0.10304734] [0.19756782] [0.01443376] [0.1750503 ] [0.18845876] [0.05396298] [0.17025131] [0.11426298] [0.10506787] [0.22829105] [0.08403295] [0.06714701] [0.05614346] [0.13715477] [0.01830783] [0.13255879] [0.07392213] [0.08475223] [0.20016019] [0.09798273] [0.10425721] [0.07647191] [0.04792568] [0.23490916] [0.0564445 ] [0.11111111] [0.08512565] [0.05363453]]
import numpy as np
max_index = np.argmax(cosine_sim_dtm, axis=0)
print(cosine_sim_dtm[max_index])
max_index
[[0.24685715]]
array([38])
ts_lyrics.iloc[max_index]
 | Artist | Album | Title | Lyrics | Preprocessed Lyrics |
---|---|---|---|---|---|
38 | Taylor Swift | Speak Now | The Story of Us | I used to think one day we'd tell the story of... | used think one day wed tell story u met spark ... |
ts_lyrics["Preprocessed Lyrics"].iloc[38]
'used think one day wed tell story u met spark flew instantly people would say theyre lucky one used know place spot next searching room empty seat cause lately even know page oh simple complication miscommunications lead fall many thing wish knew many wall break standing alone crowded room speaking dying know killing like killing yeah know say since twist fate broke story u look lot like tragedy next chapter howd end way see nervously pulling clothes trying look busy best avoid starting think one day tell story u losing mind saw held pride like held oh scared see ending pretending nothing tell miss know never heard silence quite loud standing alone crowded room speaking dying know killing like killing yeah know say since twist fate broke story u look lot like tragedy looking like contest act like care le liked better side battle hand would lay armor youd say youd rather love fight many thing wish knew story u might ending soon standing alone crowded room speaking dying know killing like killing yeah know say since twist fate broke story u look lot like tragedy end'
friends_theme_lyrics_tfidf = vectorizer3.transform([friends_theme_lyrics])
print(friends_theme_lyrics_tfidf.shape)
print(tfidf_matrix3.shape)
# compute and print the cosine similarity matrix
cosine_sim_tfidf = cosine_similarity(tfidf_matrix3, friends_theme_lyrics_tfidf)
print(cosine_sim_tfidf)
(1, 15016)
(132, 15016)
[[0.02369657] [0.01318075] [0.01128244] [0.03986478] [0.03108815] [0.00181568] [0.02572803] [0.02691028] [0.00512609] [0.04532813] [0.01615807] [0.00787449] [0.03826574] [0.0068687 ] [0.01320367] [0.01245819] [0.09768082] [0.03284433] [0.01891928] [0.05104409] [0.00801751] [0.045455 ] [0.02005361] [0.04590047] [0.0319897 ] [0.01850863] [0.02299573] [0.0238499 ] [0.01617267] [0.03525199] [0.02914826] [0.01417113] [0.01017535] [0.02329621] [0.01165122] [0.00883778] [0.0213434 ] [0.01468301] [0.052752 ] [0.01398688] [0.02316361] [0.02104993] [0.03310764] [0.01013489] [0.04137598] [0.04265813] [0.04249053] [0.03391019] [0.02922056] [0.01882679] [0.01167784] [0.01196412] [0.00448242] [0.03975051] [0.02942139] [0.02500672] [0.0149794 ] [0.01541409] [0.00149821] [0.01839457] [0.03851785] [0.02703587] [0.01044214] [0.01812665] [0.04455981] [0.03015706] [0.03756264] [0.00366365] [0.00325763] [0.0333111 ] [0.01114174] [0.00036564] [0.03036674] [0.00187995] [0.00755594] [0.01588253] [0.01206606] [0.02056343] [0.07499416] [0.01513767] [0.03379185] [0.01961567] [0.04624439] [0.00618851] [0.02567295] [0.01120954] [0.04119311] [0.01641921] [0.00560346] [0.00851409] [0.02050939] [0.0099525 ] [0.01615719] [0.00906767] [0.00936747] [0.01777334] [0.02409146] [0.02385133] [0.03111177] [0.01181365] [0.04510753] [0.0069002 ] [0.0124372 ] [0.00991338] [0.01700991] [0.0369523 ] [0.00665362] [0.02971692] [0.02632021] [0.01001789] [0.02780341] [0.01487108] [0.01907687] [0.03361355] [0.00807426] [0.00985324] [0.01145457] [0.01459391] [0.00972955] [0.03375572] [0.01890663] [0.01118828] [0.02691614] [0.01158261] [0.03914842] [0.01562798] [0.01257735] [0.02771936] [0.0127158 ] [0.03262289] [0.02177201] [0.00782077]]
max_index = np.argmax(cosine_sim_tfidf, axis=0)
print(cosine_sim_tfidf[max_index])
max_index
[[0.09768082]]
array([16])
ts_lyrics.iloc[max_index]
 | Artist | Album | Title | Lyrics | Preprocessed Lyrics |
---|---|---|---|---|---|
16 | Taylor Swift | Fearless | Forever & Always | Once upon a time, I believe it was a Tuesday w... | upon time believe tuesday caught eye caught on... |
ts_lyrics["Preprocessed Lyrics"].iloc[16]
'upon time believe tuesday caught eye caught onto something hold onto night looked eye told loved kidding cause seems thing breaking almost never speak feel welcome anymore baby happened please tell cause one second perfect halfway door stare phone still called feel low feel nothing flashback said forever always oh rain bedroom everything wrong rain rain gone cause said forever always line say something way honest made run hide like scared little boy looked eye thought knew minute sure here everything coming nothing here silence cut core going thought knew minute anymore stare phone still called feel low feel nothing flashback said forever always oh rain bedroom everything wrong rain rain gone cause said forever always mean baby think oh back baby back forget everything back baby back forget everything cause rain bedroom everything wrong rain rain gone cause said forever always oh stare phone still called feel low feel nothing flashback said forever always rain bedroom everything wrong rain rain gone cause said forever always mean baby said forever always yeah'