Cara Marta Messina
PhD Candidate in English, Writing & Rhetoric
Northeastern University
To be published in The Journal of Writing Analytics, Volume 3
Using the corpora created in the text preparation notebooks, this notebook applies several computational text analysis methods, including natural language processing (NLP) methods and word embedding models. The fourth notebook will then use concordances to "fold" the computational models back into the texts (William Reed Quinn, forthcoming).
# pandas for working with dataframes
import pandas as pd
# nltk libraries for tokenizing and part-of-speech tagging
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
# gensim for word2vec models
import gensim
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec as wv
First, I created a function that reads in a text file containing a string of words, tokenizes that string, and returns a tokenized list ready for analysis. Then, I used this function on each of the text corpora I created.
def read_txt(filePath):
    '''
    This function reads a file (specifically a text file) and tokenizes that file
    Input: a .txt filepath containing a string of words
    Output: a tokenized list of words
    '''
    with open(filePath, "r") as file:
        new_string = file.read()
    corpus_token = word_tokenize(new_string)
    return corpus_token
preKorrasami = read_txt('../../../data/korra/korra2018/time/preKorrasami.txt')
subtextKorrasami = read_txt('../../../data/korra/korra2018/time/subtextKorrasami.txt')
postKorrasami = read_txt('../../../data/korra/korra2018/time/postKorrasami.txt')
type(preKorrasami)
print(preKorrasami[:10])
print(subtextKorrasami[:10])
print(postKorrasami[:10])
Using basic natural language processing methods (part-of-speech tagging and word counts), the functions below explore trends in each corpus as a place to begin pulling together results.
The POS_tag function below does several things: it counts overall word frequencies, generates bigram and trigram frequency distributions, tags the text with parts of speech, and reports the most frequent nouns, verbs, adjectives, and prepositions. It relies on the printWordRatio helper, defined first, which prints each item's frequency as a ratio of the corpus word count.
Although I wound up not using this function or its results in the Journal of Writing Analytics article, I still want to publish the function so others may use it if they are interested. I have also used this function in other research.
def printWordRatio(freqDist, wordcount, stringName, num):
    '''
    This takes a frequency distribution, which counts basic frequency, and prints the most common words/nouns/nGrams/verbs/etc.
    input: the frequency distribution, the wordcount of the overall text, a string that labels the output, and the number of most common items to print
    output: a printed list of the top most common words/nGrams and the ratio of their appearance (each item's count divided by the overall wordcount of the corpus)
    '''
    print("Word count for text:")
    print(wordcount)
    print("________________________")
    print(stringName)
    for item, count in freqDist.most_common(num):
        print(item, count / wordcount)
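As a quick standalone check, printWordRatio can be called directly on a frequency distribution built from one of the corpora loaded above (a minimal sketch using only names already defined in this notebook):
pre_freq = nltk.FreqDist(preKorrasami)
printWordRatio(pre_freq, len(preKorrasami), "Most frequent words (pre-Korrasami):", 10)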
def POS_tag(text, num):
    '''
    This takes a tokenized text, tags it with parts of speech, and then prints the most frequent words used for particular parts of speech.
    input: a tokenized text (could be clean or not) and the number of most common items to print
    output: a printed list of the most frequent words, nGrams, and words tagged as different parts of speech
    '''
    # overall word count
    wordcount = len(text)
    text_word_frequency = nltk.FreqDist(text)
    # bigram frequencies
    bigram = list(ngrams(text, 2))
    biGramFreq = nltk.FreqDist(bigram)
    # trigram frequencies
    triGram = list(ngrams(text, 3))
    triGramFreq = nltk.FreqDist(triGram)
    # tag every token with its part of speech
    text_tagged = nltk.pos_tag(text)
    # count the nouns (singular and plural)
    text_nouns = [word for word, pos in text_tagged if pos in ('NN', 'NNS')]
    text_freq_nouns = nltk.FreqDist(text_nouns)
    # count the verbs (all tenses and forms)
    text_verbs = [word for word, pos in text_tagged if pos in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
    text_freq_verbs = nltk.FreqDist(text_verbs)
    # count the adjectives (base, comparative, and superlative)
    text_adj = [word for word, pos in text_tagged if pos in ('JJ', 'JJR', 'JJS')]
    text_freq_adj = nltk.FreqDist(text_adj)
    # count the prepositions/subordinating conjunctions
    text_prep = [word for word, pos in text_tagged if pos == 'IN']
    text_freq_prep = nltk.FreqDist(text_prep)
    printWordRatio(text_word_frequency, wordcount, "Most frequent words:", num)
    printWordRatio(biGramFreq, wordcount, "Most frequent bigrams:", num)
    printWordRatio(triGramFreq, wordcount, "Most frequent trigrams:", num)
    printWordRatio(text_freq_nouns, wordcount, "Most frequent nouns:", num)
    printWordRatio(text_freq_verbs, wordcount, "Most frequent verbs:", num)
    printWordRatio(text_freq_adj, wordcount, "Most frequent adjectives:", num)
    printWordRatio(text_freq_prep, wordcount, "Most frequent prepositions:", num)
POS_tag(preKorrasami, 40)
POS_tag(subtextKorrasami, 40)
POS_tag(postKorrasami, 40)
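Because the ratios are only printed, they are awkward to sort or compare across corpora. Since pandas is already imported, here is a minimal sketch of a variant that returns the top items as a DataFrame instead; freq_ratio_df is my own helper name, not part of the original analysis:
def freq_ratio_df(freqDist, wordcount, label, num):
    '''Return the top num items of a frequency distribution as a DataFrame of ratios.'''
    rows = [(item, count / wordcount) for item, count in freqDist.most_common(num)]
    return pd.DataFrame(rows, columns=[label, 'ratio'])

# for example, the ten most frequent nouns in the pre-Korrasami corpus as a table
pre_tagged = nltk.pos_tag(preKorrasami)
pre_nouns = nltk.FreqDist([word for word, pos in pre_tagged if pos in ('NN', 'NNS')])
freq_ratio_df(pre_nouns, len(preKorrasami), 'noun', 10)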
Using gensim's LineSentence, which reads a file and yields each line as a list of whitespace-separated tokens, I read in all my files and then trained a word2vec model for each.
sent_preKorra = LineSentence('../../../data/korra/korra2018/time/preKorrasami.txt')
sent_subKorra = LineSentence('../../../data/korra/korra2018/time/subtextKorrasami.txt')
sent_postKorra = LineSentence('../../../data/korra/korra2018/time/postKorrasami.txt')
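To confirm what LineSentence yields before training, we can peek at the first sentence of one corpus (a small sketch; LineSentence simply splits each line on whitespace):
first_sentence = next(iter(sent_preKorra))
print(first_sentence[:10])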
preKorra_w2v = wv(sent_preKorra, window=20, min_count=10, workers=4)
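Because training can take a while on larger corpora, the trained model can be saved and reloaded with gensim's save and load methods (the file path here is my own placeholder):
preKorra_w2v.save('preKorrasami.w2v')
# later: preKorra_w2v = wv.load('preKorrasami.w2v')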
preKorra_w2v.wv.most_similar(['asami'], topn=20)
preKorra_w2v.wv.most_similar(['girlfriend'], topn=20)
preKorra_w2v.wv.most_similar(['korra'], topn=20)
preKorra_w2v.wv.most_similar(['cheerlead'], topn=20)
preKorra_w2v.wv.most_similar(['heiress'], topn=20)
preKorra_w2v.wv.most_similar(['woman'], topn=20)
preKorra_w2v.wv.most_similar(['man'], topn=20)
preKorra_w2v.wv.most_similar(['muscular'], topn=20)
preKorra_w2v.wv.most_similar(['feminin'], topn=20)
preKorra_w2v.wv.most_similar(['masculin'], topn=20)
preKorra_w2v.wv.most_similar(['gender'], topn=20)
preKorra_w2v.wv.most_similar(['marri'], topn=20)
preKorra_w2v.wv.most_similar(['pregnant'], topn=20)
subKorra_w2v = wv(sent_subKorra, window=20, min_count=10, workers=4)
subKorra_w2v.wv.most_similar(['asami'], topn=20)
subKorra_w2v.wv.most_similar(['girlfriend'], topn=20)
subKorra_w2v.wv.most_similar(['korra'], topn=20)
subKorra_w2v.wv.most_similar(['cheerlead'], topn=20)
subKorra_w2v.wv.most_similar(['heiress'], topn=20)
subKorra_w2v.wv.most_similar(['woman'], topn=20)
subKorra_w2v.wv.most_similar(['feminin'], topn=20)
subKorra_w2v.wv.most_similar(['man'], topn=20)
subKorra_w2v.wv.most_similar(['masculin'], topn=20)
subKorra_w2v.wv.most_similar(['gender'], topn=20)
subKorra_w2v.wv.most_similar(['lesbian'], topn=20)
subKorra_w2v.wv.most_similar(['marri'], topn=20)
subKorra_w2v.wv.most_similar(['pregnant'], topn=20)
postKorra_w2v = wv(sent_postKorra, window=20, min_count=10, workers=4)
postKorra_w2v.wv.most_similar(['asami'], topn=20)
postKorra_w2v.wv.most_similar(['girlfriend'], topn=20)
postKorra_w2v.wv.most_similar(['korra'], topn=20)
postKorra_w2v.wv.most_similar(['heiress'], topn=20)
postKorra_w2v.wv.most_similar(['woman'], topn=20)
postKorra_w2v.wv.most_similar(['feminin'], topn=20)
postKorra_w2v.wv.most_similar(['masculin'], topn=20)
postKorra_w2v.wv.most_similar(['gender'], topn=20)
postKorra_w2v.wv.most_similar(['queer'], topn=20)
postKorra_w2v.wv.most_similar(['bisexu'], topn=20)
postKorra_w2v.wv.most_similar(['racist'], topn=20)
postKorra_w2v.wv.most_similar(['marri'], topn=20)
postKorra_w2v.wv.most_similar(['pregnant'], topn=20)
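Rather than reading the three query blocks side by side, a small helper can line up a term's nearest neighbors in each period's model, skipping any model where the term fell below min_count. This is a sketch of my own (compare_term is not part of the original analysis):
models = {'pre': preKorra_w2v, 'subtext': subKorra_w2v, 'post': postKorra_w2v}

def compare_term(term, topn=10):
    '''Print the nearest neighbors of a term in each period's word2vec model.'''
    for name, model in models.items():
        if term in model.wv:
            neighbors = [word for word, score in model.wv.most_similar([term], topn=topn)]
            print(name + ':', neighbors)
        else:
            print(name + ': term below min_count in this corpus')

compare_term('gender')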
Good terms to look at across the three models (looped over in the sketch after this list):
- feminin (feminine)
- masculin (masculine)
- gender
- queer
- bisexu (bisexual)
- lesbian
- heterosexu (heterosexual, heterosexuality, etc.)
- racist
- marri (marry, etc.)
- pregnant
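Here is that loop, using the compare_term sketch above on the stemmed forms that match the cleaned corpora:
for term in ['feminin', 'masculin', 'gender', 'queer', 'bisexu', 'lesbian', 'heterosexu', 'racist', 'marri', 'pregnant']:
    print('===', term, '===')
    compare_term(term)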
Although NLTK has a concordance function, it only shows the first 25 results. Instead, I use the "makeConc" function from Geoffrey Rockwell, which returns every match and is fairly flexible in how much context it shows.
The "makeConc" function requires a tokenized list, so I will still use the "read_txt" function, but I will run the concordances on the uncleaned versions of the corpora so the context is clearer.
I have chosen to keep the output results hidden because these excerpts and texts do not belong to me; I would prefer not to publish someone else's writing and language without their permission.
preKorra_string = read_txt('../../../data/korra/korra2018/time/preKorrasami_unclean.txt')
subtextKorra_string = read_txt('../../../data/korra/korra2018/time/subtextKorrasami_unclean.txt')
postKorra_string = read_txt('../../../data/korra/korra2018/time/postKorrasami_unclean.txt')
def makeConc(word2conc, list2FindIn, context2Use, concList):
    '''
    This finds every occurrence of a word in a tokenized list and appends each concordance line (the word with its surrounding context) to concList.
    input: the word to find, the tokenized list to search, the number of context words on each side, and the list to append results to
    output: concList is filled with strings of the form "location: context"
    '''
    end = len(list2FindIn)
    for location in range(end):
        if list2FindIn[location] == word2conc:
            # check whether the context window runs past the beginning or end of the list
            if (location - context2Use) < 0:
                beginCon = 0
            else:
                beginCon = location - context2Use
            if (location + context2Use) > end:
                endCon = end
            else:
                endCon = location + context2Use + 1
            theContext = list2FindIn[beginCon:endCon]
            concordanceLine = ' '.join(theContext)
            # print(str(location) + ": " + concordanceLine)
            concList.append(str(location) + ": " + concordanceLine)
gender1 = []
makeConc('gender',preKorra_string,5,gender1)
gender1
gender2 = []
makeConc('gender',subtextKorra_string,5,gender2)
gender2
gender3 = []
makeConc('gender',postKorra_string,5,gender3)
gender3
fem1 = []
makeConc('feminine',preKorra_string,6,fem1)
fem1
masc1 = []
makeConc('masculine',preKorra_string,6,masc1)
masc1
fem2 = []
makeConc('feminine',subtextKorra_string,7,fem2)
fem2
masc2 = []
makeConc('masculine',subtextKorra_string,7,masc2)
masc2
fem3 = []
makeConc('feminine',postKorra_string,6,fem3)
fem3
masc3 = []
makeConc('masculine',postKorra_string,6,masc3)
masc3
gay1 = []
makeConc('gay',preKorra_string,9,gay1)
gay1
les1 = []
makeConc('lesbian',preKorra_string,9,les1)
les1
bi1 = []
makeConc('bi',preKorra_string,7,bi1)
bi1
gay2 = []
makeConc('gay',subtextKorra_string,9,gay2)
gay2
les2 = []
makeConc('lesbian',subtextKorra_string,9,les2)
les2
bi2 = []
makeConc('bisexual',subtextKorra_string,9,bi2)
bi2
gay3 = []
makeConc('gay',postKorra_string,7,gay3)
gay3
les3 = []
makeConc('lesbian',postKorra_string,7,les3)
les3
gf1 = []
makeConc('girlfriend',preKorra_string,6,gf1)
gf1
gf2 = []
makeConc('girlfriend',subtextKorra_string,8,gf2)
gf2
gf3 = []
makeConc('girlfriend',postKorra_string,5,gf3)
gf3
bio1 = []
makeConc('biological',preKorra_string,6,bio1)
bio1
bio2 = []
makeConc('biological',subtextKorra_string,6,bio2)
bio2
bio3 = []
makeConc('biological',postKorra_string,6,bio3)
bio3
bi3 = []
makeConc('bi',postKorra_string,8,bi3)
bi3
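Since the concordance outputs are kept hidden in the published notebook, one option is to write each list to a local file for private review; this is a sketch, and the helper name and file path are my own:
def save_conc(concList, outPath):
    '''Write one concordance line per row to a local text file (not included in the published notebook).'''
    with open(outPath, 'w') as f:
        f.write('\n'.join(concList))

save_conc(gender1, 'preKorrasami_gender_concordance.txt')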