Cara Marta Messina
Northeastern University
messina [dot] c [at] husky [dot] neu [dot] edu
This notebook takes data collected from Archive of Our Own (AO3), a popular fanfiction repository, and prepares it for analysis. The data was collected using this AO3 Python scraper. The corpus consists of The Legend of Korra and Game of Thrones fanfics, from the first fic published on AO3 through 2019.
This notebook is part of the Critical Fan Toolkit, Cara Marta Messina's public + digital dissertation
#pandas for working with dataframes
import pandas as pd
#regular expression library
import re
#numpy for working with numerical values, such as NaNs
import numpy as np
from nltk import word_tokenize
import string
punctuations = list(string.punctuation)
#has the nice counter feature for counting tags
import collections
from collections import Counter
#for splitting a string of comma-separated elements into a list
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktLanguageVars
#visualizations
import plotly as py
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
#set my plotly chart studio credentials for publishing charts
import chart_studio
chart_studio.tools.set_credentials_file(username='caramessina', api_key='YOUR_API_KEY') #replace with your own chart studio API key
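A safer pattern (an optional sketch; it assumes the key is stored in a CHART_STUDIO_API_KEY environment variable, which the original notebook does not set up) is to read the credential from the environment rather than hardcoding it:
import os
#read the API key from an environment variable so it never lives in the notebook itself
chart_studio.tools.set_credentials_file(username='caramessina', api_key=os.environ['CHART_STUDIO_API_KEY'])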
korra_all = pd.read_csv('./data/group_month/allkorra_months.csv').set_index('month')
korra_all.head(3)
#reading in multiple csv files, since one large one breaks my kernel
gotmonth0 = pd.read_csv('data/group_month/got_1.csv')
gotmonth1 = pd.read_csv('data/group_month/got_2.csv')
gotmonth2 = pd.read_csv('data/group_month/got_3.csv')
gotmonth3 = pd.read_csv('data/group_month/got_4.csv')
got_all = pd.concat([gotmonth0, gotmonth1, gotmonth2, gotmonth3]).set_index('month')
got_all.head(5)
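As an optional sanity check (assuming the four partial dataframes are still in memory), we can confirm the concatenation kept every row:
#the concatenated dataframe should be exactly as long as its four parts combined
assert len(got_all) == len(gotmonth0) + len(gotmonth1) + len(gotmonth2) + len(gotmonth3)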
Next, I create functions that take the tags from a metadata column (phrased as characterA/characterB, characterC/characterD, etc. in the data), split them on commas, and count them, outputting the most common relationship tags used.
def column_to_list(df, columnName):
    '''
    this function takes all the values from a specific column, replaces empty values, and joins them into a single string.
    input: the dataframe and the column name
    output: one string containing every value in the column, separated by spaces
    '''
    #replace NaNs with empty strings so the join below does not fail
    df[columnName] = df[columnName].fillna('')
    joined = ' '.join(df[columnName].tolist())
    return joined
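A quick illustration of what column_to_list returns (the mini dataframe below is invented for demonstration):
demo = pd.DataFrame({'relationship': ['Korra/Asami Sato', np.nan, 'Mako/Korra']})
print(column_to_list(demo, 'relationship'))
#prints 'Korra/Asami Sato  Mako/Korra' (the NaN becomes an empty string)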
def clean_tokens(text):
    '''
    this function tokenizes a string, lower-cases the tokens, and removes stop words and punctuation.
    input: a string of text
    output: a list of clean, lower-cased tokens
    '''
stopwords = ['i', 'me', 'my', 'myself', "“", "”", 'we', 'our', '’', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", "would", "could", 'won', "won't", 'wouldn', "wouldn't"]
    #tokenize the string into words, then lower-case each token
    text_lc = [word.lower() for word in word_tokenize(text)]
    #remove stop words, then punctuation
    text_tokens_clean = [word for word in text_lc if word not in stopwords]
    text_tokens_clean = [word for word in text_tokens_clean if word not in punctuations]
    return text_tokens_clean
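A quick illustration (the sample sentence is invented) of how clean_tokens filters a raw string:
sample = 'She was the Avatar and they could not take that away from her.'
print(clean_tokens(sample))
#prints something like ['avatar', 'take', 'away']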
def TagsAnalyzer(df, monthBegin, monthEnd, columnName):
    '''
    input: the dataframe, the beginning and ending index months (such as '2012-04'), and the metadata column name (such as 'additional tags')
    output: a list of tuples counting the 50 most common tags in that range of months
    '''
    #choose the months to analyze; .copy() keeps pandas from raising a SettingWithCopyWarning when the slice is modified below
    months_df = df.loc[monthBegin:monthEnd, :].copy()
    #replace empty values & join all the tags into one string
    tags_string = column_to_list(months_df, columnName)
    #define a tokenizer class that treats the COMMA as the boundary character (note the one-element tuple)
    class CommaPoint(PunktLanguageVars):
        sent_end_chars = (',',)
    tokenizer = PunktSentenceTokenizer(lang_vars=CommaPoint())
    #split the string of tags on the COMMA, not on sentence-ending punctuation, so each tag becomes one element
    ListOfTags = tokenizer.tokenize(tags_string)
#the "Counter" function is from the collections library
allCounter=collections.Counter(ListOfTags)
return allCounter.most_common(50)
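To illustrate how the comma-based splitting behaves (the tag string below is invented):
class CommaPoint(PunktLanguageVars):
    sent_end_chars = (',',)
demo_tokenizer = PunktSentenceTokenizer(lang_vars=CommaPoint())
print(demo_tokenizer.tokenize('Korra/Asami Sato, Mako/Korra, Bolin/Opal'))
#prints something like ['Korra/Asami Sato,', 'Mako/Korra,', 'Bolin/Opal']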
korra_preKArel = TagsAnalyzer(korra_all,'2011-02','2014-07','relationship')
korra_subKArel = TagsAnalyzer(korra_all,'2014-08','2014-11','relationship')
korra_postKArel = TagsAnalyzer(korra_all,'2014-12','2015-07','relationship')
print('Pre-Korrasami')
print(korra_preKArel)
print('\n Korrasami Subtext')
print(korra_subKArel)
print('\n Post-Korrasami')
print(korra_postKArel)
korra_preKA_at = TagsAnalyzer(korra_all,'2011-02','2014-07','additional tags')
korra_subKA_at = TagsAnalyzer(korra_all,'2014-08','2014-11','additional tags')
korra_postKA_at = TagsAnalyzer(korra_all,'2014-12','2015-07','additional tags')
print('Pre-Korrasami')
print(korra_preKA_at)
print('\n Korrasami Subtext')
print(korra_subKA_at)
print('\n Post-Korrasami')
print(korra_postKA_at)
korra_preKAcat = TagsAnalyzer(korra_all,'2011-02','2014-07','category')
korra_subKAcat = TagsAnalyzer(korra_all,'2014-08','2014-11','category')
korra_postKAcat = TagsAnalyzer(korra_all,'2014-12','2015-07','category')
print('Pre-Korrasami')
print(korra_preKAcat)
print('\n Korrasami Subtext')
print(korra_subKAcat)
print('\n Post-Korrasami')
print(korra_postKAcat)
#seasons 1 and 2 – season 3 starts March 2013
got1_2relationship = TagsAnalyzer(got_all,'2006-08','2013-02','relationship')
print('\n Seasons 1 and 2')
print(got1_2relationship)
#seasons 3 and 4 – season 5 starts April 2015
got3_4relationship = TagsAnalyzer(got_all,'2013-03','2015-03','relationship')
print('\n Seasons 3 and 4')
print(got3_4relationship)
#seasons 5 and 6 – season 7 starts July 2017
got5_6relationship = TagsAnalyzer(got_all,'2015-07','2017-06','relationship')
print('\n Seasons 5 and 6')
print(got5_6relationship)
#season 7 – season 8 starts April 2019
got7relationship = TagsAnalyzer(got_all,'2017-07','2019-03','relationship')
print('\n Season 7')
print(got7relationship)
#season 8 – April 2019 through the end of the data in September 2019
got8relationship = TagsAnalyzer(got_all,'2019-04','2019-09','relationship')
print('\n Season 8')
print(got8relationship)
#seasons 1 and 2 – season 3 starts March 2013
got1_2AT = TagsAnalyzer(got_all,'2006-08','2013-02','additional tags')
print('\n Seasons 1 and 2')
print(got1_2AT)
#seasons 3 and 4 – season 5 starts April 2015
got3_4AT = TagsAnalyzer(got_all,'2013-03','2015-03','additional tags')
print('\n Seasons 3 and 4')
print(got3_4AT)
#seasons 5 and 6 – season 7 starts July 2017
got5_6AT = TagsAnalyzer(got_all,'2015-07','2017-06','additional tags')
print('\n Seasons 5 and 6')
print(got5_6AT)
#season 7 – season 8 starts April 2019
got7AT = TagsAnalyzer(got_all,'2017-07','2019-03','additional tags')
print('\n Season 7')
print(got7AT)
#season 8 – April 2019 through the end of the data in September 2019
got8AT = TagsAnalyzer(got_all,'2019-04','2019-09','additional tags')
print('\n Season 8')
print(got8AT)
#seasons 1 and 2 – season 3 starts March 2013
got1_2cat = TagsAnalyzer(got_all,'2006-08','2013-02','category')
print('\n Seasons 1 and 2')
print(got1_2cat)
#seasons 3 and 4 – season 5 starts April 2015
got3_4cat = TagsAnalyzer(got_all,'2013-03','2015-03','category')
print('\n Seasons 3 and 4')
print(got3_4cat)
#seasons 5 and 6 – season 7 starts July 2017
got5_6cat = TagsAnalyzer(got_all,'2015-07','2017-06','category')
print('\n Seasons 5 and 6')
print(got5_6cat)
#season 7 – season 8 starts April 2019
got7cat = TagsAnalyzer(got_all,'2017-07','2019-03','category')
print('\n Season 7')
print(got7cat)
#season 8 – April 2019 through the end of the data in September 2019
got8cat = TagsAnalyzer(got_all,'2019-04','2019-09','category')
print('\n Season 8')
print(got8cat)
For the first portion, I transform the tuples created above into dataframes so they can be visualized. First, I will do the categories for both GoT and TLoK.
def tuple_to_df(tup):
    #convert the list of (tag, count) tuples into a dataframe; the tags land in column 0 and the counts in column 1
    newdf = pd.DataFrame(list(tup))
    return newdf
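An optional variant (hypothetical; the cells below rely on the default integer column labels, so it is not used here) would name the columns explicitly:
def tuple_to_df_named(tup, column1='tag', column2='count'):
    #same conversion, but with descriptive column names instead of 0 and 1
    return pd.DataFrame(list(tup), columns=[column1, column2])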
gotcat1 = tuple_to_df(got1_2cat)
gotcat2 = tuple_to_df(got3_4cat)
gotcat3 = tuple_to_df(got5_6cat)
gotcat4 = tuple_to_df(got7cat)
gotcat5 = tuple_to_df(got8cat)
korracat1 = tuple_to_df(korra_preKAcat)
korracat2 = tuple_to_df(korra_subKAcat)
korracat3 = tuple_to_df(korra_postKAcat)
figGOT = make_subplots(
rows=2, cols=3,
shared_yaxes=True,
subplot_titles=("Seasons 1–2", "Seasons 3–4", "Seasons 5–6", "Season 7", "Season 8 and Beyond"))
figGOT.add_trace(go.Bar(
y=gotcat1[1],
x=gotcat1[0],
name="Seasons 1–2"),
row=1,
col=1)
figGOT.add_trace(go.Bar(y=gotcat2[1], x=gotcat2[0], name="Seasons 3–4"), row=1, col=2)
figGOT.add_trace(go.Bar(y=gotcat3[1], x=gotcat3[0], name="Seasons 5–6"), row=1, col=3)
figGOT.add_trace(go.Bar(y=gotcat4[1], x=gotcat4[0], name="Season 7"), row=2, col=1)
figGOT.add_trace(go.Bar(y=gotcat5[1], x=gotcat5[0], name="Season 8 and Beyond"), row=2, col=2)
figGOT.update_layout(
title='Game of Thrones Romantic Pairing Trends'
)
figGOT.write_html('images/GoT-Romantic-Pairings.html', auto_open=True)
figTLOK = make_subplots(
rows=1, cols=3,
shared_yaxes=True,
subplot_titles=("Before Korrasami", "Korrasami Subtext", "Post Korrasami"))
figTLOK.add_trace(go.Bar(
y=korracat1[1],
x=korracat1[0],
name='Up To July 2014'),
row=1,
col=1)
figTLOK.add_trace(go.Bar(y=korracat2[1], x=korracat2[0], name='August–November 2014'), row=1, col=2)
figTLOK.add_trace(go.Bar(y=korracat3[1], x=korracat3[0], name='December 2014 and Beyond'), row=1, col=3)
figTLOK.update_layout(
title='The Legend of Korra Romantic Pairing Trends'
)
figTLOK.write_html('images/TLoK-Romantic-Pairings.html', auto_open=True)
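To also save static copies of these charts (optional; this assumes the kaleido package is installed, which the notebook does not otherwise require):
#write_image needs a static-image engine such as kaleido
figGOT.write_image('images/GoT-Romantic-Pairings.png')
figTLOK.write_image('images/TLoK-Romantic-Pairings.png')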