“Appendix C” in “Dickens’s Idiomatic Imagination”
APPENDIX C
Full Code Used for Data Comparisons
# Main Script for Identifying the ngrams (aka idioms)

# Point reticulate at the Python 3.7 installation that has spaCy installed;
# must be set BEFORE cleanNLP initializes its spaCy backend below.
# (Original used curly typesetting quotes, which are invalid R syntax.)
Sys.setenv(
  RETICULATE_PYTHON =
    "/Library/Frameworks/Python.framework/Versions/3.7/bin/python3"
)
library(tidyverse)
library(readxl)
library(cleanNLP)

# Initialize the spaCy annotation backend (reads RETICULATE_PYTHON).
cleanNLP::cnlp_init_spacy()

# Idiom list: the `idioms` column of the spreadsheet, sorted alphabetically.
idioms <- sort(read_excel("More.idioms.xlsx")$idioms)

# Corpus metadata: keep only rows flagged as part of the Capuano corpus and
# derive a filesystem-safe path (":" is not a legal character in file paths).
novels <- read_excel(path = "CAPUANO_CORPUS.xlsx") %>%
  filter(Capuano == 1) %>%
  mutate(path = gsub(":", "_", `#filename`))
# Find every occurrence of each idiom (ngram) in a text string.
#
# @param text_string A single character string: the full normalized text.
# @param ngrams Character vector of idioms to search for.
# @return Character vector with one element per matched occurrence
#   (duplicates retained so occurrences can be counted downstream).
get_all_ngrams <- function(text_string, ngrams){
  message("CHECKING FOR NGRAMS")
  # Pad each idiom with a single space on each side so the regex only matches
  # whole-word occurrences. BUG FIX: the printed code used
  # paste(" ", ngrams, " ", sep = " "), which pads with TWO spaces per side;
  # ngram::preprocess() normalizes runs of whitespace, so that pattern could
  # never match. Single-space padding restores the stated intent.
  padded <- paste0(" ", ngrams, " ")
  x <- trimws(
    unlist(
      str_match_all(ngram::preprocess(text_string), padded)
    )
  )
  return(x)
}
# Build one row per idiom occurrence per book. Per-book tibbles are collected
# in a preallocated list and bound once at the end (avoids the quadratic cost
# of growing a tibble with bind_rows inside the loop).
idiom_rows <- vector("list", nrow(novels))
# BUG FIX: original iterated 1:lenght(novels) -- "lenght" is a typo (no such
# function), and even length(novels) would count the COLUMNS of the tibble,
# not the books. nrow(novels) iterates over the books.
for (i in seq_len(nrow(novels))) {
  raw_ms <- readtext::readtext(novels$path[i])$text
  # clean_manuscript / get_annotation are defined elsewhere in the project.
  normalized_text <- tolower(clean_manuscript(raw_ms, "British English"))
  annotation <- get_annotation(tibble::tibble(doc_id = novels$id[i], text = normalized_text))
  # Word count excludes punctuation tokens.
  just_words <- filter(annotation$token, !upos == "PUNCT")
  total_words <- nrow(just_words)
  found_idioms <- get_all_ngrams(normalized_text, idioms)
  idiom_rows[[i]] <- bind_cols(
    author = novels$author_sort[i],
    title = novels$title[i],
    word_count = total_words,
    idioms = found_idioms
  )
  cat(i, "\n")  # progress indicator
}
idioms_in_books <- bind_rows(idiom_rows)
# Normalize apostrophe spacing in the matched idioms so spelling variants
# collapse to one form. NOTE(review): the printed appendix shows curly
# (typeset) apostrophes here; plain-text corpora use straight apostrophes,
# so straight quotes are reconstructed -- confirm against the corpus files.
idioms_in_books$idioms <- gsub("' ", " ' ", idioms_in_books$idioms)
write_csv(idioms_in_books, "idioms_in_books.csv")

# Relative frequency of each idiom per book: occurrences per 100k words.
# summarise() keeps word_count alongside the count; distinct() collapses the
# repeated rows that produces.
x <- group_by(idioms_in_books, author, title, idioms) %>%
  summarise(count = n(), word_count) %>%
  mutate(count_per_100k = (100000 * (count / word_count))) %>%
  select(-count, -word_count) %>%
  ungroup() %>%
  distinct()
# One row per book, one column per idiom, 0 where an idiom never appears.
wide1 <- pivot_wider(x, names_from = idioms, values_from = count_per_100k, values_fill = 0)
write_csv(wide1, "book_x_idiom_frequencies.csv")
# Raw occurrence counts of each idiom per book (same shape as the frequency
# table above, but unnormalized counts instead of per-100k rates).
y <- group_by(idioms_in_books, author, title, idioms) %>%
  summarise(count = n(), word_count) %>%
  ungroup() %>%
  distinct() %>%
  select(-word_count)
# One row per book, one column per idiom, 0 where an idiom never appears.
wide2 <- pivot_wider(y, names_from = idioms, values_from = count, values_fill = 0)
write_csv(wide2, "book_x_idiom_count.csv")
# Post processing code for merging the data into one file with metadata

# Re-read the corpus metadata (same filter and path fix as the main script)
# so this section can be run standalone from the CSVs written above.
novels <- read_excel(path = "CAPUANO_CORPUS.xlsx") %>%
  filter(Capuano == 1) %>%
  mutate(path = gsub(":", "_", `#filename`))

# Per-book count of DISTINCT idioms used: for each book, count how many
# idiom columns are non-zero.
counts <- read_csv("book_x_idiom_count.csv") %>%
  full_join(novels, by = "title") %>%
  select(-id, -Capuano, -`#filename`, -`#formats`, -`path`, -authors) %>%
  relocate(`#pubyear`, .before = author) %>%
  relocate(author_sort, .before = title) %>%
  # Everything except the metadata columns is an idiom column.
  mutate(idiom_count = rowSums(select(., -`#pubyear`, -author, -title, -author_sort) != 0)) %>%
  select(-author) %>%
  relocate(idiom_count, .after = title) %>%
  replace(is.na(.), 0) %>%
  select(title, idiom_count)

# Per-book total idiom frequency (sum of the per-100k columns), joined with
# the distinct-idiom counts and publication metadata into the final table.
freqs <- read_csv("book_x_idiom_frequencies.csv") %>%
  full_join(novels, by = "title") %>%
  select(-id, -Capuano, -`#filename`, -`#formats`, -`path`, -authors) %>%
  relocate(`#pubyear`, .before = author) %>%
  relocate(author_sort, .before = title) %>%
  mutate(idiom_sum = rowSums(select(., -`#pubyear`, -author, -title, -author_sort))) %>%
  select(-author) %>%
  relocate(idiom_sum, .after = title) %>%
  replace(is.na(.), 0) %>%
  full_join(counts, by = "title") %>%
  relocate(idiom_count, .after = idiom_sum)
write_csv(freqs, "book_x_idiom_freqs_with_dates.csv")
We use cookies to analyze our traffic. Please decide if you are willing to accept cookies from our website. You can change this setting anytime in Privacy Settings.