Dickens’s Idiomatic Imagination
APPENDIX C

Full Code Used for Data Comparisons

# Main Script for Identifying the ngrams (aka idioms)

Sys.setenv(
  RETICULATE_PYTHON =
    "/Library/Frameworks/Python.framework/Versions/3.7/bin/python3"
)

library(tidyverse)
library(readxl)
library(cleanNLP)

cleanNLP::cnlp_init_spacy()

idioms <- sort(read_excel("More.idioms.xlsx")$idioms)

novels <- read_excel(path = "CAPUANO_CORPUS.xlsx") %>%
  filter(Capuano == 1) %>%
  mutate(path = gsub(":", "_", `#filename`))

get_all_ngrams <- function(text_string, ngrams){
  message("CHECKING FOR NGRAMS")
  # pad each idiom with a single space on either side so the
  # regex only matches at word boundaries
  padded <- paste(" ", ngrams, " ", sep = "")
  x <- trimws(
    unlist(
      str_match_all(ngram::preprocess(text_string), padded)
    )
  )
  return(x)
}
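A quick check on toy input (not part of the corpus pipeline) shows what the function returns; ngram::preprocess lowercases the string before the padded patterns are matched:

get_all_ngrams("It was the best of times and the worst of times indeed",
               c("best of times", "worst of times"))
# [1] "best of times"  "worst of times"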

idioms_in_books <- tibble()

# clean_manuscript() and get_annotation() are project helper
# functions defined outside this script
for(i in 1:nrow(novels)){
  raw_ms <- readtext::readtext(novels$path[i])$text
  normalized_text <- tolower(clean_manuscript(raw_ms, "British English"))
  annotation <- get_annotation(tibble::tibble(doc_id = novels$id[i], text = normalized_text))
  just_words <- filter(annotation$token, upos != "PUNCT")
  total_words <- nrow(just_words)
  found_idioms <- get_all_ngrams(normalized_text, idioms)
  new_rows <- bind_cols(author = novels$author_sort[i], title = novels$title[i],
                        word_count = total_words, idioms = found_idioms)
  idioms_in_books <- bind_rows(idioms_in_books, new_rows)
  cat(i, "\n")  # progress counter
}
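The resulting idioms_in_books table holds one row per matched idiom occurrence, with each book's author, title, and total word count repeated on every row.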

# adjust spacing around apostrophes in the matched idioms
idioms_in_books$idioms <- gsub("' ", " ' ", idioms_in_books$idioms)

write_csv(idioms_in_books, "idioms_in_books.csv")

x <- group_by(idioms_in_books, author, title, idioms) %>%
  summarise(count = n(), word_count = first(word_count)) %>%
  mutate(count_per_100k = 100000 * (count / word_count)) %>%
  select(-count, -word_count) %>%
  ungroup() %>%
  distinct()
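For example, an idiom matched four times in an 80,000-word novel scores 100,000 × (4 / 80,000) = 5 occurrences per 100,000 words, putting novels of different lengths on a comparable scale.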

wide1 <- pivot_wider(x, names_from = idioms, values_from = count_per_100k, values_fill = 0)

write_csv(wide1, "book_x_idiom_frequencies.csv")
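pivot_wider reshapes the long table (one row per book-idiom pair) into a book-by-idiom matrix, filling in 0 for idioms a book never uses. A toy illustration with made-up values:

toy <- tibble::tribble(
  ~title,        ~idioms,              ~count_per_100k,
  "Bleak House", "shoulder the wheel", 2.1,
  "Bleak House", "brought up by hand", 0.7,
  "Hard Times",  "shoulder the wheel", 1.4
)
pivot_wider(toy, names_from = idioms, values_from = count_per_100k, values_fill = 0)
#   title       `shoulder the wheel` `brought up by hand`
# 1 Bleak House                  2.1                  0.7
# 2 Hard Times                   1.4                  0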

y <- group_by(idioms_in_books, author, title, idioms) %>%
  summarise(count = n(), word_count = first(word_count)) %>%
  ungroup() %>%
  distinct() %>%
  select(-word_count)

wide2 <- pivot_wider(y, names_from = idioms, values_from = count, values_fill = 0)

write_csv(wide2, "book_x_idiom_count.csv")

# Post-processing code for merging the data into one file with metadata
# (assumes the same session as above, with tidyverse and readxl loaded)

novels <- read_excel(path = "CAPUANO_CORPUS.xlsx") %>%
  filter(Capuano == 1) %>%
  mutate(path = gsub(":", "_", `#filename`))


counts <- read_csv("book_x_idiom_count.csv") %>%
  full_join(novels, by = "title") %>%
  select(-id, -Capuano, -`#filename`, -`#formats`, -`path`, -authors) %>%
  relocate(`#pubyear`, .before = author) %>%
  relocate(author_sort, .before = title) %>%
  mutate(idiom_count = rowSums(select(., -`#pubyear`, -author, -title, -author_sort) != 0)) %>%
  select(-author) %>%
  relocate(idiom_count, .after = title) %>%
  replace(is.na(.), 0) %>%
  select(title, idiom_count)
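The rowSums(... != 0) step converts each book's row of per-idiom counts into the number of distinct idioms that book uses at least once; a row of counts c(3, 0, 1), for instance, gives rowSums(matrix(c(3, 0, 1), nrow = 1) != 0) = 2.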


freqs <- read_csv("book_x_idiom_frequencies.csv") %>%
  full_join(novels, by = "title") %>%
  select(-id, -Capuano, -`#filename`, -`#formats`, -`path`, -authors) %>%
  relocate(`#pubyear`, .before = author) %>%
  relocate(author_sort, .before = title) %>%
  mutate(idiom_sum = rowSums(select(., -`#pubyear`, -author, -title, -author_sort))) %>%
  select(-author) %>%
  relocate(idiom_sum, .after = title) %>%
  replace(is.na(.), 0) %>%
  full_join(counts, by = "title") %>%
  relocate(idiom_count, .after = idiom_sum)

write_csv(freqs, "book_x_idiom_freqs_with_dates.csv")
