Dickens’s Idiomatic Imagination
APPENDIX C

Full Code Used for Data Comparisons

# Main Script for Identifying the ngrams (aka idioms)

Sys.setenv(
  RETICULATE_PYTHON =
    "/Library/Frameworks/Python.framework/Versions/3.7/bin/python3"
)

library(tidyverse)
library(readxl)
library(cleanNLP)

cleanNLP::cnlp_init_spacy()

idioms <- sort(read_excel("More.idioms.xlsx")$idioms)

novels <- read_excel(path = "CAPUANO_CORPUS.xlsx") %>%
  filter(Capuano == 1) %>%
  mutate(path = gsub(":", "_", `#filename`))

get_all_ngrams <- function(text_string, ngrams){
  message("CHECKING FOR NGRAMS")
  # pad each idiom with a single space on either side so the
  # regex only matches at word boundaries
  padded <- paste(" ", ngrams, " ", sep = "")
  x <- trimws(
    unlist(
      str_match_all(ngram::preprocess(text_string), padded)
    )
  )
  return(x)
}
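A quick check on toy input (not part of the corpus pipeline) shows what the function returns; ngram::preprocess lowercases the string before the padded patterns are matched:

get_all_ngrams("It was the best of times and the worst of times indeed",
               c("best of times", "worst of times"))
# [1] "best of times"  "worst of times"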

idioms_in_books <- tibble()

# clean_manuscript() and get_annotation() are project helper
# functions defined outside this script
for(i in 1:nrow(novels)){
  raw_ms <- readtext::readtext(novels$path[i])$text
  normalized_text <- tolower(clean_manuscript(raw_ms, "British English"))
  annotation <- get_annotation(tibble::tibble(doc_id = novels$id[i], text = normalized_text))
  just_words <- filter(annotation$token, upos != "PUNCT")
  total_words <- nrow(just_words)
  found_idioms <- get_all_ngrams(normalized_text, idioms)
  new_rows <- bind_cols(author = novels$author_sort[i], title = novels$title[i],
                        word_count = total_words, idioms = found_idioms)
  idioms_in_books <- bind_rows(idioms_in_books, new_rows)
  cat(i, "\n")  # progress counter
}
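The resulting idioms_in_books table holds one row per matched idiom occurrence, with each book's author, title, and total word count repeated on every row.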

# adjust spacing around apostrophes in the matched idioms
idioms_in_books$idioms <- gsub("' ", " ' ", idioms_in_books$idioms)

write_csv(idioms_in_books, "idioms_in_books.csv")

x <- group_by(idioms_in_books, author, title, idioms) %>%
  summarise(count = n(), word_count = first(word_count)) %>%
  mutate(count_per_100k = 100000 * (count / word_count)) %>%
  select(-count, -word_count) %>%
  ungroup() %>%
  distinct()
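For example, an idiom matched four times in an 80,000-word novel scores 100,000 × (4 / 80,000) = 5 occurrences per 100,000 words, putting novels of different lengths on a comparable scale.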

wide1 <- pivot_wider(x, names_from = idioms, values_from = count_per_100k, values_fill = 0)

write_csv(wide1, "book_x_idiom_frequencies.csv")
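pivot_wider reshapes the long table (one row per book-idiom pair) into a book-by-idiom matrix, filling in 0 for idioms a book never uses. A toy illustration with made-up values:

toy <- tibble::tribble(
  ~title,        ~idioms,              ~count_per_100k,
  "Bleak House", "shoulder the wheel", 2.1,
  "Bleak House", "brought up by hand", 0.7,
  "Hard Times",  "shoulder the wheel", 1.4
)
pivot_wider(toy, names_from = idioms, values_from = count_per_100k, values_fill = 0)
#   title       `shoulder the wheel` `brought up by hand`
# 1 Bleak House                  2.1                  0.7
# 2 Hard Times                   1.4                  0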

y <- group_by(idioms_in_books, author, title, idioms) %>%
  summarise(count = n(), word_count = first(word_count)) %>%
  ungroup() %>%
  distinct() %>%
  select(-word_count)

wide2 <- pivot_wider(y, names_from = idioms, values_from = count, values_fill = 0)

write_csv(wide2, "book_x_idiom_count.csv")

# Post-processing code for merging the data into one file with metadata
# (assumes the same session as above, with tidyverse and readxl loaded)

novels <- read_excel(path = "CAPUANO_CORPUS.xlsx") %>%
  filter(Capuano == 1) %>%
  mutate(path = gsub(":", "_", `#filename`))


counts <- read_csv("book_x_idiom_count.csv") %>%
  full_join(novels, by = "title") %>%
  select(-id, -Capuano, -`#filename`, -`#formats`, -`path`, -authors) %>%
  relocate(`#pubyear`, .before = author) %>%
  relocate(author_sort, .before = title) %>%
  mutate(idiom_count = rowSums(select(., -`#pubyear`, -author, -title, -author_sort) != 0)) %>%
  select(-author) %>%
  relocate(idiom_count, .after = title) %>%
  replace(is.na(.), 0) %>%
  select(title, idiom_count)
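The rowSums(... != 0) step converts each book's row of per-idiom counts into the number of distinct idioms that book uses at least once; a row of counts c(3, 0, 1), for instance, gives rowSums(matrix(c(3, 0, 1), nrow = 1) != 0) = 2.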


freqs <- read_csv("book_x_idiom_frequencies.csv") %>%
  full_join(novels, by = "title") %>%
  select(-id, -Capuano, -`#filename`, -`#formats`, -`path`, -authors) %>%
  relocate(`#pubyear`, .before = author) %>%
  relocate(author_sort, .before = title) %>%
  mutate(idiom_sum = rowSums(select(., -`#pubyear`, -author, -title, -author_sort))) %>%
  select(-author) %>%
  relocate(idiom_sum, .after = title) %>%
  replace(is.na(.), 0) %>%
  full_join(counts, by = "title") %>%
  relocate(idiom_count, .after = idiom_sum)

write_csv(freqs, "book_x_idiom_freqs_with_dates.csv")
