Title: | Building a Concordance of Terms in a Series of Texts |
---|---|
Description: | Compute the frequency distribution of a search term in a series of texts. For example, Arthur Conan Doyle wrote a total of 60 Sherlock Holmes stories, comprising 56 short stories and 4 longer novels. I wanted to test my own subjective impression that, in many of the stories, Sherlock Holmes' popularity was used as bait to induce the reader to read a story that is not primarily a Sherlock Holmes story. I used the term "Holmes" as a search pattern, since Watson would frequently address him by name, or use his name to describe something that he was doing. My hypothesis is that the frequency distribution of the search pattern "Holmes" is a good proxy for the degree to which a story is or is not truly a Sherlock Holmes story. The results are presented in a manuscript that is available as a vignette and online at <https://barryzee.github.io/Concordance/index.html>. |
Authors: | Barry Zeeberg [aut, cre] |
Maintainer: | Barry Zeeberg <[email protected]> |
License: | GPL (>= 2) |
Version: | 1.0.1 |
Built: | 2024-11-02 03:57:34 UTC |
Source: | https://github.com/cran/SherlockHolmes |
frequencies plotted in order of date (if the titles are given in order of date)
chronology(titles.vec, patterns, starts, freqs, chronDir, overlay = FALSE)
chronology(titles.vec, patterns, starts, freqs, chronDir, overlay = FALSE)
titles.vec |
character vector containing the titles of the stories |
patterns |
vector of character string query patterns |
starts |
integer vector of starting positions |
freqs |
return value of frequency() |
chronDir |
character string full path name for output directory |
overlay |
Boolean if TRUE overlay the chronology for multiple search patterns |
returns no value, but has side effect generating graph
freqDir<-tempdir() chronDir<-sprintf("%s/chronology",freqDir) dir.create(chronDir) dir.create(sprintf("%s/plots",chronDir)) dir.create(sprintf("%s/archive",chronDir)) print(chronDir) chr<-chronology(titles.vec,c("Holmes","Watson"),starts,freqs,chronDir)
freqDir<-tempdir() chronDir<-sprintf("%s/chronology",freqDir) dir.create(chronDir) dir.create(sprintf("%s/plots",chronDir)) dir.create(sprintf("%s/archive",chronDir)) print(chronDir) chr<-chronology(titles.vec,c("Holmes","Watson"),starts,freqs,chronDir)
graphical indicator of search patterns within stories
coChronology(titles.vec, patterns, starts, freqs, chronDir)
coChronology(titles.vec, patterns, starts, freqs, chronDir)
titles.vec |
character vector containing the titles of the stories |
patterns |
vector of character string query patterns |
starts |
integer vector of starting positions |
freqs |
return value of frequency() |
chronDir |
character string full path name for output directory |
returns an integer matrix whose rows are search patterns and columns are stories, value of 1 indicates the presence of the corresponding search pattern in the corresponding story
freqDir<-tempdir() chronDir<-sprintf("%s/chronology",freqDir) dir.create(chronDir) dir.create(sprintf("%s/plots",chronDir)) dir.create(sprintf("%s/archive",chronDir)) print(chronDir) coch<-coChronology(titles.vec,c("Holmes","Watson"),starts,freqs,chronDir)
freqDir<-tempdir() chronDir<-sprintf("%s/chronology",freqDir) dir.create(chronDir) dir.create(sprintf("%s/plots",chronDir)) dir.create(sprintf("%s/archive",chronDir)) print(chronDir) coch<-coChronology(titles.vec,c("Holmes","Watson"),starts,freqs,chronDir)
retrieve words that are close to occurrences of pattern
concordance(freqs, titles.vec, texts.vec, starts, window, odir)
concordance(freqs, titles.vec, texts.vec, starts, window, odir)
freqs |
return value of frequency() |
titles.vec |
character vector containing the titles of the stories |
texts.vec |
character vector of entire text |
starts |
integer vector of starting positions |
window |
integer number of lines to take before and after the pattern match |
odir |
character string containing the full path name for the output directory |
returns no value but has side effect of generating graphs
con<-concordance(freqs,titles.vec[3],texts.vec,starts,window=2,odir=tempdir())
con<-concordance(freqs,titles.vec[3],texts.vec,starts,window=2,odir=tempdir())
compute chisq value for a 2 x 2 contingency table
contingency(inside, outside)
contingency(inside, outside)
inside |
numeric vector of raw counts |
outside |
numeric vector of raw counts |
numeric vector of chisq.test() p.values
con<-contingency(inside=c(4,5),outside=c(20,7))
con<-contingency(inside=c(4,5),outside=c(20,7))
compute distribution of ratio of number of occurrences of query string divided by total number of words
distributions(freqs, titles.vec, minl, P, odir)
distributions(freqs, titles.vec, minl, P, odir)
freqs |
return value of frequency() |
titles.vec |
character vector containing the titles of the stories |
minl |
is an integer param passed to dpseg::dpseg |
P |
is a numeric param passed to dpseg::dpseg |
odir |
character string containing the full path name for the output directory |
returns no value but has side effect of generating graphs
dis<-distributions(freqs,titles.vec[1],minl=100,P=0.00001,tempdir())
dis<-distributions(freqs,titles.vec[1],minl=100,P=0.00001,tempdir())
histogram of frequencies
freqHist(patterns, starts, titles.vec, freqs, histDir)
freqHist(patterns, starts, titles.vec, freqs, histDir)
patterns |
vector of character string query patterns |
starts |
integer vector of starting positions |
titles.vec |
character vector containing the titles of the stories |
freqs |
return value of frequency() |
histDir |
character string full path name for output directory |
returns no value, but has side effect generating histogram
fh<-freqHist(patterns,starts,titles.vec,freqs,histDir=tempdir())
fh<-freqHist(patterns,starts,titles.vec,freqs,histDir=tempdir())
compute ratio of number of occurrences of query string divided by total number of words
frequency(texts.vec, starts, patterns)
frequency(texts.vec, starts, patterns)
texts.vec |
character vector of entire text |
starts |
integer vector of starting positions |
patterns |
vector of character string query patterns |
a list whose components are sub-lists
# indexed by the titles of the stories
start integer starting line in text
end integer ending line in text
wPerLine integer words per line
wordSum integer sum of wPerLine
patterns a sub-list
pPerLine integer patterns per line
patSum integer total of pPerLine
fraction numeric ratio of patSum/wordSum
fr<-frequency(texts.vec,starts,patterns)
fr<-frequency(texts.vec,starts,patterns)
capture all of the parameter names and values passed in
grabFunctionParameters()
grabFunctionParameters()
copied and pasted from https://stackoverflow.com/questions/66329835/using-r-how-to-get-all-parameters-passed-into-a-function-with-their-values
a list whose components are the symbolic names of the function parameters, and their values.
frequencies plotted in order of story length
lengths(titles.vec, patterns, starts, freqs, lengthDir)
lengths(titles.vec, patterns, starts, freqs, lengthDir)
titles.vec |
character vector containing the titles of the stories |
patterns |
vector of character string query patterns |
starts |
integer vector of starting positions |
freqs |
return value of frequency() |
lengthDir |
character string full path name for output directory |
returns no value, but has side effect generating graph
freqDir<-tempdir() lengthDir<-sprintf("%s/length",freqDir) dir.create(lengthDir) print(lengthDir) dir.create(sprintf("%s/plots",lengthDir)) dir.create(sprintf("%s/archive",lengthDir)) le<-lengths(titles.vec,patterns,starts,freqs,lengthDir)
freqDir<-tempdir() lengthDir<-sprintf("%s/length",freqDir) dir.create(lengthDir) print(lengthDir) dir.create(sprintf("%s/plots",lengthDir)) dir.create(sprintf("%s/archive",lengthDir)) le<-lengths(titles.vec,patterns,starts,freqs,lengthDir)
merge (inner join) the results in 2 tables generated from 2 vectors
mergeTables(tv, tw, cnv, cnw)
mergeTables(tv, tw, cnv, cnw)
tv |
first table |
tw |
second table |
cnv |
character name for column coming from v |
cnw |
character name for column coming from w |
numeric matrix generated from merging tables from v and w
mt<-mergeTables(inside,outside,"in","out")[1:10,]
mt<-mergeTables(inside,outside,"in","out")[1:10,]
Alternative plot procedure for dpseg, a special function provided personally by the dpseg curator. I made a few custom tweaks, including an option to overlay multiple plots.
plot_dpseg2( x, delog = FALSE, col, main, xlab, ylab, res = 10, vlines, overlay, textX, textY, textLabel, ylim )
plot_dpseg2( x, delog = FALSE, col, main, xlab, ylab, res = 10, vlines, overlay, textX, textY, textLabel, ylim )
x |
dpseg object to plot |
delog |
Boolean use log scale if TRUE |
col |
color |
main |
character title of graph |
xlab |
character label for x axis |
ylab |
character label for y axis |
res |
numeric resolution |
vlines |
Boolean if FALSE suppress vertical lines in graph |
overlay |
Boolean if TRUE this plot is an overlay of previous plot |
textX |
numeric x position for text box |
textY |
numeric y position for text box |
textLabel |
character string to label the points in the graph |
ylim |
numeric vector ylim for plot |
returns no value but has side effect of producing a graph
pdp<-plot_dpseg2(segs,overlay=FALSE,xlab="xaxis", ylab="yaxis",vlines=FALSE,textX=2000,textY=20, textLabel="label",ylim=c(0,60))
pdp<-plot_dpseg2(segs,overlay=FALSE,xlab="xaxis", ylab="yaxis",vlines=FALSE,textX=2000,textY=20, textLabel="label",ylim=c(0,60))
read and edit titles to remove blank lines and white space
readTitles(titles)
readTitles(titles)
titles |
is a character string containing the full path name for a text file containing the titles of the stories in the same order that they appear in the texts file |
a character vector of titles
titles<-system.file("extdata/contents3.txt",package="SherlockHolmes") rt<-readTitles(titles)
titles<-system.file("extdata/contents3.txt",package="SherlockHolmes") rt<-readTitles(titles)
This function retrieves intercept, slope, r.squared, and adj.r.squared from lm()
retrieveLmStats(x, y)
retrieveLmStats(x, y)
x |
is second argument to lm() |
y |
is first argument to lm() |
returns a list containing the return value of lm, intercept, slope, r.squared, and adj.r.squared
retr<-retrieveLmStats(1:10,runif(10,0,1))
retr<-retrieveLmStats(1:10,runif(10,0,1))
compute rolling average of ratio of number of occurrences of query string divided by total number of words
rolling(freqs, titles.vec, windowPct = 0.1, odir, verbose)
rolling(freqs, titles.vec, windowPct = 0.1, odir, verbose)
freqs |
return value of frequency() |
titles.vec |
character vector containing the titles of the stories |
windowPct |
a numeric that controls the size of the plot window |
odir |
character string containing the full path name for the output directory |
verbose |
Boolean if TRUE print informative or diagnostic messages to console |
returns no value, but has side effect of generating graphs
rol<-rolling(freqs,titles.vec,windowPct=0.10,odir=tempdir(),verbose=FALSE)
rol<-rolling(freqs,titles.vec,windowPct=0.10,odir=tempdir(),verbose=FALSE)
reformat segs$segments as a legend to insert into segment plot
segments(segs)
segments(segs)
segs |
return value of dpseg::dpseg() |
reformatted matrix suitable for printing
seg<-segments(segs)
seg<-segments(segs)
This function is the driver that organizes the computation of concordances in Sherlock Holmes stories
Sherlock( titles = "NONE", texts, patterns, toupper, odir, concord = FALSE, minl = 100, P = 1e-05, verbose = FALSE )
Sherlock( titles = "NONE", texts, patterns, toupper, odir, concord = FALSE, minl = 100, P = 1e-05, verbose = FALSE )
titles |
is a character string containing the full path name for a text file containing the titles of the stories in the same order that they appear in the texts file. If titles=="NONE", treat the entire book as one story. |
texts |
is a character string containing the full path name for a text file containing the full texts of all of the stories |
patterns |
is a vector containing the search patterns |
toupper |
is a Boolean TRUE if the titles should be converted to upper case |
odir |
is a character string containing the full path name of the output directory |
concord |
Boolean if TRUE invoke concordance() |
minl |
is an integer param passed to dpseg::dpseg |
P |
is a numeric param passed to dpseg::dpseg |
verbose |
Boolean if TRUE print informative or diagnostic messages to console |
returns no value but has side effect of driving the concordance computations
titles<-system.file("extdata/contents3.txt",package="SherlockHolmes") texts<-system.file("extdata/processed_download3.txt",package="SherlockHolmes") SH<-Sherlock(titles=titles,texts=texts,patterns=patterns[1], toupper=TRUE,odir=tempdir(),concord=FALSE,minl=100,P=0.00001, verbose=FALSE)
titles<-system.file("extdata/contents3.txt",package="SherlockHolmes") texts<-system.file("extdata/processed_download3.txt",package="SherlockHolmes") SH<-Sherlock(titles=titles,texts=texts,patterns=patterns[1], toupper=TRUE,odir=tempdir(),concord=FALSE,minl=100,P=0.00001, verbose=FALSE)
where does each story start?
startLine(titles.vec, texts.vec, toupper)
startLine(titles.vec, texts.vec, toupper)
titles.vec |
is a character vector containing the titles of the stories in the same order that they appear in the texts |
texts.vec |
is a character vector containing the lines of the full texts of all of the stories |
toupper |
is a Boolean TRUE if the titles should be converted to upper case |
each title in titles.vec must appear on a single line in titles.vec and texts.vec - a title cannot be split across multiple lines. each title must only appear one time within titles.vec and texts.vec
an integer vector of the starting lines of each story
sl<-startLine(titles.vec,texts.vec,toupper=TRUE)
sl<-startLine(titles.vec,texts.vec,toupper=TRUE)
use strsplit to parse words from text t, delete the empty string from the result, and compile into a sorted table of word frequencies
strSplitTab(t)
strSplitTab(t)
t |
vector of character strings representing lines of the original text |
a sorted table of raw word counts
sst<-strSplitTab(texts.vec)
sst<-strSplitTab(texts.vec)