wfm - Generate a word frequency matrix by grouping variable(s).
wfdf - Generate a word frequency data frame by grouping variable.
wfm_expanded - Expand a word frequency matrix to have multiple rows for each word.
wfm_combine - Combines words (rows) of a word frequency matrix (wfm) or word frequency data frame (wfdf) together.
weight - Weight a word frequency matrix for analysis where such weighting is sensible.
weight.wfdf - Weight a word frequency matrix for analysis where such weighting is sensible.
as.wfm - Attempts to coerce a matrix to a wfm.
wfm(text.var = NULL, grouping.var = NULL, output = "raw", stopwords = NULL, char2space = "~~", ...)

## S3 method for class 'wfdf'
wfm(text.var = NULL, grouping.var = NULL, output = "raw", stopwords = NULL, char2space = "~~", ...)

## S3 method for class 'character'
wfm(text.var = NULL, grouping.var = NULL, output = "raw", stopwords = NULL, char2space = "~~", ...)

## S3 method for class 'factor'
wfm(text.var = NULL, grouping.var = NULL, output = "raw", stopwords = NULL, char2space = "~~", ...)

wfdf(text.var, grouping.var = NULL, stopwords = NULL, margins = FALSE, output = "raw", digits = 2, char2space = "~~", ...)

wfm_expanded(text.var, grouping.var = NULL, ...)

wfm_combine(wf.obj, word.lists, matrix = TRUE)

## S3 method for class 'wfm'
weight(x, type = "prop", ...)

## S3 method for class 'wfdf'
weight(x, type = "prop", ...)

as.wfm(x, ...)

## S3 method for class 'matrix'
as.wfm(x, ...)

## Default S3 method:
as.wfm(x, ...)

## S3 method for class 'TermDocumentMatrix'
as.wfm(x, ...)

## S3 method for class 'DocumentTermMatrix'
as.wfm(x, ...)

## S3 method for class 'data.frame'
as.wfm(x, ...)

## S3 method for class 'wfdf'
as.wfm(x, ...)

## S3 method for class 'Corpus'
as.wfm(x, col = "docs", row = "text", ...)

## S3 method for class 'Corpus'
wfm(text.var, ...)
Arguments
text.var: The text variable.
grouping.var: The grouping variables. Default NULL generates one word list for all text. Also takes a single grouping variable or a list of 1 or more grouping variables.
output: Output type (either "raw", "proportion" or "percent"). The default is "raw".
stopwords: A vector of stop words to remove.
char2space: A vector of characters to be turned into spaces. If char.keep is NULL, char2space will activate this argument.
margins: logical. If TRUE provides grouping.var and word variable totals.
digits: An integer indicating the number of decimal places (round) or significant digits (signif) to be used. Negative values are allowed.
wf.obj: A wfm or wfdf object.
word.lists: A list of character vectors of words to pass to wfm_combine.
matrix: logical. If TRUE returns the output as a wfm rather than a wfdf object.
x: An object with words for row names and integer values.
type: The type of weighting to use: c("prop", "max", "scaled"). All weight by column. "prop" uses a proportion weighting and all columns sum to 1. "max" weights in proportion to the max value; all values are integers and column sums may not be equal. "scaled" uses scale to scale with center = FALSE; output is not integer and column sums may not be equal.
col: The column name (generally not used).
row: The row name (generally not used).
...: Other arguments supplied to Corpus or TermDocumentMatrix. If as.wfm this is other arguments passed to as.wfm methods (currently ignored).
Returns
wfm - returns a word frequency matrix of the class matrix.
wfdf - returns a word frequency data frame of the class data.frame with a words column and optional margin sums.
wfm_expanded - returns a matrix similar to a word frequency matrix (wfm) but the rows are expanded to represent the maximum usages of the word and cells are dummy coded to indicate that number of uses.
wfm_combine - returns a word frequency matrix (wfm) or dataframe (wfdf) with counts for the combined word.lists merged and remaining terms (else).
weight - Returns a weighted matrix for use with other R packages. The output is not of the class "wfm".
as.wfm - Returns a matrix of the class "wfm".
Note
Multiple words can be kept together as a single word/entry by inserting a double tilde ("~~"), or another character string passed to char2space, between them. This is useful for keeping proper names as a single unit.
Examples
## Not run:
## NOTE: these examples are marked "Not run". They rely on objects that are
## not defined in this file (e.g. DATA, pres_debates2012, rajSPLIT,
## Top25Words, Top200Words) and on the packages loaded below.

## word frequency matrix (wfm) example:
with(DATA, wfm(state, list(sex, adult)))[1:15, ]
with(DATA, wfm(state, person))[1:15, ]
Filter(with(DATA, wfm(state, list(sex, adult))), 5)
with(DATA, wfm(state, list(sex, adult)))

## Filter particular words based on max/min values in wfm
v <- with(DATA, wfm(state, list(sex, adult)))
Filter(v, 5)
Filter(v, 5, count.apostrophe = FALSE)
Filter(v, 5, 7)
Filter(v, 4, 4)
Filter(v, 3, 4)
Filter(v, 3, 4, stopwords = Top25Words)

## insert double tilde ("~~") to keep phrases (i.e., first last name)
alts <- c(" fun", "I ")
state2 <- space_fill(DATA$state, alts, rm.extra = FALSE)
with(DATA, wfm(state2, list(sex, adult)))[1:18, ]

## word frequency dataframe (wfdf) example:
with(DATA, wfdf(state, list(sex, adult)))[1:15, ]
with(DATA, wfdf(state, person))[1:15, ]

## wfm_expanded example:
z <- wfm(DATA$state, DATA$person)
wfm_expanded(z)[30:45, ] # two "you"s

## wfm_combine examples:
#===================
## raw no margins (will work)
x <- wfm(DATA$state, DATA$person)

## raw with margin (will work)
y <- wfdf(DATA$state, DATA$person, margins = TRUE)

## Proportion matrix
z2 <- wfm(DATA$state, DATA$person, output = "proportion")

WL1 <- c(y[, 1])
WL2 <- list(c("read", "the", "a"), c("you", "your", "you're"))
WL3 <- list(bob = c("read", "the", "a"), yous = c("you", "your", "you're"))
WL4 <- list(
  bob = c("read", "the", "a"),
  yous = c("a", "you", "your", "your're")
)
WL5 <- list(yous = c("you", "your", "your're"))
WL6 <- list(c("you", "your", "your're")) # no name so will be called words 1
WL7 <- c("you", "your", "your're")

wfm_combine(z2, WL2) # Won't work not a raw frequency matrix
wfm_combine(x, WL2)  # Works (raw and no margins)
wfm_combine(y, WL2)  # Works (raw with margins)
wfm_combine(y, c("you", "your", "your're"))
wfm_combine(y, WL1)
wfm_combine(y, WL3)
## wfm_combine(y, WL4) #Error
wfm_combine(y, WL5)
wfm_combine(y, WL6)
wfm_combine(y, WL7)

worlis <- c("you", "it", "it's", "no", "not", "we")
y <- wfdf(DATA$state, list(DATA$sex, DATA$adult), margins = TRUE)
z <- wfm_combine(y, worlis)

chisq.test(z)
chisq.test(wfm(y))

## Dendrogram
presdeb <- with(pres_debates2012, wfm(dialogue, list(person, time)))
library(sjPlot)
sjc.dend(t(presdeb), 2:4)

## Words correlated within turns of talk
## EXAMPLE 1
library(qdapTools)
x <- factor(with(rajSPLIT, paste(act, pad(TOT(tot)), sep = "|")))
dat <- wfm(rajSPLIT$dialogue, x)

# Transpose so that words become columns before correlating them.
cor(t(dat)[, c("romeo", "juliet")])
cor(t(dat)[, c("romeo", "banished")])
cor(t(dat)[, c("romeo", "juliet", "hate", "love")])
qheat(
  cor(t(dat)[, c("romeo", "juliet", "hate", "love")]),
  diag.na = TRUE, values = TRUE, digits = 3, by.column = NULL
)

dat2 <- wfm(DATA$state, id(DATA))
qheat(
  cor(t(dat2)),
  low = "yellow", high = "red", grid = "grey90",
  diag.na = TRUE, by.column = NULL
)

## EXAMPLE 2
x2 <- factor(with(pres_debates2012, paste(time, pad(TOT(tot)), sep = "|")))
dat2 <- wfm(pres_debates2012$dialogue, x2)
wrds <- word_list(
  pres_debates2012$dialogue,
  stopwords = c("it's", "that's", Top200Words)
)
wrds2 <- tolower(sort(wrds$rfswl[[1]][, 1]))
qheat(
  word_cor(t(dat2), word = wrds2, r = NULL),
  diag.na = TRUE, values = TRUE, digits = 3, by.column = NULL,
  high = "red", low = "yellow", grid = NULL
)

## EXAMPLE 3
library(gridExtra)
library(ggplot2)
library(grid)

# One wfm per candidate; reuses x2 and wrds2 from EXAMPLE 2.
dat3 <- lapply(qcv(OBAMA, ROMNEY), function(x) {
  with(pres_debates2012, wfm(dialogue[person == x], x2[person == x]))
})

# Presidential debates by person
dat5 <- pres_debates2012
dat5 <- dat5[dat5$person %in% qcv(ROMNEY, OBAMA), ]

disp <- with(
  dat5,
  dispersion_plot(
    dialogue, wrds2,
    grouping.var = person, total.color = NULL, rm.vars = time
  )
)

cors <- lapply(dat3, function(m) {
  word_cor(t(m), word = wrds2, r = NULL)
})

plots <- lapply(cors, function(x) {
  qheat(
    x,
    diag.na = TRUE, values = TRUE, digits = 3, plot = FALSE,
    by.column = NULL, high = "red", low = "yellow", grid = NULL
  )
})

plots <- lapply(1:2, function(i) {
  plots[[i]] +
    ggtitle(qcv(OBAMA, ROMNEY)[i]) +
    theme(
      axis.title.x = element_blank(),
      plot.margin = unit(rep(0, 4), "lines")
    )
})

grid.arrange(disp, arrangeGrob(plots[[1]], plots[[2]], ncol = 1), ncol = 2)

## With `word_cor`
worlis <- list(
  pronouns = c("you", "it", "it's", "we", "i'm", "i"),
  negative = qcv(no, dumb, distrust, not, stinks),
  literacy = qcv(computer, talking, telling)
)
y <- wfdf(DATA$state, qdapTools::id(DATA, prefix = TRUE))
z <- wfm_combine(y, worlis)

word_cor(t(z), word = names(worlis), r = NULL)

## Plotting method
plot(y, TRUE)
plot(z)

## Correspondence Analysis
library(ca)

dat <- pres_debates2012
dat <- dat[dat$person %in% qcv(ROMNEY, OBAMA), ]

speech <- stemmer(dat$dialogue)
mytable1 <- with(dat, wfm(speech, list(person, time), stopwords = Top25Words))

fit <- ca(mytable1)
summary(fit)
plot(fit)
plot3d.ca(fit, labels = 1)

mytable2 <- with(dat, wfm(speech, list(person, time), stopwords = Top200Words))

fit2 <- ca(mytable2)
summary(fit2)
plot(fit2)
plot3d.ca(fit2, labels = 1)

## Weight a wfm
WFM <- with(DATA, wfm(state, list(sex, adult)))
plot(weight(WFM, "scaled"), TRUE)
weight(WFM, "prop")
weight(WFM, "max")
weight(WFM, "scaled")
## End(Not run)