If I remember correctly, you can construct a TermDocumentMatrix of bigrams (pairs of adjacent words) using the RWeka package, and then process them as needed
library("tm")    # text mining
library("RWeka") # tokenization algorithms more complicated than single-word

# Tokenizer that emits bigrams only (min = max = 2 words per token).
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

# NOTE(review): `corpus` must be a tm corpus built earlier — not shown here.
tdm <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))

# Process tdm as needed, e.g.:
# findFreqTerms(tdm, lowfreq = 3, highfreq = Inf)
# ...

# Drop bigrams absent from more than 99% of documents.
tdm <- removeSparseTerms(tdm, 0.99)

print("----")
print("tdm properties")
str(tdm)

# How many terms make up the top N percent of the matrix rows.
# `topN_percentage_wanted` must be defined by the caller (e.g. 10 for top 10%).
tdm_top_N_percent <- tdm$nrow / 100 * topN_percentage_wanted
Alternatively,
# Token lengths for the tokenizer: Weka_control(min, max) bounds the n-gram
# *size* in words, so this yields every 1- to 5-word combination of adjacent
# words (not occurrence counts).
wmin <- 1
wmax <- 5
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = wmin, max = wmax))
Sometimes it helps to perform word stemming first in order to get "better" word groups (e.g. so "run" and "running" collapse to the same term).