Question

I have a list of documents that have already been tokenized:

dat <- list(c("texaco", "canada", "lowered", "contract", "price", "pay", 
"crude", "oil", "canadian", "cts", "barrel", "effective", "decrease", 
"brings", "companys", "posted", "price", "benchmark", "grade", 
"edmonton", "swann", "hills", "light", "sweet", "canadian", "dlrs", 
"bbl", "texaco", "canada", "changed", "crude", "oil", "postings", 
"feb", "reuter"), c("argentine", "crude", "oil", "production", 
"pct", "january", "mln", "barrels", "mln", "barrels", "january", 
"yacimientos", "petroliferos", "fiscales", "january", "natural", 
"gas", "output", "totalled", "billion", "cubic", "metrers", "pct", 
"billion", "cubic", "metres", "produced", "january", "yacimientos", 
"petroliferos", "fiscales", "added", "reuter"))

I'm trying to efficiently convert this list of tokens to a list of n-grams. Here's the function I've written so far:

find_ngrams <- function(x, n){

  if (n==1){ return(x)}

  out <- as.list(rep(NA, length(x)))

  for (i in 1:length(x)){
    words <- x[[i]]
    out[[i]] <- words

    for (j in 2:n){

      phrases <- sapply(1:j, function(k){
        words[k:(length(words)-n+k)]
      })

      phrases <- apply(phrases, 1, paste, collapse=" ")

      out[[i]]  <- c(out[[i]], phrases)

    }
  }
  return(out)
}

This works fine for finding n-grams, but it seems inefficient. Replacing the for-loops with *apply functions would still leave me with loops nested three deep:

result <- find_ngrams(dat, 2)
> result[[2]]
 [1] "argentine"                "crude"                    "oil"                     
 [4] "production"               "pct"                      "january"                 
 [7] "mln"                      "barrels"                  "mln"                     
[10] "barrels"                  "january"                  "yacimientos"             
[13] "petroliferos"             "fiscales"                 "january"                 
[16] "natural"                  "gas"                      "output"                  
[19] "totalled"                 "billion"                  "cubic"                   
[22] "metrers"                  "pct"                      "billion"                 
[25] "cubic"                    "metres"                   "produced"                
[28] "january"                  "yacimientos"              "petroliferos"            
[31] "fiscales"                 "added"                    "reuter"                  
[34] "argentine crude"          "crude oil"                "oil production"          
[37] "production pct"           "pct january"              "january mln"             
[40] "mln barrels"              "barrels mln"              "mln barrels"             
[43] "barrels january"          "january yacimientos"      "yacimientos petroliferos"
[46] "petroliferos fiscales"    "fiscales january"         "january natural"         
[49] "natural gas"              "gas output"               "output totalled"         
[52] "totalled billion"         "billion cubic"            "cubic metrers"           
[55] "metrers pct"              "pct billion"              "billion cubic"           
[58] "cubic metres"             "metres produced"          "produced january"        
[61] "january yacimientos"      "yacimientos petroliferos" "petroliferos fiscales"   
[64] "fiscales added"           "added reuter"            

Are there any significant parts of this code that could be vectorized?

/edit: here's an updated version of Matthew Plourde's function that does "up-to-n-grams" and works across the entire list:

find_ngrams_base <- function(x, n) {
  if (n == 1) return(x)
  out <- lapply(1:n, function(n_i) embed(x, n_i))
  out <- sapply(out, function(y) apply(y, 1, function(row) paste(rev(row), collapse=' ')))
  unlist(out)
}

find_ngrams_plourde <- function(x, ...){
  lapply(x, find_ngrams_base, ...)
}
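
For illustration (my own toy example, not taken from the original data), the base version returns all n-grams up to n for a single token vector, with the unigrams first:

find_ngrams_base(c("crude", "oil", "production"), 3)
[1] "crude"                "oil"                  "production"          
[4] "crude oil"            "oil production"       "crude oil production"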

We can benchmark it against the function I wrote and see that it's a bit slower:

library(rbenchmark)
benchmark(
  replications=100,
  a <- find_ngrams(dat, 2),
  b <- find_ngrams(dat, 3),
  c <- find_ngrams(dat, 4),
  d <- find_ngrams(dat, 10),
  w <- find_ngrams_plourde(dat, 2),
  x <- find_ngrams_plourde(dat, 3),
  y <- find_ngrams_plourde(dat, 4),
  z <- find_ngrams_plourde(dat, 10),
  columns=c('test', 'elapsed', 'relative'),
  order='relative'
)
                               test elapsed relative
1          a <- find_ngrams(dat, 2)   0.040    1.000
2          b <- find_ngrams(dat, 3)   0.081    2.025
3          c <- find_ngrams(dat, 4)   0.117    2.925
5  w <- find_ngrams_plourde(dat, 2)   0.144    3.600
6  x <- find_ngrams_plourde(dat, 3)   0.212    5.300
7  y <- find_ngrams_plourde(dat, 4)   0.277    6.925
4         d <- find_ngrams(dat, 10)   0.361    9.025
8 z <- find_ngrams_plourde(dat, 10)   0.669   16.725

However, it also finds a lot of n-grams my function misses (whoops):

for (i in 1:length(dat)){
  print(setdiff(w[[i]], a[[i]]))
  print(setdiff(x[[i]], b[[i]]))
  print(setdiff(y[[i]], c[[i]]))
  print(setdiff(z[[i]], d[[i]]))
}

I feel like both functions can be improved, but I can't think of any way to avoid the triple loop (loop over the vectors, loop over the number of n-grams needed, 1 to n, and loop over the words to construct each n-gram).

/edit 2: Here's a revised function, based on Matt's answer:

find_ngrams_2 <- function(x, n){
  if (n == 1) return(x)
  lapply(x, function(y) {
    c(y, unlist(lapply(2:n, function(n_i) {
      do.call(paste, unname(rev(data.frame(embed(y, n_i), stringsAsFactors=FALSE))))
    })))
  })
}
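
As a quick sanity check (this comparison is my own addition; the expectation of identical output is an assumption, based on both versions reversing the rows of embed in the same way), the result can be compared against find_ngrams_plourde from the first edit:

# each comparison should return TRUE if the two functions agree
sapply(c(2, 3, 4, 10), function(n) {
  identical(find_ngrams_2(dat, n), find_ngrams_plourde(dat, n))
})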

It seems to return the correct list of n-grams, and it is faster (in most cases) than my original function:

library(rbenchmark)
benchmark(
  replications=100,
  a <- find_ngrams(dat, 2),
  b <- find_ngrams(dat, 3),
  c <- find_ngrams(dat, 4),
  d <- find_ngrams(dat, 10),
  w <- find_ngrams_2(dat, 2),
  x <- find_ngrams_2(dat, 3),
  y <- find_ngrams_2(dat, 4),
  z <- find_ngrams_2(dat, 10),
  columns=c('test', 'elapsed', 'relative'),
  order='relative'
)

                         test elapsed relative
5  w <- find_ngrams_2(dat, 2)   0.039    1.000
1    a <- find_ngrams(dat, 2)   0.041    1.051
6  x <- find_ngrams_2(dat, 3)   0.078    2.000
2    b <- find_ngrams(dat, 3)   0.081    2.077
7  y <- find_ngrams_2(dat, 4)   0.119    3.051
3    c <- find_ngrams(dat, 4)   0.123    3.154
4   d <- find_ngrams(dat, 10)   0.399   10.231
8 z <- find_ngrams_2(dat, 10)   0.436   11.179

Solution

Here's one way with embed.

find_ngrams <- function(x, n) {
    if (n == 1) return(x)
    c(x, apply(embed(x, n), 1, function(row) paste(rev(row), collapse=' ')))
}
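
For reference (this illustration is mine, not part of the original answer), embed is what builds the sliding windows: each row of the returned matrix holds n consecutive elements with the most recent first, which is why each row is reversed before pasting:

embed(c("crude", "oil", "production"), 2)
     [,1]         [,2]   
[1,] "oil"        "crude"
[2,] "production" "oil"  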

There seems to be a bug in your function. If you fix that, we can do a benchmark.

Licensed under: CC-BY-SA with attribution