Question

Any help with this problem I'm having would be greatly appreciated. I am a moderately advanced R programmer, but so far all my solutions have failed me. I start with the logic behind what I am trying to do, followed by my attempt, followed by the test cases. I tried to be as explicit as possible.

I should probably mention that I sort of know what the problem is, but I don't know what the solution is.

# sqldf has some limitations:

cpaste <- function(x) paste(x, collapse = ", ")

dd <- data.frame(a = 1:10)
b <- 5:8

# this is what I want to get
sqldf("select * from dd where a in (5, 6, 7, 8)")

# but I want to get it by typing this
sqldf(sprintf("select * from %s where a in (%s)", dd, b)) # error

# and it doesn't work, because this is what sprintf expects:
sqldf(sprintf("select * from %s where a in (%s)", "dd", paste(b, collapse = ", ")))

# in other words, 
# (1) the name of data frame, not the data frame itself, and
# (2) the vector must be turned into a single string with comma separated values

# I wrote a wrapper function for sqldf
# it uses sprintf to create the sql string that I need to feed to sqldf
# but before doing that it does (1) and (2) as mentioned above
# so I can do this and it would work:
run_sql("select * from %s where a in (%s)", dd, b)

# it works until I try running it inside another funciton
# where I start running into some problems

# here's the function, followed by test cases

run_sql <- function(zcode = NULL, ..., display = TRUE, eval = TRUE) {

  require(sqldf)

  ellipsis <- as.list(match.call(expand.dots = TRUE))
  ellipsis[1] <- NULL
  ellipsis$inline <- NULL
  ellipsis$display <- NULL
  ellipsis$eval <- NULL
  # print(ellipsis)
  # print(lapply(ellipsis, class))

  ffn <- function(x) {
    if (is.name(x)) { # the argument passed is itself a variable
      if (is.data.frame(eval(x))) {
        as.character(x) # returns just the name of the data frame
      } else if (is.atomic(eval(x))) {
        cpaste(eval(x)) # return the atomic vector as comma-sep string
      } else "_____FAIL1_____"
    } else if (is.call(x)) { # the argument passed is a function call, eg 2:4
      if (is.atomic(eval(x))) cpaste(eval(x)) else "_____FAIL2_____"
    } else {
      if (is.atomic(x)) cpaste(x) else "_____FAIL3_____"
    }
  }
  ellipsis <- lapply(ellipsis, ffn)

  zcode <- do.call(sprintf, unname(ellipsis))
  if (display == TRUE) cat(paste0(zcode, "\n\n"))

  if (eval == TRUE) {
    sqldf(zcode)
  } else {
    zcode
  }

}

dd <- data.frame(a = 1:10)
b <- 5:8
run_sql("select * from %s where a > %s", dd, 5)
run_sql("select * from %s where a in (%s)", dd, b)

# it works when the function uses variables in .GlobalEnv
# but this is not the preferred way:
foo <- function() {
  run_sql("select * from %s where a in (%s)", dd, b)
}
foo()

# here's the preferred way
# but things stop working:
foo <- function(x, y) {
  run_sql("select * from %s where a in (%s)", x, y)
}
foo(dd, b) 

# here's one solution to the above, but I am hoping there's a better way
foo <- function(x, y) {
  do.call(run_sql, list("select * from %s where a in (%s)", 
    substitute(x), 
    substitute(y)))
}
foo(dd, b) 

# also, the above solution does not work with local variables
foo <- function() {
  bb <- dd
  do.call(run_sql, list("select * from %s where a in (%s)", 
    bb, 
    substitute(y)))
}
foo()
Was it helpful?

Solution

@G. Grothendieck's solution will probably be easier for many readers of this post. That said, I think you can fix your function by identifying the parent environment of the call to run_sql, then using envir= to specify that environment anytime you call a function that depends on environments -- specifically, eval() and sqldf(). Like so:

cpaste <- function(x) paste(x, collapse = ", ")
run_sql <- function(zcode = NULL, ..., display = TRUE, eval = TRUE, envir=parent.frame()) {

  require(sqldf)

  ellipsis <- as.list(match.call(expand.dots = TRUE))
  ellipsis[1] <- NULL
  ellipsis$inline <- NULL
  ellipsis$display <- NULL
  ellipsis$eval <- NULL
  # print(ellipsis)
  # print(lapply(ellipsis, class))

  ffn <- function(x) {
    if (is.name(x)) { # the argument passed is itself a variable
      if (is.data.frame(eval(x, envir=envir))) {
        as.character(x) # returns just the name of the data frame
      } else if (is.atomic(eval(x, envir=envir))) {
        cpaste(eval(x, envir=envir)) # return the atomic vector as comma-sep string
      } else "_____FAIL1_____"
    } else if (is.call(x)) { # the argument passed is a function call, eg 2:4
      if (is.atomic(eval(x, envir=envir))) cpaste(eval(x, envir=envir)) else "_____FAIL2_____"
    } else {
      if (is.atomic(x)) cpaste(x) else "_____FAIL3_____"
    }
  }
  ellipsis <- lapply(ellipsis, ffn)

  zcode <- do.call(sprintf, unname(ellipsis))
  if (display == TRUE) cat(paste0(zcode, "\n\n"))

  if (eval == TRUE) {
    sqldf(zcode, envir=envir)
  } else {
    zcode
  }

}

This works in your test case using x and y:

foo <- function(x, y) {
  run_sql("select * from %s where a in (%s)", x, y)
}
foo(dd, b) 

And, with some tweaking, in a test case using do.call and local variables:

foo <- function(y) {
  bb <- dd
  do.call(run_sql, list("select * from %s where a in (%s)", 
                        as.name("bb"), 
                        substitute(y),
                        envir=environment()))
}
foo(b)

To understand the problem with your original function, and to understand what environment was visible to eval() each time it was called, I wrote a function called enveval to wrap several sys.xxx functions and the eval() call. Then, back in the run_sql function, I replaced all calls to eval with calls to enveval.

# enveval: Replace an eval() call with enveval() to see a description of the stack of environments experienced by eval()
enveval <- function(x, envir=parent.frame()) {
  cat(paste0("EVALUATING ",as.character(x),":\n"))
  stack <- data.frame(frame_num=1:sys.nframe(), call=strtrim(as.character(sys.calls()),15), 
                      is_eval_envir=NA, vars_in_frame=NA, x_exists=NA, eval_x=NA)
  for(i in 1:nrow(stack)) {
    f <- which(i==stack$frame_num)
    stack[f,"is_eval_envir"] <- identical(envir,sys.frame(f))
    stack[f,"vars_in_frame"] <- paste(ls(envir=sys.frame(f)),collapse=",")
    stack[f,"x_exists"] <- exists(as.character(x), where=sys.frame(f))
    if(stack[f,"is_eval_envir"] & stack[f,"x_exists"]) {
      # if all the variables to evaluate are single-element atomic, you can also run the following line:
      if(is.atomic(eval(x, envir=sys.frame(f)))) {
        stack[f,"eval_x"] <- cpaste(eval(x, envir=sys.frame(f)))
      } else {
        stack[f,"eval_x"] <- "[non-atomic]"
      }
    }
  }
  print(stack)
  eval(x, envir=envir)
}

# The new run_sql where eval is replaced with enveval:
run_sql <- function(zcode = NULL, ..., display = TRUE, eval = TRUE, envir=parent.frame()) {

  require(sqldf)

  ellipsis <- as.list(match.call(expand.dots = TRUE))
  ellipsis[1] <- NULL
  ellipsis$inline <- NULL
  ellipsis$display <- NULL
  ellipsis$eval <- NULL
  # print(ellipsis)
  # print(lapply(ellipsis, class))

  ffn <- function(x) {
    if (is.name(x)) { # the argument passed is itself a variable
      if (is.data.frame(enveval(x, envir=envir))) {
        as.character(x) # returns just the name of the data frame
      } else if (is.atomic(enveval(x, envir=envir))) {
        cpaste(enveval(x, envir=envir)) # return the atomic vector as comma-sep string
      } else "_____FAIL1_____"
    } else if (is.call(x)) { # the argument passed is a function call, eg 2:4
      if (is.atomic(enveval(x, envir=envir))) cpaste(enveval(x, envir=envir)) else "_____FAIL2_____"
    } else {
      if (is.atomic(x)) cpaste(x) else "_____FAIL3_____"
    }
  }
  ellipsis <- lapply(ellipsis, ffn)

  zcode <- do.call(sprintf, unname(ellipsis))
  if (display == TRUE) cat(paste0(zcode, "\n\n"))

  if (eval == TRUE) {
    sqldf(zcode, envir=envir)
  } else {
    zcode
  }

}

Playing around with the test cases shows you what enveval sees (and what eval would have seen) each time it's called. For example, running the first test function:

foo <- function(x, y) {
  run_sql("select * from %s where a in (%s)", x, y)
}
foo(dd, b) 

gave the following print-out showing that the frame for the foo(dd, b) call was the useful environment for every call to eval():

EVALUATING x:
  frame_num            call is_eval_envir                         vars_in_frame x_exists       eval_x
1         1      foo(dd, b)          TRUE                                   x,y     TRUE [non-atomic]
2         2 run_sql("select         FALSE display,ellipsis,envir,eval,ffn,zcode    FALSE         <NA>
3         3 lapply(ellipsis         FALSE                                 FUN,X    FALSE         <NA>
4         4 FUN(X[[2]], ...         FALSE                                     x     TRUE         <NA>
5         5 is.data.frame(e         FALSE                                     x     TRUE         <NA>
6         6 enveval(x, envi         FALSE                     envir,f,i,stack,x     TRUE         <NA>
EVALUATING y:
  frame_num            call is_eval_envir                         vars_in_frame x_exists     eval_x
1         1      foo(dd, b)          TRUE                                   x,y     TRUE 5, 6, 7, 8
2         2 run_sql("select         FALSE display,ellipsis,envir,eval,ffn,zcode    FALSE       <NA>
3         3 lapply(ellipsis         FALSE                                 FUN,X    FALSE       <NA>
4         4 FUN(X[[3]], ...         FALSE                                     x    FALSE       <NA>
5         5 is.data.frame(e         FALSE                                     x    FALSE       <NA>
6         6 enveval(x, envi         FALSE                     envir,f,i,stack,x    FALSE       <NA>
EVALUATING y:
  frame_num            call is_eval_envir                         vars_in_frame x_exists     eval_x
1         1      foo(dd, b)          TRUE                                   x,y     TRUE 5, 6, 7, 8
2         2 run_sql("select         FALSE display,ellipsis,envir,eval,ffn,zcode    FALSE       <NA>
3         3 lapply(ellipsis         FALSE                                 FUN,X    FALSE       <NA>
4         4 FUN(X[[3]], ...         FALSE                                     x    FALSE       <NA>
5         5 enveval(x, envi         FALSE                     envir,f,i,stack,x    FALSE       <NA>
EVALUATING y:
  frame_num            call is_eval_envir                         vars_in_frame x_exists     eval_x
1         1      foo(dd, b)          TRUE                                   x,y     TRUE 5, 6, 7, 8
2         2 run_sql("select         FALSE display,ellipsis,envir,eval,ffn,zcode    FALSE       <NA>
3         3 lapply(ellipsis         FALSE                                 FUN,X    FALSE       <NA>
4         4 FUN(X[[3]], ...         FALSE                                     x    FALSE       <NA>
5         5 cpaste(enveval(         FALSE                                     x    FALSE       <NA>
6         6 paste(x, collap         FALSE                          collapse,sep    FALSE       <NA>
7         7 enveval(x, envi         FALSE                     envir,f,i,stack,x    FALSE       <NA>

select * from x where a in (5, 6, 7, 8)

a
1 5
2 6
3 7
4 8

OTHER TIPS

The functionality you are looking for already exists in the gsubfn package which is automatically pulled in by sqldf. See Example 5 on the sqldf home page for the way to do this and look at the vignette from the gsubfn package for even more information on fn. In terms of the present examples:

dd <- data.frame(a = 1:10)
ddname <- "dd"

b <- 5
fn$sqldf("select * from $ddname where a > $b")

b <- 5:8
fn$sqldf("select * from $ddname where a in (`toString(b)`)")
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top