Question

How to efficiently merge two data.tables with full outer join, while handling missing values with rolling last observation forward (LOCF) on the both left and right sides ?

Real world application - there are two not necessarily interleaving trading rule signal tables, X, Y, holding (sparse) signal values over time. The overall goal is to define composite signal, where Signal.z = Signal.x AND Signal.y

X <- data.table(Instrument=rep("SPX",3)
                , Date=as.IDate(c("2013-11-20","2013-11-22","2013-11-24"))
                , Signal=c(TRUE,FALSE,TRUE), key=c("Instrument", "Date"))

Y <- data.table(Instrument=rep("SPX",3)
                , Date=as.IDate(c("2013-11-21","2013-11-23","2013-11-25"))
                , Signal=c(FALSE,TRUE,FALSE), key=c("Instrument", "Date"))

Desired outcome:

   Instrument       Date Signal.x Signal.y Signal.z
1:        SPX 2013-11-20     TRUE       NA       NA
2:        SPX 2013-11-21     TRUE    FALSE    FALSE
3:        SPX 2013-11-22    FALSE    FALSE    FALSE
4:        SPX 2013-11-23    FALSE     TRUE    FALSE
5:        SPX 2013-11-24     TRUE     TRUE     TRUE
6:        SPX 2013-11-25     TRUE    FALSE    FALSE
Was it helpful?

Solution

Something like this perhaps:

dates = sort(c(X$Date, Y$Date))

setkey(X, Date)
setkey(Y, Date)

Z = X[J(dates), roll = T][,
      Signal.y := Y[J(dates), roll = T]$Signal][,
      Signal.z := as.logical(Signal * Signal.y)]

Building on this idea, here's a way of doing it for your large example data:

# assuming keys are set to Instrument, Date in both data.tables

Z = unique(setkey(rbind(setnames(X[Y, roll = T],
                                 c("Instrument", "Date", "Signal.x", "Signal.y")),
                        setnames(Y[X, roll = T],
                                 c("Instrument", "Date", "Signal.y", "Signal.x")),
                        use.names = TRUE),
                  Instrument, Date))[,
           Signal.z := as.logical(Signal.x * Signal.y)]

OTHER TIPS

Linked here is an excellent answer from mnel explaining how to do a full outer join in the data.table package.

The application here is straightforward, adding the wrinkle of rolling the last observation forward (via roll = TRUE in a join).

Create a data.table holding all (unique) keys in either X or Y.

## one way to do the outer join
keys <- unique(rbind(X[,key(X),with = FALSE], Y[,key(Y), with = FALSE]))
## alternate way if you have multiple data.tables to outer join
keys <- lapply(list(X,Y), function(z) z[,key(z), with = FALSE])
keys <- rbindlist(keys)

## this setkey is mostly cosmetic - 
## determines whether the final output is sorted or not
setkeyv(keys, names(keys))

##cosmetic changing of column names to minimize confusion
setnames(X,"Signal","Signal.X")
setnames(Y,"Signal","Signal.Y")

## two joins, followed by the definition of the new column
X[Y[keys, roll = TRUE], roll = TRUE][,
    Signal.Z := as.logical(Signal.X * Signal.Y)]
## this output is returned invisibly. either assign it or force print
.Last.value
#    Instrument       Date Signal.X Signal.Y Signal.Z
# 1:        SPX 2013-11-20     TRUE       NA       NA
# 2:        SPX 2013-11-21     TRUE    FALSE    FALSE
# 3:        SPX 2013-11-22    FALSE    FALSE    FALSE
# 4:        SPX 2013-11-23    FALSE     TRUE    FALSE
# 5:        SPX 2013-11-24     TRUE     TRUE     TRUE
# 6:        SPX 2013-11-25     TRUE    FALSE    FALSE

The idiom as.logical(. * .) to replicate & where NA propagates is inspired by Eddi's answer.

I am going to measure times of the three available solutions (Daniel.Krizian, Blue.Magister, eddi).

For this purpose I created bigger, benchmark data - large signal tables X and Y

Benchmark data: X and Y tables

nobs <- 5000 # number of observations for each instrument
nopps <- nobs * 3 # opportunities to trade in the time window studied
ninstr <- 200 # number of instruments

set.seed(2)  # set.seed(1) generates "MPM" instrument twice :)
universe <-  replicate( ninstr , paste( sample( LETTERS , 3 , repl = TRUE ), collapse = "" ) )
window <- as.Date("2013-11-26") - 1:nopps + 1
frame <- CJ(Instrument=universe, Date=rep(1:nobs))

gen.sig.tbl <- function() {
  frame[, Date:= as.IDate(sample(window, size=nobs, replace=F)), by="Instrument"]
  setkey(frame,Instrument,Date)

  rnd.sig.sparse <- function(nobs) {
    frst <- sample(c(FALSE,TRUE), 1)
    rep(c(frst,!frst), nobs/2)
  }

  frame[, Signal:=rnd.sig.sparse(nobs), by="Instrument"]
  return(copy(frame))
}
set.seed(1)
X <- gen.sig.tbl()
set.seed(2)
Y <- gen.sig.tbl()

X
             Instrument       Date Signal
      1:        AAS 1972-11-02  FALSE
      2:        AAS 1972-11-04   TRUE
      3:        AAS 1972-11-07  FALSE
      4:        AAS 1972-11-08   TRUE
      5:        AAS 1972-11-10  FALSE
     ---                             
 999996:        ZVH 2013-11-14  FALSE
 999997:        ZVH 2013-11-15   TRUE
 999998:        ZVH 2013-11-18  FALSE
 999999:        ZVH 2013-11-25   TRUE
1000000:        ZVH 2013-11-26  FALSE

Y
         Instrument       Date Signal
      1:        AAS 1972-11-13   TRUE
      2:        AAS 1972-11-17  FALSE
      3:        AAS 1972-11-20   TRUE
      4:        AAS 1972-11-21  FALSE
      5:        AAS 1972-11-23   TRUE
     ---                             
 999996:        ZVH 2013-11-16   TRUE
 999997:        ZVH 2013-11-19  FALSE
 999998:        ZVH 2013-11-23   TRUE
 999999:        ZVH 2013-11-24  FALSE
1000000:        ZVH 2013-11-25   TRUE

The three solutions:

Daniel.Krizian <- function () {
  Z <- merge(X, Y, all=TRUE)[, c("Signal.x","Signal.y"):=list( na.locf(Signal.x, na.rm = F)
                                                               , na.locf(Signal.y, na.rm = F))
                             , by=Instrument]

  Z[, Signal.z := Signal.x & Signal.y]

  # and the last line because (FALSE & NA) == FALSE, whereas NA result is desired
  Z[, Signal.z := ifelse(is.na(Signal.x) | is.na(Signal.y), NA, Signal.z)]
  return(Z)
}



Blue.Magister <- function() {
  keys <- unique(rbind(X[,key(X),with = FALSE], Y[,key(Y), with = FALSE]))

  ## this setkey is mostly cosmetic - 
  ## determines whether the final output is sorted or not
  setkeyv(keys, names(keys))

  ##cosmetic changing of column names to minimize confusion
  setnames(X,"Signal","Signal.X")
  setnames(Y,"Signal","Signal.Y")

  ## two joins, followed by the definition of the new column
  Z <- X[Y[keys, roll = TRUE], roll = TRUE][,
                                       Signal.Z := as.logical(Signal.X * Signal.Y)]
  Z <- unique(Z)
  return(Z)
}

eddi <- function (){

  # assuming keys are set to Instrument, Date in both data.tables
  Z = unique(setkey(rbind(setnames(X[Y, roll = T],
                                   c("Instrument", "Date", "Signal.x", "Signal.y")),
                          setnames(Y[X, roll = T],
                                   c("Instrument", "Date", "Signal.y", "Signal.x")),
                          use.names = TRUE),
                    Instrument, Date))[,
                                       Signal.z := as.logical(Signal.x * Signal.y)]
  return(Z)
}

Benchmarking:

system.time(Z.DK <- Daniel.Krizian())

user  system elapsed 
2.70    0.07    3.01 

system.time(Z.eddi <- eddi())

user  system elapsed 
1.14    0.03    1.84 

system.time(Z.BM <- Blue.Magister())

user  system elapsed 
3.35    0.14    3.52

setnames(X,"Signal.X", "Signal") # reset original data back after Blue.Magister() call
setnames(Y,"Signal.Y", "Signal") # reset original data back after Blue.Magister() call
setnames(Z.BM
         , c("Signal.X", "Signal.Y", "Signal.Z")
         , c("Signal.x", "Signal.y", "Signal.z"))
identical(Z.DK, Z.BM)

TRUE

identical(Z.DK, Z.eddi)

TRUE

My solution is following; if you know of more efficient approach, let me know!

Z <- merge(X, Y, all=TRUE)[, c("Signal.x","Signal.y"):=list( na.locf(Signal.x, na.rm = F)
                                                           , na.locf(Signal.y, na.rm = F))
                           , by=Instrument]

Z[, Signal.z := Signal.x & Signal.y]

# and the last line because (FALSE & NA) == FALSE, whereas NA result is desired
Z[, Signal.z := ifelse(is.na(Signal.x) | is.na(Signal.y), NA, Signal.z)]
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top