R- creating a counter-party frequency matrix

Question 1

Alright, I think I've got this figured out. The short answer is:

Reduce("+",by(myDat, myDat$TradeID, function(x) pmin(table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]) + table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]),1)))

Which gives the following matrix, matching the desired result:

  1 2 3 4 5 6 7 8
1 1 1 1 1 1 1 1 2
2 1 0 1 1 0 0 0 0
3 1 1 0 0 0 0 0 0
4 1 1 0 0 0 0 0 0
5 1 0 0 0 0 0 1 0
6 1 0 0 0 0 0 0 0
7 1 0 0 0 1 0 0 0
8 2 0 0 0 0 0 0 0

Here's the long answer. You can get a list of matrices for each TradeID using the by and outer (%o%) and table functions. But this double-counts Trade 7, where item 1 is traded for item 1, so I use the pmax function to fix this. Then I sum across the list by using the Reduce function.

And here's the steps to get there. Note the addition of TradeID # 9, which was left out of the question's code.

# Data
myDat <- data.frame(
  TradeID = as.factor(c(1,1,1,2,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8,9,9)),
  Origin = as.factor(c(1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0)),
  ItemID = as.factor(c(1,2,3,4,5,1,1,6,7,1,1,8,7,5,1,1,2,3,4,1,8))
)

# Sum in 1 direction
by(myDat, myDat$TradeID, function(x) table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]))

# Sum in both directions
by(myDat, myDat$TradeID, function(x) table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]) + table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]))

# Remove double-count in trade 7
by(myDat, myDat$TradeID, function(x) pmin(table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]) + table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]),1))

# Sum across lists
Reduce("+",by(myDat, myDat$TradeID, function(x) pmin(table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]) + table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]),1)))

One way to speed this up would be to sum in only 1 direction (taking advantage of symmetry) and then clean up the results.

result = Reduce("+",by(myDat, myDat$TradeID, function(x) table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1])))
result2 = result + t(result)
diag(result2) = diag(result)
result2
    1 2 3 4 5 6 7 8
  1 1 1 1 1 1 1 1 2
  2 1 0 1 1 0 0 0 0
  3 1 1 0 0 0 0 0 0
  4 1 1 0 0 0 0 0 0
  5 1 0 0 0 0 0 1 0
  6 1 0 0 0 0 0 0 0
  7 1 0 0 0 1 0 0 0
  8 2 0 0 0 0 0 0 0

This appears to run nearly twice as fast.

> microbenchmark(Reduce("+",by(myDat, myDat$TradeID, function(x) pmin(table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]) + table(x$ItemID[x$Origin==1]) %o% table(x$ItemID[x$Origin==0]),1))))
Unit: milliseconds
      min       lq   median       uq     max neval
 7.489092 7.733382 7.955861 8.536359 9.83216   100

> microbenchmark(Reduce("+",by(myDat, myDat$TradeID, function(x) table(x$ItemID[x$Origin==0]) %o% table(x$ItemID[x$Origin==1]))))
Unit: milliseconds

      min      lq   median       uq      max neval
 4.023964 4.18819 4.277767 4.452824 5.801171   100

Question 2

This will give you the number of observations per TradeID and ItemID

myDat <- data.frame(
  TradeID = as.factor(c(1,1,1,2,2,2,3,3,4,4,5,5,6,6,7,7,8,8,8)),
  Origin = as.factor(c(1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0)),
  ItemID = as.factor(c(1,2,3,4,5,1,1,6,7,1,1,8,7,5,1,1,2,3,4))
)
result = tapply(myDat$Origin, list(myDat$ItemID,myDat$TradeID), length)
result[is.na(result)] = 0
result["1","7"]

result will then be:

> result
  1 2 3 4 5 6 7 8
1 1 1 1 1 1 0 2 0
2 1 0 0 0 0 0 0 1
3 1 0 0 0 0 0 0 1
4 0 1 0 0 0 0 0 1
5 0 1 0 0 0 1 0 0
6 0 0 1 0 0 0 0 0
7 0 0 0 1 0 1 0 0
8 0 0 0 0 1 0 0 0

This will give you the proportion of 1 Origin per TradeID and ItemID

result = tapply(myDat$Origin, list(myDat$ItemID,myDat$TradeID), function(x) { sum(as.numeric(as.character(x)))/length(x) })

You can set the NA values in the last matrix to 0 using result[is.na(result)] = 0 but that would confuse no observations with nothing but 0 Origin trades.

Question 3

This will give you the number of observations per consecutive ItemIDs:

idxList <-  with(myDat, tapply(ItemID, TradeID, FUN = function(items) 
  lapply(seq(length(items) - 1), 
         function(i) sort(c(items[i], items[i + 1])))))

# indices of observations  
idx <- do.call(rbind, unlist(idxList, recursive = FALSE))

# create a matrix
ids <- unique(myDat$ItemID)
mat <- matrix(0, length(ids), length(ids))

# place values in matrix
for (i in seq(nrow(idx))) {
  mat[idx[i, , drop = FALSE]] <- mat[idx[i, , drop = FALSE]] + 1      
}

# create symmatric marix 
mat[lower.tri(mat)] <- t(mat)[lower.tri(mat)]


     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,]    1    1    0    0    1    1    1    1
[2,]    1    0    2    0    0    0    0    0
[3,]    0    2    0    1    0    0    0    0
[4,]    0    0    1    0    1    0    0    0
[5,]    1    0    0    1    0    0    1    0
[6,]    1    0    0    0    0    0    0    0
[7,]    1    0    0    0    1    0    0    0
[8,]    1    0    0    0    0    0    0    0