Aggregating sub totals and grand totals with data.table
-
26-10-2019 - |
Question
I've got a data.table
in R:
library(data.table)
set.seed(1)
DT = data.table(
group=sample(letters[1:2],100,replace=TRUE),
year=sample(2010:2012,100,replace=TRUE),
v=runif(100))
Aggregating this data into a summary table by group and year is simple and elegant:
table <- DT[,mean(v),by='group, year']
However, aggregating this data into a summary table, including subtotals and grand totals, is a little more difficult, and a lot less elegant:
library(plyr)
yearTot <- DT[,list(mean(v),year='Total'),by='group']
groupTot <- DT[,list(mean(v),group='Total'),by='year']
Tot <- DT[,list(mean(v), year='Total', group='Total')]
table <- rbind.fill(table,yearTot,groupTot,Tot)
table$group[table$group==1] <- 'Total'
table$year[table$year==1] <- 'Total'
This yields:
table[order(table$group, table$year), ]
Is there a simple way to specify subtotals and grand totals with data.table, such as the margins=TRUE
command for plyr? I would prefer to use data.table over plyr on my dataset, as it is a very large dataset that I already have in the data.table format.
Solution
In recent devel data.table you can use new feature called "grouping sets" to produce sub totals:
library(data.table)
set.seed(1)
DT = data.table(
group=sample(letters[1:2],100,replace=TRUE),
year=sample(2010:2012,100,replace=TRUE),
v=runif(100))
cube(DT, mean(v), by=c("group","year"))
# group year V1
# 1: a 2011 0.4176346
# 2: b 2010 0.5231845
# 3: b 2012 0.4306871
# 4: b 2011 0.4997119
# 5: a 2012 0.4227796
# 6: a 2010 0.2926945
# 7: NA 2011 0.4463616
# 8: NA 2010 0.4278093
# 9: NA 2012 0.4271160
#10: a NA 0.3901875
#11: b NA 0.4835788
#12: NA NA 0.4350153
cube(DT, mean(v), by=c("group","year"), id=TRUE)
# grouping group year V1
# 1: 0 a 2011 0.4176346
# 2: 0 b 2010 0.5231845
# 3: 0 b 2012 0.4306871
# 4: 0 b 2011 0.4997119
# 5: 0 a 2012 0.4227796
# 6: 0 a 2010 0.2926945
# 7: 2 NA 2011 0.4463616
# 8: 2 NA 2010 0.4278093
# 9: 2 NA 2012 0.4271160
#10: 1 a NA 0.3901875
#11: 1 b NA 0.4835788
#12: 3 NA NA 0.4350153
OTHER TIPS
I'm not aware of a simple way. Here's a first stab at an implementation. I don't know margins=TRUE
in plyr, is this what that does?
crossby = function(DT, j, by) {
j = substitute(j)
ans = rbind(
DT[,eval(j),by],
DT[,list("Total",eval(j)),by=by[1]],
cbind("Total",DT[,eval(j),by=by[2]]),
list("Total","Total",DT[,eval(j)]),
use.names=FALSE
# 'use.names' argument added in data.table v1.8.0
)
setkeyv(ans,by)
ans
}
crossby(DT, mean(v), c("group","year"))
group year V1
[1,] a 2010 0.2926945
[2,] a 2011 0.4176346
[3,] a 2012 0.4227796
[4,] a Total 0.3901875
[5,] b 2010 0.5231845
[6,] b 2011 0.4997119
[7,] b 2012 0.4306871
[8,] b Total 0.4835788
[9,] Total 2010 0.4278093
[10,] Total 2011 0.4463616
[11,] Total 2012 0.4271160
[12,] Total Total 0.4350153
See below for a solution - similar to @MattDowle's above - that takes any number of groups.
crossby2 <- function(data, j, by, grand.total = T, total.label = "(all)", value.label = "value") {
j = substitute(j)
# Calculate by each group
lst <- lapply(1:length(by), function(i) {
x <- data[, list(..VALUE.. = eval(j)), by = eval(by[1:i])]
if (i != length(by)) x[, (by[-(1:i)]) := total.label]
return(x)
})
# Grand total
if (grand.total) lst <- c(lst, list(data[, list(..VALUE.. = eval(j))][, (by) := total.label]))
# Combine all tables
res <- rbindlist(lst, use.names = T, fill = F)
# Change value column name
setnames(res, "..VALUE..", value.label)
# Set proper column order
setcolorder(res, c(by, value.label))
# Sort values
setkeyv(res, by)
return(res)
}
Using current answers I've added support for multiple measures and aggregate functions and can add aggregation level indicator.
#' @title SQL's ROLLUP function
#' @description Returns data.table of aggregates value for each level of hierarchy provided in `by`.
#' @param x data.table input data.
#' @param j expression to evaluate in `j`, support multiple measures.
#' @param by character a hierarchy level for aggregations.
#' @param level logical, use `TRUE` to add `level` column of sub-aggregation.
#' @seealso [postgres: GROUPING SETS, CUBE, and ROLLUP](http://www.postgresql.org/docs/9.5/static/queries-table-expressions.html#QUERIES-GROUPING-SETS), [SO: Aggregating sub totals and grand totals with data.table](http://stackoverflow.com/a/24828162/2490497)
#' @return data.table
#' @examples
#' set.seed(1)
#' x = data.table(group=sample(letters[1:2],100,replace=TRUE),
#' year=sample(2010:2012,100,replace=TRUE),
#' v=runif(100))
#' rollup(x, .(vmean=mean(v), vsum=sum(v)), by = c("group","year"))
library(data.table)
rollup = function(x, j, by, level=FALSE){
stopifnot(is.data.table(x), is.character(by), length(by) >= 2L, is.logical(level))
j = substitute(j)
aggrs = rbindlist(c(
lapply(1:(length(by)-1L), function(i) x[, eval(j), c(by[1:i])][, (by[-(1:i)]) := NA]), # subtotals
list(x[, eval(j), c(by)]), # leafs aggregations
list(x[, eval(j)][, c(by) := NA]) # grand total
), use.names = TRUE, fill = FALSE)
if(level) aggrs[, c("level") := sum(sapply(.SD, is.na)), 1:nrow(aggrs), .SDcols = by]
setcolorder(aggrs, neworder = c(by, names(aggrs)[!names(aggrs) %in% by]))
setorderv(aggrs, cols = by, order=1L, na.last=TRUE)
return(aggrs[])
}
set.seed(1)
x = data.table(group=sample(letters[1:2],100,replace=TRUE),
year=sample(2010:2012,100,replace=TRUE),
month=sample(1:12,100,replace=TRUE),
v=runif(100))
rollup(x, .(vmean=mean(v), vsum=sum(v)), by = c("group","year","month"), level=TRUE)
Borrowing from this answer (https://stackoverflow.com/a/39536828/4241780), the below provides an all-subsets summary (unlike crossby2
, and rollup
which appear to miss rows 9 to 11 of the OP's desired output). This function is expandable to any number of by or aggregate variables, although in its current state only allows one type of aggregation function. Great for calculating row substotals by group interactions (what I used it for).
add_col_sums.data.table <- function(data, aggvars, byvars, FUN = sum, level = "level") {
# Find all possible subsets of your data
subsets <- lapply(0:length(byvars), combn, x = byvars, simplify = FALSE)
subsets <- do.call(c, subsets)
# Calculate summary value by each subset
agg_values <- lapply(subsets, function(x)
data[,lapply(.SD, FUN), by = x, .SDcols = aggvars])
# Pull them all into one dataframe
dat_out <- rbindlist(agg_values, fill = TRUE)
# Order columns and rows
setorderv(dat_out, byvars, na.last = TRUE)
setcolorder(dat_out, c(byvars, aggvars))
# Add level indication
dat_out[, c(level) := Reduce("+", lapply(.SD, is.na))]
# Return data.table
dat_out[]
}
add_col_sums.data.table(DT, "v", c("group", "year"), FUN = mean)