Aggregazione sub totali e totali con data.table

https://stackoverflow.com/questions/9315258

26-10-2019
|

Domanda

Ho un data.table in R:

library(data.table)
set.seed(1)
DT = data.table(
  group=sample(letters[1:2],100,replace=TRUE), 
  year=sample(2010:2012,100,replace=TRUE),
  v=runif(100))

L'aggregazione questi dati in una tabella riepilogativa per gruppo e anno è semplice ed elegante:

table <- DT[,mean(v),by='group, year']

Tuttavia, aggregando questi dati in una tabella di sintesi, compresi i totali parziali e totali, è un po 'più difficile, e molto meno elegante:

library(plyr)
yearTot <- DT[,list(mean(v),year='Total'),by='group']
groupTot <- DT[,list(mean(v),group='Total'),by='year']
Tot <- DT[,list(mean(v), year='Total', group='Total')]
table <- rbind.fill(table,yearTot,groupTot,Tot)
table$group[table$group==1] <- 'Total'
table$year[table$year==1] <- 'Total'

Questa rendimenti:

table[order(table$group, table$year), ]

C'è un modo semplice per specificare subtotali e totali con data.table, come ad esempio il comando margins=TRUE per plyr? Io preferirei usare data.table plyr sopra sul mio set di dati, in quanto è un grande insieme di dati che ho già in formato data.table.

Soluzione

Nel recente data.table devel è possibile utilizzare la nuova funzionalità chiamata "set di raggruppamenti" per sub totali produrre:

library(data.table)
set.seed(1)
DT = data.table(
    group=sample(letters[1:2],100,replace=TRUE), 
    year=sample(2010:2012,100,replace=TRUE),
    v=runif(100))

cube(DT, mean(v), by=c("group","year"))
#    group year        V1
# 1:     a 2011 0.4176346
# 2:     b 2010 0.5231845
# 3:     b 2012 0.4306871
# 4:     b 2011 0.4997119
# 5:     a 2012 0.4227796
# 6:     a 2010 0.2926945
# 7:    NA 2011 0.4463616
# 8:    NA 2010 0.4278093
# 9:    NA 2012 0.4271160
#10:     a   NA 0.3901875
#11:     b   NA 0.4835788
#12:    NA   NA 0.4350153
cube(DT, mean(v), by=c("group","year"), id=TRUE)
#    grouping group year        V1
# 1:        0     a 2011 0.4176346
# 2:        0     b 2010 0.5231845
# 3:        0     b 2012 0.4306871
# 4:        0     b 2011 0.4997119
# 5:        0     a 2012 0.4227796
# 6:        0     a 2010 0.2926945
# 7:        2    NA 2011 0.4463616
# 8:        2    NA 2010 0.4278093
# 9:        2    NA 2012 0.4271160
#10:        1     a   NA 0.3901875
#11:        1     b   NA 0.4835788
#12:        3    NA   NA 0.4350153

Altri suggerimenti

io non sono a conoscenza di un modo semplice. Ecco un primo tentativo di un'implementazione. Non so margins=TRUE in plyr, è questo quello che fa?

crossby = function(DT, j, by) {
    j = substitute(j)
    ans = rbind(
        DT[,eval(j),by],
        DT[,list("Total",eval(j)),by=by[1]],
        cbind("Total",DT[,eval(j),by=by[2]]),
        list("Total","Total",DT[,eval(j)]),
        use.names=FALSE
        # 'use.names' argument added in data.table v1.8.0
    )
    setkeyv(ans,by)
    ans
}

crossby(DT, mean(v), c("group","year"))

      group  year        V1
 [1,]     a  2010 0.2926945
 [2,]     a  2011 0.4176346
 [3,]     a  2012 0.4227796
 [4,]     a Total 0.3901875
 [5,]     b  2010 0.5231845
 [6,]     b  2011 0.4997119
 [7,]     b  2012 0.4306871
 [8,]     b Total 0.4835788
 [9,] Total  2010 0.4278093
[10,] Total  2011 0.4463616
[11,] Total  2012 0.4271160
[12,] Total Total 0.4350153

Vedi sotto per una soluzione - simile a @ MattDowle di sopra -. Che prende qualsiasi numero di gruppi

crossby2 <- function(data, j, by, grand.total = T, total.label = "(all)", value.label = "value") {
  j = substitute(j)

  # Calculate by each group
  lst <- lapply(1:length(by), function(i) {
    x <- data[, list(..VALUE.. = eval(j)), by = eval(by[1:i])]
    if (i != length(by)) x[, (by[-(1:i)]) := total.label]
    return(x)
  })

  # Grand total
  if (grand.total) lst <- c(lst, list(data[, list(..VALUE.. = eval(j))][, (by) := total.label]))

  # Combine all tables
  res <- rbindlist(lst, use.names = T, fill = F)

  # Change value column name
  setnames(res, "..VALUE..", value.label)

  # Set proper column order
  setcolorder(res, c(by, value.label))

  # Sort values
  setkeyv(res, by)

  return(res)
}

Utilizzando le risposte attuali Ho aggiunto il supporto per più misure e le funzioni di aggregazione e può aggiungere l'aggregazione indicatore di livello.

#' @title SQL's ROLLUP function
#' @description Returns data.table of aggregates value for each level of hierarchy provided in `by`.
#' @param x data.table input data.
#' @param j expression to evaluate in `j`, support multiple measures.
#' @param by character a hierarchy level for aggregations.
#' @param level logical, use `TRUE` to add `level` column of sub-aggregation.
#' @seealso [postgres: GROUPING SETS, CUBE, and ROLLUP](http://www.postgresql.org/docs/9.5/static/queries-table-expressions.html#QUERIES-GROUPING-SETS), [SO: Aggregating sub totals and grand totals with data.table](http://stackoverflow.com/a/24828162/2490497)
#' @return data.table
#' @examples 
#' set.seed(1)
#' x = data.table(group=sample(letters[1:2],100,replace=TRUE),
#'                year=sample(2010:2012,100,replace=TRUE),
#'                v=runif(100))
#' rollup(x, .(vmean=mean(v), vsum=sum(v)), by = c("group","year"))
library(data.table)
rollup = function(x, j, by, level=FALSE){
    stopifnot(is.data.table(x), is.character(by), length(by) >= 2L, is.logical(level))
    j = substitute(j)
    aggrs = rbindlist(c(
        lapply(1:(length(by)-1L), function(i) x[, eval(j), c(by[1:i])][, (by[-(1:i)]) := NA]), # subtotals
        list(x[, eval(j), c(by)]), # leafs aggregations
        list(x[, eval(j)][, c(by) := NA]) # grand total
    ), use.names = TRUE, fill = FALSE)
    if(level) aggrs[, c("level") := sum(sapply(.SD, is.na)), 1:nrow(aggrs), .SDcols = by]
    setcolorder(aggrs, neworder = c(by, names(aggrs)[!names(aggrs) %in% by]))
    setorderv(aggrs, cols = by, order=1L, na.last=TRUE)
    return(aggrs[])
}
set.seed(1)
x = data.table(group=sample(letters[1:2],100,replace=TRUE),
               year=sample(2010:2012,100,replace=TRUE),
               month=sample(1:12,100,replace=TRUE),
               v=runif(100))
rollup(x, .(vmean=mean(v), vsum=sum(v)), by = c("group","year","month"), level=TRUE)

Prendendo in prestito da questa risposta ( https://stackoverflow.com/a/39536828/4241780 ), il che segue un sommario tutti i sottoinsiemi-(crossby2 differenza, e rollup che sembrano righe di mancanza 9 a 11 di uscita desiderato del PO). Questa funzione è espandibile a qualsiasi numero di by o aggregati variabili, anche se nel suo stato attuale consente solo un tipo di funzione di aggregazione. Grande per il calcolo substotals fila da interazioni di gruppo (quello che ho usato per).

add_col_sums.data.table <- function(data, aggvars, byvars, FUN = sum, level = "level") {

  # Find all possible subsets of your data
  subsets <- lapply(0:length(byvars), combn, x = byvars, simplify = FALSE)
  subsets <- do.call(c, subsets)

  # Calculate summary value by each subset
  agg_values <- lapply(subsets, function(x) 
    data[,lapply(.SD, FUN), by = x, .SDcols = aggvars])

  # Pull them all into one dataframe
  dat_out <- rbindlist(agg_values, fill = TRUE)

  # Order columns and rows
  setorderv(dat_out, byvars, na.last = TRUE)
  setcolorder(dat_out, c(byvars, aggvars))

  # Add level indication
  dat_out[, c(level) := Reduce("+", lapply(.SD, is.na))]

  # Return data.table
  dat_out[]

}

add_col_sums.data.table(DT, "v", c("group", "year"), FUN = mean)

Autorizzato sotto: CC-BY-SA insieme a attribuzione

Non affiliato a StackOverflow