Pergunta

I want to summarise a dataframe based on the unique values of a string variable.

# Example data (dput output): a data.frame with 50 rows and five columns.
#   lllocatie - factor holding a single place name per row
#   lat, lon  - numeric values (presumably latitude / longitude of the record)
#   mag.cat   - integer category code; only the values 1, 2 and 3 occur
#   names     - factor whose levels are comma-separated lists of place names;
#               the first level is the empty string ""
df1 <- structure(list(lllocatie = structure(c(3L, 13L, 5L, 10L, 4L, 32L, 10L, 10L, 22L, 4L, 36L, 37L, 31L, 15L, 23L, 20L, 34L, 8L, 35L, 24L, 19L, 19L, 2L, 29L, 26L, 25L, 25L, 30L, 8L, 22L, 9L, 20L, 19L, 12L, 16L, 38L, 6L, 27L, 7L, 11L, 17L, 33L, 14L, 2L, 21L, 18L, 9L, 28L, 32L, 1L), .Label = c("Annen", "Appingedam", "Assen", "Eleveld", "Emmen", "Farmsum", "Froombosch", "Garrelsweer", "Garsthuizen", "Geelbroek", "Hellum", "Hoogezand", "Hooghalen", "Huizinge", "Langelo", "Leermens", "Meedhuizen", "Onderdendam", "Oosterwijtwerd", "Overschild", "Roodeschool", "Roswinkel", "Sappemeer", "Sint Annen", "Slochteren", "Startenhuizen", "Steendam", "Stitswerd", "t-Zandt", "Ten Post", "Tjuchem", "Toornwerd", "Tripscompagnie", "Westerbroek", "Westerwijtwerd", "Winneweer", "Woudbloem", "Zandeweer"), class = "factor"), lat = c(52.992, 52.928, 52.771, 52.952, 52.965, 53.358, 52.953, 52.956, 52.831, 52.961, 53.32, 53.21, 53.294, 53.084, 53.16, 53.285, 53.177, 53.305, 53.316, 53.315, 53.333, 53.336, 53.332, 53.363, 53.368, 53.208, 53.202, 53.294, 53.306, 52.833, 53.37, 53.279, 53.323, 53.17, 53.345, 53.39, 
53.316, 53.275, 53.194, 53.226, 53.294, 53.156, 53.359, 53.335, 53.423, 53.324, 53.372, 53.365, 53.351, 53.061), lon = c(6.548, 6.552, 6.914, 6.575, 6.573, 6.657, 6.572, 6.562, 7.032, 6.57, 6.74, 6.747, 6.868, 6.465, 6.805, 6.795, 6.685, 6.793, 6.65, 6.66, 6.837, 6.808, 6.848, 6.765, 6.675, 6.812, 6.82, 6.753, 6.777, 7.045, 6.72, 6.807, 6.805, 6.747, 6.808, 6.68, 6.962, 6.828, 6.798, 6.835, 6.95, 6.823, 6.682, 6.852, 6.77, 6.613, 6.743, 6.577, 6.628, 6.698), mag.cat = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 2L, 3L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 3L, 2L, 2L, 3L, 3L), names = structure(c(3L, 4L, 9L, 2L, 2L, 11L, 2L, 8L, 10L, 2L, 21L, 29L, 1L, 19L, 1L, 24L, 28L, 1L, 1L, 1L, 23L, 23L, 1L, 27L, 12L, 1L, 15L, 17L, 16L, 10L, 1L, 24L, 1L, 1L, 20L, 14L, 1L, 25L, 1L, 1L, 1L, 1L, 18L, 1L, 22L, 7L, 13L, 26L, 6L, 5L), .Label = c("", "Amen,Assen,Deurze,Ekehaar,Eleveld,Geelbroek,Hooghalen,Marwijksoord,Vredenheim", 
"Amen,Assen,Deurze,Ekehaar,Eleveld,Geelbroek,Taarlo,Ubbena", 
"Amen,Ekehaar,Eleveld,Geelbroek,Hooghalen", "Annen,Gasteren,Nieuw Annerveen,Oud Annerveen,Schipborg,Zeegse,Zuidlaren","Bedum,Eppenhuizen,Garsthuizen,Huizinge,Kantens,Middelstum,Onderdendam,Rottum,Sint Annen,Startenhuizen,Stedum,Stitswerd,Tinallinge,Toornwerd,Uithuizen,Usquert,Warffum,Westeremden,Westerwijtwerd,Zandeweer", 
"Bedum,Huizinge,Kantens,Lellens,Middelstum,Onderdendam,Rottum,Sauwerd,Sint Annen,Stedum,Stitswerd,Thesinge,Tinallinge,Toornwerd,Westeremden,Westerwijtwerd,Wetsinge,Winsum", 
"Eleveld,Geelbroek", "Emmen", "Emmer-Compascuum,Roswinkel", "Eppenhuizen,Garsthuizen,Huizinge,Kantens,Middelstum,Oldenzijl,Onderdendam,Rottum,Startenhuizen,Stedum,Stitswerd,Toornwerd,Uithuizen,Westeremden,Westerwijtwerd,Zandeweer", 
"Eppenhuizen,Garsthuizen,Huizinge,Kantens,Middelstum,Oldenzijl,Rottum,Startenhuizen,Toornwerd,Westeremden,Zandeweer", 
"Eppenhuizen,Garsthuizen,Oldenzijl,Startenhuizen,t-Zandt,Westeremden,Zeerijp,Zijldijk", 
"Eppenhuizen,Oldenzijl,Startenhuizen,Uithuizen,Zandeweer", "Froombosch,Hellum,Noordbroek,Sappemeer,Schildwolde,Slochteren", "Garrelsweer", "Garrelsweer,Overschild,Ten Post,Winneweer", "Huizinge,Startenhuizen", "Langelo", "Leermens,Oosterwijtwerd", "Loppersum,Winneweer", "Oosteinde,Roodeschool", "Oosterwijtwerd", "Overschild", "Steendam", "Stitswerd", "t-Zandt,Zeerijp", "Westerbroek", "Woudbloem"), class = "factor")), .Names = c("lllocatie", "lat", "lon", "mag.cat", "names"), class = "data.frame", row.names = c(NA, -50L))

I know how to do this with the lllocatie variable:

# Summarise df1 per value of lllocatie (requires plyr for ddply/summarise/.()):
#   n           - number of records in the group
#   lat, lon    - group means, rounded to 3 decimals
#   n.1/n.2/n.3 - number of records whose mag.cat equals 1, 2 or 3
df2 <- ddply(df1, .(lllocatie), summarise,
             n = as.numeric(length(lllocatie)),
             lat = round(mean(lat), 3),
             lon = round(mean(lon), 3),
             # Count matches with sum(); na.rm = TRUE keeps a record with a
             # missing mag.cat out of every category. The subsetting form
             # length(x[mag.cat == k]) would count such a record in all three,
             # because an NA logical index yields an NA element.
             n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
             n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
             n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))
)

But I want to summarise it with something like:

df2 <- ddply(df1, .(unique(unlist(strsplit(as.character(df1$names), ",")))), summarise,
             n = #code giving frequency of each unique name,
             lat = #code giving mean for each unique name,
             lon = #code giving mean for each unique name,
             n.1 = #code giving frequency of each unique name for "mag.cat ==1",
             n.2 = #code giving frequency of each unique name for "mag.cat ==2",
             n.3 = #code giving frequency of each unique name for "mag.cat ==3"
)

I can get the frequencies with for example table(unlist(strsplit(as.character(df1$names), ","))) or table(unlist(strsplit(as.character(df1$names), ","))[df1$mag.cat == 1]), but I'm having trouble figuring out how to do it inside the ddply function.

Any ideas how to solve this? Could the new dplyr package be of any help?

Foi útil?

Solução

I'd reshape the data beforehand like

# Expand a data frame so that every comma-separated entry of its `names`
# column gets a row of its own.
#
# Args:
#   df1: a data frame with a `names` column (character or factor) whose
#        values are comma-separated lists.
#
# Returns:
#   A data frame with a leading `name` column (one list entry per row)
#   followed by all columns of `df1`, each source row repeated once per entry.
#   Rows whose `names` value is the empty string contribute no rows. Output
#   rows follow the order of the source rows.
reshapelllocatie <- function(df1) {
  parts <- strsplit(as.character(df1$names), ",", fixed = TRUE)
  # Position of the source row for every split-out entry. seq_len() and
  # lengths() keep this well-defined for a zero-row input, where 1:nrow()
  # and sapply() would not.
  idx <- rep(seq_len(nrow(df1)), times = lengths(parts))
  # Index rows by position rather than merging on row.names: row names are
  # not guaranteed to be 1..n (e.g. after subsetting), in which case a merge
  # of positions against row names matches nothing and yields NA columns.
  expanded <- df1[idx, , drop = FALSE]
  rownames(expanded) <- NULL
  # as.character() turns the NULL that unlist() gives for an empty list into
  # character(0), so the `name` column exists even when there are no rows.
  cbind.data.frame(name = as.character(unlist(parts)), expanded)
}

# Summarise the reshaped data per individual place name (requires plyr):
#   n           - number of records that list the name
#   lat, lon    - means over those records, rounded to 3 decimals
#   n.1/n.2/n.3 - number of those records whose mag.cat equals 1, 2 or 3
df2 <- ddply(reshapelllocatie(df1), .(name), summarise,
             n = as.numeric(length(name)),
             lat = round(mean(lat), 3),
             lon = round(mean(lon), 3),
             # sum(..., na.rm = TRUE) leaves records with a missing mag.cat
             # out of every category; length(name[mag.cat == k]) would count
             # them in all three because an NA index yields an NA element.
             n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
             n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
             n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))
)
df2

#                name n    lat   lon n.1 n.2 n.3
# 1              Amen 6 52.959 6.565   0   0   6
# 2             Annen 1 53.061 6.698   0   0   1
# 3             Assen 5 52.965 6.568   0   0   5
# ...

Outras dicas

With the strsplit function and the data.table package, you can achieve the same without writing a separate function:

# Split `names` into one row per place name, then summarise per name
# (requires data.table). The split values end up in a column called V1.
# as.data.table() works on a copy; setDT() would convert the caller's df1 to
# a data.table by reference and so change it for all later code.
dt2 <- as.data.table(df1)[, strsplit(as.character(names), ",", fixed = TRUE),
                          by = setdiff(names(df1), "names")
                          ][, .(.N,
                                lat = round(mean(lat), 3),
                                lon = round(mean(lon), 3),
                                # NA-safe counts: a missing mag.cat is not
                                # counted in any category
                                n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
                                n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
                                n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))),
                            by = V1][order(V1)]

the result:

> dt2
                  V1 N    lat   lon n.1 n.2 n.3
 1:             Amen 6 52.959 6.565   0   0   6
 2:            Annen 1 53.061 6.698   0   0   1
 3:            Assen 5 52.965 6.568   0   0   5
.....

The disadvantage of this is that the names column is renamed to V1. With the tstrsplit function from the data.table package, you get the same result and the column keeps the name names:

library(data.table) #v1.9.5
# Same split-and-summarise as the strsplit variant, but going through .SD so
# that the split column keeps the name `names` (requires data.table).
# as.data.table() works on a copy; setDT() would convert the caller's df1 to
# a data.table by reference and so change it for all later code.
dt2 <- as.data.table(df1)[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed = TRUE))),
                          by = setdiff(names(df1), "names")
                          ][, .(.N,
                                lat = round(mean(lat), 3),
                                lon = round(mean(lon), 3),
                                # NA-safe counts: a missing mag.cat is not
                                # counted in any category
                                n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
                                n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
                                n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))),
                            by = names][order(names)]

this gives:

> dt2
               names N    lat   lon n.1 n.2 n.3
 1:             Amen 6 52.959 6.565   0   0   6
 2:            Annen 1 53.061 6.698   0   0   1
 3:            Assen 5 52.965 6.568   0   0   5
 .....

Another alternative would be the cSplit function from the splitstackshape package:

library(splitstackshape)
# Split `names` into long format with splitstackshape::cSplit(), then
# summarise per place name using data.table syntax on the result.
# Arguments are passed by name so the call does not rely on positional
# matching after a named argument.
dt3 <- cSplit(df1, splitCols = "names", sep = ",", direction = "long",
              type.convert = TRUE)[, .(.N,
                                       lat = round(mean(lat), 3),
                                       lon = round(mean(lon), 3),
                                       # NA-safe counts: a missing mag.cat is
                                       # not counted in any category
                                       n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
                                       n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
                                       n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))),
                                   by = names][order(names)]

A third alternative is with the combination of dplyr and tidyr:

library(dplyr)
library(tidyr)

# Split `names` into a list column, unnest it to one row per place name, and
# summarise per name (requires dplyr and tidyr).
# The pipeline starts from df1, the example data; a bare `df` would refer to
# base R's F-distribution density function, not to the data.
df2 <- df1 %>%
  mutate(names = strsplit(as.character(names), ",")) %>%
  unnest(names) %>%
  group_by(names) %>%
  summarise(n = as.numeric(length(names)),
            lat = round(mean(lat), 3),
            lon = round(mean(lon), 3),
            # NA-safe counts: a missing mag.cat is not counted in any category
            n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
            n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
            n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))) %>%
  arrange(names)

this gives:

> df2
Source: local data table [69 x 7]

              names n    lat   lon n.1 n.2 n.3
1              Amen 1 52.959 6.565   0   0   6
2             Annen 1 53.061 6.698   0   0   1
3             Assen 1 52.965 6.568   0   0   5
.....
Licenciado em: CC-BY-SA com atribuição
Não afiliado a StackOverflow
scroll top