Summarising dataframe based on string values

Question 1

I'd first reshape the data to long format (one row per name), like this:

#' Reshape a data frame with comma-separated names to long format.
#'
#' Each row of `df1` is repeated once for every name found in its `names`
#' entry, so the result has one row per (original row, name) pair.
#'
#' @param df1 A data frame with a `names` column (character or factor) whose
#'   entries are comma-separated names.
#' @return A data frame whose first column `name` holds a single split name,
#'   followed by all original columns of `df1` (including `names`).
reshapelllocatie <- function(df1) {
  # merge() below joins a positional index against the row names of `df1`.
  # Reset them on this local copy so the join is also correct when `df1`
  # carries non-default row names (e.g. after subsetting).
  rownames(df1) <- NULL

  parts <- strsplit(as.character(df1$names), ",")
  len <- vapply(parts, length, integer(1))
  long <- cbind.data.frame(
    name = unlist(parts),
    row = rep(seq_len(nrow(df1)), times = len)
  )

  # merge() puts the key column first; [-1] drops that helper `row` column.
  merge(x = long, y = df1, by.x = "row", by.y = "row.names", all.x = TRUE)[-1]
}

library(plyr)

# Summarise the long-format data per name: number of rows, mean position
# (rounded to 3 decimals) and the number of rows in each magnitude category.
# sum(cond, na.rm = TRUE) counts only rows where the condition is TRUE, so
# rows with a missing mag.cat are not counted in any category.
df2 <- ddply(reshapelllocatie(df1), .(name), summarise,
             n = as.numeric(length(name)),
             lat = round(mean(lat), 3),
             lon = round(mean(lon), 3),
             n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
             n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
             n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))
)
df2

#                name n    lat   lon n.1 n.2 n.3
# 1              Amen 6 52.959 6.565   0   0   6
# 2             Annen 1 53.061 6.698   0   0   1
# 3             Assen 5 52.965 6.568   0   0   5
# ...

Question 2

With the strsplit function and the data.table package, you can achieve the same without writing a separate function:

library(data.table)

# Step 1: split every `names` entry into one row per name. Grouping by all
# other columns keeps them next to each split name; because j returns an
# unnamed vector, the new column is called V1. unlist() makes sure that a
# group containing several rows still yields extra rows, not extra columns.
# Step 2: summarise per name and sort by name.
# Note: setDT() converts df1 to a data.table by reference.
dt2 <- setDT(df1)[, unlist(strsplit(as.character(names), ",", fixed = TRUE)),
                  by = setdiff(names(df1), "names")
                  ][, .(.N,
                        lat = round(mean(lat), 3),
                        lon = round(mean(lon), 3),
                        # The grouping column V1 has length 1 inside j, so
                        # count on mag.cat directly instead of indexing V1.
                        n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
                        n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
                        n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))),
                    by = V1][order(V1)]

the result:

> dt2
                  V1 N    lat   lon n.1 n.2 n.3
 1:             Amen 6 52.959 6.565   0   0   6
 2:            Annen 1 53.061 6.698   0   0   1
 3:            Assen 5 52.965 6.568   0   0   5
.....

The disadvantage of this approach is that the names column is renamed to V1. With the tstrsplit function from the data.table package, you get the same result and the column keeps its original name, names:

library(data.table) #v1.9.5
# Step 1: split `names` (the only column left in .SD) into one row per name;
# going through lapply(.SD, ...) keeps the column name `names`.
# NOTE(review): this assumes the remaining columns identify each row
# uniquely; with several rows per group tstrsplit() pads shorter splits
# with NA -- confirm against the data.
# Step 2: summarise per name and sort by name.
dt2 <- setDT(df1)[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed = TRUE))),
                  by = setdiff(names(df1), "names")
                  ][, .(.N,
                        lat = round(mean(lat), 3),
                        lon = round(mean(lon), 3),
                        # The grouping column `names` has length 1 inside j,
                        # so count on mag.cat directly.
                        n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
                        n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
                        n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))),
                    by = names][order(names)]

this gives:

> dt2
               names N    lat   lon n.1 n.2 n.3
 1:             Amen 6 52.959 6.565   0   0   6
 2:            Annen 1 53.061 6.698   0   0   1
 3:            Assen 5 52.965 6.568   0   0   5
 .....

Another alternative would be the cSplit function from the splitstackshape package:

library(splitstackshape)
# cSplit() with direction = "long" expands the comma-separated `names`
# column into one row per name; the result is then summarised per name and
# sorted by name.
dt3 <- cSplit(df1, splitCols = "names", sep = ",", direction = "long",
              type.convert = TRUE)[, .(.N,
                                     lat = round(mean(lat), 3),
                                     lon = round(mean(lon), 3),
                                     # The grouping column `names` has length 1
                                     # inside j, so count on mag.cat directly.
                                     n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
                                     n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
                                     n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))),
                                 by = names][order(names)]

A third alternative is to combine dplyr and tidyr:

library(dplyr)
library(tidyr)

# Split `names` into a list column, unnest it to one row per name, then
# summarise per name and sort by name.
df2 <- df1 %>%
  mutate(names = strsplit(as.character(names), ",")) %>%
  unnest(names) %>%
  group_by(names) %>%
  summarise(
    # n() counts the rows of the current group regardless of the backend;
    # length(names) can return 1 when the grouping column is a scalar.
    n = as.numeric(n()),
    lat = round(mean(lat), 3),
    lon = round(mean(lon), 3),
    # Count on mag.cat directly; rows with a missing mag.cat are not counted.
    n.1 = as.numeric(sum(mag.cat == 1, na.rm = TRUE)),
    n.2 = as.numeric(sum(mag.cat == 2, na.rm = TRUE)),
    n.3 = as.numeric(sum(mag.cat == 3, na.rm = TRUE))
  ) %>%
  arrange(names)

this gives:

> df2
Source: local data table [69 x 7]

              names n    lat   lon n.1 n.2 n.3
1              Amen 6 52.959 6.565   0   0   6
2             Annen 1 53.061 6.698   0   0   1
3             Assen 5 52.965 6.568   0   0   5
.....