문제

I have a data frame that currently contains two ‘time’ columns in HH:MM:SS format. I would like to condense this data frame so that I only have one row for each unique ‘id’ value. I would like to keep the row for each unique ‘id’ value which has a ‘time1’ value that is the nearest match to the ‘time2’ value. However, 'time1' needs to be greater than ‘time2’.

Here is a simple example:

> dput(df)
structure(list(id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 
3L, 3L, 4L, 4L, 4L, 4L), count = c(23L, 23L, 23L, 23L, 45L, 45L, 
45L, 45L, 67L, 67L, 67L, 67L, 88L, 88L, 88L, 88L), time1 = structure(c(1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 3L, 3L, 3L, 3L), .Label = c("00:13:00", 
"01:13:00", "07:18:00", "18:14:00"), class = "factor"), time2 = structure(c(4L, 
1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L), .Label = c("00:00:00", 
"06:00:00", "12:00:00", "18:00:00"), class = "factor"), afn = c(3.36, 
0.63, 1.77, 3.89, 3.36, 0.63, 1.77, 3.89, 3.36, 0.63, 1.77, 3.89, 
3.36, 0.63, 1.77, 3.89), dfn = c(201.67, 157.27, 103.55, 191.41, 
201.67, 157.27, 103.55, 191.41, 201.67, 157.27, 103.55, 191.41, 
201.67, 157.27, 103.55, 191.41)), .Names = c("id", "count", "time1", 
"time2", "afn", "dfn"), class = "data.frame", row.names = c(NA, 
-16L))

> df
   id count    time1    time2  afn    dfn
1   1    23 00:13:00 18:00:00 3.36 201.67
2   1    23 00:13:00 00:00:00 0.63 157.27
3   1    23 00:13:00 06:00:00 1.77 103.55
4   1    23 00:13:00 12:00:00 3.89 191.41
5   2    45 01:13:00 18:00:00 3.36 201.67
6   2    45 01:13:00 00:00:00 0.63 157.27
7   2    45 01:13:00 06:00:00 1.77 103.55
8   2    45 01:13:00 12:00:00 3.89 191.41
9   3    67 18:14:00 18:00:00 3.36 201.67
10  3    67 18:14:00 00:00:00 0.63 157.27
11  3    67 18:14:00 06:00:00 1.77 103.55
12  3    67 18:14:00 12:00:00 3.89 191.41
13  4    88 07:18:00 18:00:00 3.36 201.67
14  4    88 07:18:00 00:00:00 0.63 157.27
15  4    88 07:18:00 06:00:00 1.77 103.55
16  4    88 07:18:00 12:00:00 3.89 191.41

I would like to end up with this matrix in the above case:

id  count   time1       time2       afn     dfn
1   23      00:13:00    00:00:00    0.63    157.27
2   45      01:13:00    00:00:00    0.63    157.27
3   67      18:14:00    18:00:00    3.36    201.67
4   88      07:18:00    06:00:00    1.77    103.55

I have used the ddply() function to condense data frames in the past, but not with an incorporated matching rule. I have to apply this to a data frame with lots of columns (many more than the simple example given here), so any suggestions about how to do this would be brilliant. Any help would be greatly appreciated. Many thanks!

도움이 되었습니까?

해결책

Here are a few solutions.

1) ave This uses chron times as well as subset and ave from the base of R:

library(chron)

# Signed gap between time1 and time2 for every row of df, as a plain numeric
# vector (as.vector() drops the chron "times" class).
delta <- as.vector(times(df$time1) - times(df$time2))

# Keep only rows where time1 is strictly later than time2. The gap vector is
# subset with the same index as the rows so the two stay aligned; grouping the
# full-length delta by the shorter df2$id would silently recycle the ids and
# select the wrong rows. which() also drops rows whose gap is NA.
keep <- which(delta > 0)
df2 <- df[keep, ]
delta_pos <- delta[keep]

# Within each id, flag the row(s) whose gap equals the group minimum.
# ave() returns numeric, so the logical flags come back as 0/1.
df2[ave(delta_pos, df2$id, FUN = function(d) d == min(d)) == 1, ]

2) dplyr This uses chron times and the dplyr package:

library(chron)
library(dplyr)

# `%.%` was dplyr's original chaining operator and is no longer available;
# `%>%` (re-exported by dplyr) is its replacement.
df %>%
  # Signed gap between the two clock times as a plain numeric column.
  mutate(delta = as.vector(times(time1) - times(time2))) %>%
  # time1 must be strictly later than time2.
  filter(delta > 0) %>%
  # Within each id keep the row(s) with the smallest remaining gap.
  group_by(id) %>%
  filter(delta == min(delta)) %>%
  # Drop the grouping so the result behaves like an ordinary data frame.
  ungroup() %>%
  select(-delta)

3) sqldf

library(sqldf)

# The inner query keeps rows where time1 is later than time2. The outer query
# takes the smallest gap per id; with a single min() aggregate SQLite fills the
# remaining selected columns from the row that attains that minimum.
nearest <- sqldf("select *, min(strftime('%s', time1) - strftime('%s', time2)) delta
  from (select * from df where strftime('%s', time1) > strftime('%s', time2))
  group by id")

# Keep only the leading columns that came from df, dropping the helper delta.
nearest[seq_len(ncol(df))]

or perhaps this variation where we calculate delta in R and then use sqldf:

library(sqldf)
library(chron)

# Compute the gap in R so the SQL only has to filter and aggregate.
df2 <- transform(df, delta = as.vector(times(time1) - times(time2)))

# `select *` already carries df2's delta column and `min(delta) delta` appends
# a second one, so the query returns ncol(df) + 2 columns. Keeping the first
# ncol(df) columns drops both helpers; removing a single column by negative
# index would leave one delta column behind in the result.
sqldf("select *, min(delta) delta
  from (select * from df2 where delta > 0)
  group by id")[seq_along(df)]

4) data.table

library(data.table)
library(chron)

DT <- data.table(df)

# Add the time1 - time2 gap to DT by reference.
DT[, delta := times(time1) - times(time2)]

# Among rows with a positive gap, keep the row(s) with the smallest gap per id.
nearest <- DT[delta > 0, .SD[delta == min(delta)], by = id]

# Keep only the leading columns that came from df, dropping the helper delta.
nearest[, seq_len(ncol(df)), with = FALSE]

ADDED additional solutions. Corrected library and subset statements. Minor improvements.

다른 팁

Here's an approach with the powerful dplyr package:

library(dplyr)

# Notes on the parsing and differencing below:
# * "%H:%M:%S" is used instead of the locale-dependent "%X".
# * tz = "UTC" keeps daylight-saving transitions on the current date from
#   skewing the gaps (strptime() fills in today's date for time-only input).
# * difftime(..., units = "secs") pins the unit; a bare subtraction picks the
#   unit automatically, so truncating it with as.integer() could collapse
#   distinct gaps (e.g. 1.2 h and 1.9 h) into a tie.
# * `%>%` replaces the `%.%` operator, which is no longer available in dplyr.
df %>%
  mutate(
    timeDiff = as.numeric(difftime(
      strptime(time1, "%H:%M:%S", tz = "UTC"),
      strptime(time2, "%H:%M:%S", tz = "UTC"),
      units = "secs"
    ))
  ) %>%
  # time1 must be strictly later than time2, so equal times are excluded.
  filter(timeDiff > 0) %>%
  # Within each id keep the row(s) with the smallest remaining gap.
  group_by(id) %>%
  filter(timeDiff == min(timeDiff)) %>%
  ungroup() %>%
  select(-timeDiff)

#   id count    time1    time2  afn    dfn
# 1  1    23 00:13:00 00:00:00 0.63 157.27
# 2  2    45 01:13:00 00:00:00 0.63 157.27
# 3  3    67 18:14:00 18:00:00 3.36 201.67
# 4  4    88 07:18:00 06:00:00 1.77 103.55

An approach using ddply and merge. (Assuming that the "nearest match times" are the minimum absolute values of the difftimes)

# ddply(), .() and summarize() all come from plyr, so it must be attached.
library(plyr)

# Parse both clock-time columns (strptime() fills in the current date).
t1 <- strptime(df$time1, "%H:%M:%S")
t2 <- strptime(df$time2, "%H:%M:%S")

# Absolute gap in minutes, stored on df so it can serve as a merge key.
# NOTE(review): abs() treats a time2 later than time1 as an equally valid
# match, so this does not enforce time1 > time2; it relies on the stated
# assumption that "nearest" means the minimum absolute difference.
df$min.diff <- abs(as.numeric(difftime(t1, t2, units = "mins")))

# Smallest gap per id; merging this back onto df by (id, min.diff) recovers
# the full matching rows.
d1 <- ddply(df, .(id), summarize, min.diff = min(min.diff))

> merge(df, d1, by = c("id", "min.diff"))
  id min.diff count    time1    time2  afn    dfn
1  1       13    23 00:13:00 00:00:00 0.63 157.27
2  2       73    45 01:13:00 00:00:00 0.63 157.27
3  3       14    67 18:14:00 18:00:00 3.36 201.67
4  4       78    88 07:18:00 06:00:00 1.77 103.55
라이센스 : CC-BY-SA ~와 함께 속성
제휴하지 않습니다 StackOverflow
scroll top