Pregunta

Objetivo: agrupar los datos de la bolsa de valores en intervalos de volumen de 5000 acciones

Formato de datos: fecha, hora, precio, volumen

Mi código es realmente lento con un data frame de 1 millón de filas. ¿Hay una forma más rápida de hacerlo? He incluido mi código y el conjunto de datos que utilicé. ¡Gracias por su ayuda!

Mi código:

# Build OHLC bars from tick data: consecutive trades are grouped into bins
# whose cumulative volume does not exceed `binSize` shares.

# Read the CSV text held in `ZZ` and give the four columns fixed names.
data1 <- read.table(text = ZZ, sep = ",", header = TRUE)
colnames(data1) <- c("Date", "Time", "Price", "Volume")

# Column that will hold the bin index of every row.
data1[, "volBinIdx"] <- NA

# Maximum cumulative volume allowed in a single bin.
binSize <- 5000

# Current bin number and the volume accumulated in that bin so far.
volBin <- 1
sumVol <- 0

# Walk the rows in order and assign each one to a bin. seq_len() keeps the
# loop from running at all when there are no rows (1:0 would yield 1, 0).
# The loop reads and writes single data frame cells on every iteration.
for (i in seq_len(nrow(data1))) {
  sumVol <- sumVol + data1[i, "Volume"]
  if (sumVol <= binSize) {
    data1[i, "volBinIdx"] <- volBin
  } else {
    # Adding this row overflows the current bin: open the next bin and
    # restart the running total from this row's volume.
    volBin <- volBin + 1
    data1[i, "volBinIdx"] <- volBin
    sumVol <- data1[i, "Volume"]
  }
}

# Aggregate by bin. Plain indexing replaces first()/last(), which are not
# base R functions and would need an extra package to be attached.
# For Price the function returns four values (open, high, low, close), so
# the aggregated column of `a1` is a four-column matrix.
a1 <- aggregate(
  data1$Price,
  list(bin = data1$volBinIdx),
  function(x) c(x[1], max(x), min(x), x[length(x)])
)
a2 <- aggregate(data1$Time, list(bin = data1$volBinIdx), function(x) x[1])
a3 <- aggregate(data1$Date, list(bin = data1$volBinIdx), function(x) x[1])

# Assemble one row per bin: date and time of the first trade in the bin,
# followed by the four price statistics.
x3 <- cbind(a3[, 2, drop = FALSE], a2[, 2, drop = FALSE], a1[, 2])
colnames(x3) <- c("Date", "Time", "Open", "High", "Low", "Close")

Mi conjunto de datos:

ZZ<-"
Date,Time,Price,Size
02/07/2014,09:30:01,3,500
02/07/2014,09:30:29,3,42
02/07/2014,09:35:56,3,100
02/07/2014,09:37:17,3,100
02/07/2014,09:37:28,3.2,900
02/07/2014,09:37:35,3.2,4900
02/07/2014,09:37:51,3.2,1000
02/07/2014,09:42:11,3.2,500
02/07/2014,10:00:31,3,2400
02/07/2014,10:00:37,3.2,500
02/07/2014,10:00:44,3.2,3347
02/07/2014,10:07:33,3.2,1000
02/07/2014,10:31:42,3.24,1000
02/07/2014,10:33:44,3.24,200
02/07/2014,10:40:28,3.25,300
02/07/2014,10:49:57,3.25,600
02/07/2014,10:53:16,3.25,100
02/07/2014,10:53:32,3.4,1000
02/07/2014,10:54:13,3.4,500
02/07/2014,11:05:37,3.35,1000
02/07/2014,11:11:29,3.25,600
02/07/2014,11:15:26,3.3,60
02/07/2014,11:19:16,3.3,23
02/07/2014,11:21:14,3.25,100
02/07/2014,11:21:22,3.25,100
02/07/2014,11:21:30,3.2,500
02/07/2014,11:21:35,3.2,500
02/07/2014,11:21:43,3.2,500
02/07/2014,11:29:58,3.1,200
02/07/2014,11:35:42,3.19,360
02/07/2014,11:39:51,3.19,1000
02/07/2014,11:52:39,3.15,200
02/07/2014,11:53:51,3.15,100
02/07/2014,11:55:11,3.2,100
02/07/2014,12:17:32,3.2,1500
02/07/2014,12:35:42,3.24,1200
02/07/2014,12:37:53,3.24,100
02/07/2014,12:38:02,3.24,3500
02/07/2014,12:53:57,3.24,400
02/07/2014,13:10:57,3.239,100
02/07/2014,13:11:35,3.24,800
02/07/2014,13:13:41,3.24,1000
02/07/2014,13:39:40,3.24,450
02/07/2014,13:56:04,3.24,500
02/07/2014,14:09:49,3.24,600
02/07/2014,14:11:25,3.24,1000
02/07/2014,14:25:53,3.24,25
02/07/2014,14:30:58,3.24,30
02/07/2014,14:31:36,3.24,30
02/07/2014,14:32:12,3.24,30
02/07/2014,14:33:00,3.24,100
02/07/2014,14:34:49,3.24,1100
02/07/2014,14:36:02,3.24,2000
02/07/2014,14:37:07,3.22,1500
02/07/2014,14:42:30,3.22,3300
02/07/2014,14:42:46,3.22,100
02/07/2014,14:42:54,3.2,1000
02/07/2014,14:53:13,3.23,240
02/07/2014,14:53:27,3.24,500
02/07/2014,14:53:59,3.24,60
02/07/2014,14:54:46,3.2,1500
02/07/2014,14:57:45,3.2,160
02/07/2014,14:57:46,3.2,125
02/07/2014,14:57:54,3.2,100
02/07/2014,15:05:56,3.19,100
02/07/2014,15:22:21,3.19,300
02/07/2014,15:22:28,3.18,150
02/07/2014,15:23:09,3.19,2000
02/07/2014,15:35:23,3.18,1500
02/07/2014,15:44:36,3.18,600
02/10/2014,09:30:02,3.25,100
02/10/2014,09:30:02,3.25,25
02/10/2014,09:30:24,3.25,150
02/10/2014,09:30:40,3.25,100
02/10/2014,09:31:11,3.25,650
02/10/2014,09:35:32,3.24,200
02/10/2014,09:37:59,3.19,100
02/10/2014,09:38:01,3.2,2000
02/10/2014,09:38:09,3.18,185
02/10/2014,09:38:36,3.18,500
02/10/2014,09:39:13,3.18,1042
02/10/2014,09:39:18,3.18,156
02/10/2014,09:39:18,3.17,20
02/10/2014,09:41:24,3.15,100
02/10/2014,09:42:28,3.15,1000
02/10/2014,09:42:28,3.15,1000
02/10/2014,09:42:41,3.15,500
02/10/2014,09:42:57,3.15,100
02/10/2014,09:43:24,3.12,500
02/10/2014,09:43:29,3.12,100
02/10/2014,09:43:32,3.1,5000
02/10/2014,09:44:02,3.1,500
02/10/2014,09:44:19,3.1,500
02/10/2014,09:44:22,3.09,100
02/10/2014,09:44:22,3.09,96
02/10/2014,09:44:55,3.05,100
02/10/2014,09:45:11,3.05,676
02/10/2014,09:45:23,3,150
02/10/2014,09:45:44,2.95,1000
02/10/2014,09:45:53,2.95,1500
02/10/2014,09:47:17,2.95,100
02/10/2014,09:47:46,2.9,100
02/10/2014,09:48:24,2.9,500
02/10/2014,09:48:50,2.9,100
02/10/2014,09:49:11,2.85,386
02/10/2014,09:49:13,2.85,100
02/10/2014,09:49:14,2.8,200
02/10/2014,09:49:15,2.7,100
02/10/2014,09:49:22,2.7,100
02/10/2014,09:49:32,2.7,100
02/10/2014,09:50:09,2.65,2500
02/10/2014,09:50:44,2.66,2500
02/10/2014,09:50:49,2.6,100
02/10/2014,09:50:53,2.7,240
02/10/2014,09:50:54,2.61,1000
02/10/2014,09:50:58,2.65,414
02/10/2014,09:55:24,2.95,100
02/10/2014,09:57:22,2.95,400
02/10/2014,10:07:21,2.95,400
02/10/2014,10:16:28,2.95,250
02/10/2014,10:21:20,2.85,300
02/10/2014,10:32:40,2.94,100
02/10/2014,10:33:18,2.95,426
02/10/2014,10:33:38,2.95,70
02/10/2014,10:33:39,2.94,1900
02/10/2014,10:43:46,2.95,4500
02/10/2014,10:44:00,2.99,200
02/10/2014,10:44:20,2.99,505
02/10/2014,10:49:30,2.96,500
02/10/2014,10:57:22,2.95,2500
02/10/2014,10:57:25,2.95,500
02/10/2014,10:57:40,2.95,500
02/10/2014,11:38:29,3,500
02/10/2014,11:38:35,3.05,500
02/10/2014,11:38:45,3.1,1000
02/10/2014,11:45:08,3.05,100
02/10/2014,11:49:55,3.01,100
02/10/2014,11:50:14,3,1900
02/10/2014,11:50:18,3,100
02/10/2014,12:07:51,3,1000
02/10/2014,12:33:26,3,400
02/10/2014,13:57:20,3.1,150
02/10/2014,13:57:34,3,42
02/10/2014,14:21:42,3.15,500
02/10/2014,14:23:35,3.15,1000
02/10/2014,14:25:40,3.05,200
02/10/2014,14:26:01,3.15,100
02/10/2014,14:50:50,3.15,100
02/10/2014,14:51:00,3.1,100
02/10/2014,14:51:09,3.1,100
02/10/2014,14:51:24,3.05,500
02/10/2014,14:51:43,3,100
02/10/2014,14:52:04,2.95,100
02/10/2014,14:52:15,2.99,25
02/10/2014,14:52:17,2.95,100
02/10/2014,14:52:33,2.9,500
02/10/2014,14:52:47,2.95,600
02/10/2014,14:52:49,2.85,100
02/10/2014,14:52:51,2.85,1000
02/10/2014,14:53:08,2.82,500
02/10/2014,14:53:24,2.85,500
02/10/2014,14:53:43,2.84,5400
02/10/2014,14:53:48,2.85,100
02/10/2014,15:00:48,2.99,64
02/10/2014,15:04:08,2.99,412
02/10/2014,15:11:42,2.99,100
02/10/2014,15:11:46,2.99,100
02/10/2014,15:12:06,2.99,100
02/10/2014,15:20:35,3.04,500
02/10/2014,15:30:28,3,500
02/10/2014,15:36:58,2.95,2000
02/10/2014,15:38:09,3,550
02/10/2014,15:39:48,2.97,2000
02/11/2014,09:30:04,3.2,100
02/11/2014,09:30:18,3.2,2000
02/11/2014,10:03:07,3.18,1000
02/11/2014,10:21:35,3.18,26
02/11/2014,10:27:09,3.15,500
02/11/2014,10:37:22,3.15,1108
02/11/2014,10:37:22,3.15,1054
02/11/2014,10:37:23,3.1,100
02/11/2014,10:42:26,3.05,1000
02/11/2014,10:42:57,3.02,1000
02/11/2014,10:43:29,3.02,1000
02/11/2014,10:48:27,3.02,100
02/11/2014,10:50:36,3.01,1000
02/11/2014,10:51:33,3.01,1000
02/11/2014,10:51:43,3.01,1000
02/11/2014,10:52:17,3.01,1000
02/11/2014,10:53:55,3.01,500
02/11/2014,10:54:31,3.05,40
02/11/2014,10:55:41,3.01,100
02/11/2014,10:55:44,3,3300
02/11/2014,10:55:44,3,100
02/11/2014,10:55:44,3,5000
02/11/2014,10:55:44,3,230
02/11/2014,10:56:21,3,100
02/11/2014,11:01:20,3,100
02/11/2014,11:01:21,3,50
02/11/2014,11:17:30,2.99,600
02/11/2014,11:17:34,3,500
02/11/2014,11:18:49,2.99,3000
02/11/2014,11:25:55,3.03,500
02/11/2014,11:29:59,2.99,400
02/11/2014,11:30:08,2.99,100
02/11/2014,11:30:18,2.99,100
02/11/2014,11:30:46,2.99,200
02/11/2014,11:38:48,2.95,100
02/11/2014,11:44:55,2.98,325
02/11/2014,12:32:09,3,500
02/11/2014,12:32:55,3,50
02/11/2014,13:15:49,3.1,1000
02/11/2014,14:16:16,3.05,350
02/11/2014,14:29:12,2.99,650
02/11/2014,14:32:23,2.99,335
02/11/2014,14:32:29,2.99,500
02/11/2014,15:25:01,3,1000
02/11/2014,15:49:37,3,500
02/11/2014,15:51:08,2.98,300
02/12/2014,08:46:23,3,1500
02/12/2014,09:10:01,3,2000
02/12/2014,09:21:31,3.1,1500
02/12/2014,09:26:33,3.2,2000
02/12/2014,09:27:58,3.2,2500
02/12/2014,09:30:00,3.2,2000
02/12/2014,09:30:00,3.2,10000
02/12/2014,09:30:01,3.2,500
02/12/2014,09:30:02,3.2,30
02/12/2014,09:30:18,3.2,30
02/12/2014,09:40:51,3.05,100
02/12/2014,09:40:52,3.05,1250
02/12/2014,09:41:01,3.05,806
02/12/2014,09:41:11,3,100
02/12/2014,09:43:48,2.98,1000
02/12/2014,09:44:22,3,4000
02/12/2014,09:44:27,2.98,1000
02/12/2014,09:44:31,2.98,2900
02/12/2014,09:47:43,2.98,110
02/12/2014,09:50:49,2.96,100
02/12/2014,09:50:51,2.8,750
02/12/2014,09:51:11,2.95,100
02/12/2014,09:55:35,2.95,1050
02/12/2014,09:55:56,2.95,100
02/12/2014,09:56:29,3,100"

¿Fue útil?

Solución

Quizás este no sea el código óptimo, pero al menos supone una mejora de aproximadamente un factor de 4. Sigo usando el bucle for, pero reemplacé parte de la indexación del data frame por vectores y la función aggregate por dplyr.

library(dplyr)
library(microbenchmark)

# Benchmark the row-by-row data frame implementation (`original`) against a
# version that loops over plain vectors and aggregates with dplyr
# (`beginneR`). With unit = "relative" the timings are reported as ratios
# between the expressions rather than in absolute time units.
microbenchmark(
  original = {
    # Read the CSV text held in `ZZ` and give the four columns fixed names.
    data1 <- read.table(text = ZZ, sep = ",", header = TRUE)
    colnames(data1) <- c("Date", "Time", "Price", "Volume")

    # Column that will hold the bin index of every row.
    data1[, "volBinIdx"] <- NA

    # Current bin number and the volume accumulated in that bin so far.
    volBin <- 1
    sumVol <- 0

    # Baseline arm: reads and writes single data frame cells on every
    # iteration. This access pattern is kept as is because it is what the
    # benchmark compares against.
    for (i in seq_len(nrow(data1))) {
      sumVol <- sumVol + data1[i, "Volume"]
      if (sumVol <= 5000) {
        data1[i, "volBinIdx"] <- volBin
      } else {
        # Adding this row overflows the current bin: open the next bin and
        # restart the running total from this row's volume.
        volBin <- volBin + 1
        data1[i, "volBinIdx"] <- volBin
        sumVol <- data1[i, "Volume"]
      }
    }

    # Aggregate by bin; first() and last() come from the attached dplyr.
    a1 <- aggregate(
      data1$Price,
      list(bin = data1$volBinIdx),
      function(x) cbind(first(x), max(x), min(x), last(x))
    )
    a2 <- aggregate(
      data1$Time,
      list(bin = data1$volBinIdx),
      function(x) first(x)
    )
    a3 <- aggregate(
      data1$Date,
      list(bin = data1$volBinIdx),
      function(x) first(x)
    )

    # Assemble one row per bin.
    x3 <- cbind(a3[, 2, drop = FALSE], a2[, 2, drop = FALSE], a1[, 2])
    colnames(x3) <- c("Date", "Time", "Open", "High", "Low", "Close")
  },

  beginneR = {
    # Read the CSV text held in `ZZ` and give the four columns fixed names.
    data1 <- read.table(text = ZZ, sep = ",", header = TRUE)
    colnames(data1) <- c("Date", "Time", "Price", "Volume")

    # Current bin number and the volume accumulated in that bin so far.
    volBin <- 1
    sumVol <- 0

    # Work on plain vectors inside the loop instead of indexing the data
    # frame; the result vector is preallocated to its final length.
    Volume <- data1$Volume
    volBinIdx <- numeric(nrow(data1))

    for (i in seq_len(nrow(data1))) {
      sumVol <- sumVol + Volume[i]
      if (sumVol <= 5000) {
        volBinIdx[i] <- volBin
      } else {
        # Adding this row overflows the current bin: open the next bin and
        # restart the running total from this row's volume.
        volBin <- volBin + 1
        volBinIdx[i] <- volBin
        sumVol <- Volume[i]
      }
    }

    # One summary row per bin: date and time of the first trade, then the
    # open/high/low/close prices; the bin index itself is dropped.
    data1 <- data1 %>%
      mutate(volBinIdx = volBinIdx) %>%
      group_by(volBinIdx) %>%
      summarize(
        Date = head(Date, 1),
        Time = head(Time, 1),
        Open = head(Price, 1),
        High = max(Price),
        Low = min(Price),
        Close = tail(Price, 1)
      ) %>%
      select(-volBinIdx)
  },
  unit = "relative"
)

#    Unit: relative
#    expr      min      lq   median       uq       max neval
#original 4.180704 4.24341 4.254675 4.129769 0.7706553   100
#beginneR 1.000000 1.00000 1.000000 1.000000 1.0000000   100

Licenciado bajo: CC-BY-SA con atribución
No afiliado a StackOverflow
scroll top