¿Cómo agregar datos del mercado de valores por tamaño fijo de volumen?
-
20-12-2019 - |
Pregunta
Objetivo: Datos de la bolsa de valores por intervalos de volumen de 5000 acciones
Formato de datos: fecha, hora, precio, volumen
Mi código es realmente lento en un marco de datos de 1 millón de filas, ¿hay una forma más rápida de hacerlo? He incluido mi código y el conjunto de datos que utilicé. ¡Gracias por su ayuda!
Mi código:
# Read the comma-separated trade records held in the string ZZ and give the
# four columns the names used by the rest of the script.
data1 <- read.table(text = ZZ, sep = ",", header = TRUE)
colnames(data1) <- c("Date", "Time", "Price", "Volume")
# Column that will receive the volume-bin index of every trade.
data1[, "volBinIdx"] <- NA
# volBin is the current bin number; sumVol is the volume accumulated in it.
volBin <- 1
sumVol <- 0
# Assign each trade to a bin. A trade stays in the current bin while the
# running volume is at most 5000; the trade that pushes it above 5000 opens
# the next bin and its own volume becomes that bin's starting sum.
# seq_len() is used instead of 1:nrow() so a table without rows does not
# iterate over c(1, 0).
# NOTE: indexing the data frame row by row is what makes this loop slow on
# large inputs; it is kept here because that is the code the question asks about.
for (i in seq_len(nrow(data1))) {
  sumVol <- sumVol + data1[i, "Volume"]
  if (sumVol <= 5000) {
    data1[i, "volBinIdx"] <- volBin
  } else {
    volBin <- volBin + 1
    data1[i, "volBinIdx"] <- volBin
    sumVol <- data1[i, "Volume"]
  }
}
# Aggregate per bin. head(x, 1) / tail(x, 1) replace first() / last(), which
# are not base R functions and are undefined unless a package providing them
# is attached.
# a1: open (first), high (max), low (min) and close (last) price of each bin.
a1 <- aggregate(
  data1$Price,
  list(bin = data1$volBinIdx),
  function(x) cbind(head(x, 1), max(x), min(x), tail(x, 1))
)
# a2 / a3: time and date of the first trade of each bin.
a2 <- aggregate(data1$Time, list(bin = data1$volBinIdx), function(x) head(x, 1))
a3 <- aggregate(data1$Date, list(bin = data1$volBinIdx), function(x) head(x, 1))
# Combine date, time and the four price columns into one data frame.
x3 <- cbind(a3[, 2, drop = FALSE], a2[, 2, drop = FALSE], a1[, 2])
colnames(x3) <- c("Date", "Time", "Open", "High", "Low", "Close")
Mi conjunto de datos:
ZZ<-"
Date,Time,Price,Size
02/07/2014,09:30:01,3,500
02/07/2014,09:30:29,3,42
02/07/2014,09:35:56,3,100
02/07/2014,09:37:17,3,100
02/07/2014,09:37:28,3.2,900
02/07/2014,09:37:35,3.2,4900
02/07/2014,09:37:51,3.2,1000
02/07/2014,09:42:11,3.2,500
02/07/2014,10:00:31,3,2400
02/07/2014,10:00:37,3.2,500
02/07/2014,10:00:44,3.2,3347
02/07/2014,10:07:33,3.2,1000
02/07/2014,10:31:42,3.24,1000
02/07/2014,10:33:44,3.24,200
02/07/2014,10:40:28,3.25,300
02/07/2014,10:49:57,3.25,600
02/07/2014,10:53:16,3.25,100
02/07/2014,10:53:32,3.4,1000
02/07/2014,10:54:13,3.4,500
02/07/2014,11:05:37,3.35,1000
02/07/2014,11:11:29,3.25,600
02/07/2014,11:15:26,3.3,60
02/07/2014,11:19:16,3.3,23
02/07/2014,11:21:14,3.25,100
02/07/2014,11:21:22,3.25,100
02/07/2014,11:21:30,3.2,500
02/07/2014,11:21:35,3.2,500
02/07/2014,11:21:43,3.2,500
02/07/2014,11:29:58,3.1,200
02/07/2014,11:35:42,3.19,360
02/07/2014,11:39:51,3.19,1000
02/07/2014,11:52:39,3.15,200
02/07/2014,11:53:51,3.15,100
02/07/2014,11:55:11,3.2,100
02/07/2014,12:17:32,3.2,1500
02/07/2014,12:35:42,3.24,1200
02/07/2014,12:37:53,3.24,100
02/07/2014,12:38:02,3.24,3500
02/07/2014,12:53:57,3.24,400
02/07/2014,13:10:57,3.239,100
02/07/2014,13:11:35,3.24,800
02/07/2014,13:13:41,3.24,1000
02/07/2014,13:39:40,3.24,450
02/07/2014,13:56:04,3.24,500
02/07/2014,14:09:49,3.24,600
02/07/2014,14:11:25,3.24,1000
02/07/2014,14:25:53,3.24,25
02/07/2014,14:30:58,3.24,30
02/07/2014,14:31:36,3.24,30
02/07/2014,14:32:12,3.24,30
02/07/2014,14:33:00,3.24,100
02/07/2014,14:34:49,3.24,1100
02/07/2014,14:36:02,3.24,2000
02/07/2014,14:37:07,3.22,1500
02/07/2014,14:42:30,3.22,3300
02/07/2014,14:42:46,3.22,100
02/07/2014,14:42:54,3.2,1000
02/07/2014,14:53:13,3.23,240
02/07/2014,14:53:27,3.24,500
02/07/2014,14:53:59,3.24,60
02/07/2014,14:54:46,3.2,1500
02/07/2014,14:57:45,3.2,160
02/07/2014,14:57:46,3.2,125
02/07/2014,14:57:54,3.2,100
02/07/2014,15:05:56,3.19,100
02/07/2014,15:22:21,3.19,300
02/07/2014,15:22:28,3.18,150
02/07/2014,15:23:09,3.19,2000
02/07/2014,15:35:23,3.18,1500
02/07/2014,15:44:36,3.18,600
02/10/2014,09:30:02,3.25,100
02/10/2014,09:30:02,3.25,25
02/10/2014,09:30:24,3.25,150
02/10/2014,09:30:40,3.25,100
02/10/2014,09:31:11,3.25,650
02/10/2014,09:35:32,3.24,200
02/10/2014,09:37:59,3.19,100
02/10/2014,09:38:01,3.2,2000
02/10/2014,09:38:09,3.18,185
02/10/2014,09:38:36,3.18,500
02/10/2014,09:39:13,3.18,1042
02/10/2014,09:39:18,3.18,156
02/10/2014,09:39:18,3.17,20
02/10/2014,09:41:24,3.15,100
02/10/2014,09:42:28,3.15,1000
02/10/2014,09:42:28,3.15,1000
02/10/2014,09:42:41,3.15,500
02/10/2014,09:42:57,3.15,100
02/10/2014,09:43:24,3.12,500
02/10/2014,09:43:29,3.12,100
02/10/2014,09:43:32,3.1,5000
02/10/2014,09:44:02,3.1,500
02/10/2014,09:44:19,3.1,500
02/10/2014,09:44:22,3.09,100
02/10/2014,09:44:22,3.09,96
02/10/2014,09:44:55,3.05,100
02/10/2014,09:45:11,3.05,676
02/10/2014,09:45:23,3,150
02/10/2014,09:45:44,2.95,1000
02/10/2014,09:45:53,2.95,1500
02/10/2014,09:47:17,2.95,100
02/10/2014,09:47:46,2.9,100
02/10/2014,09:48:24,2.9,500
02/10/2014,09:48:50,2.9,100
02/10/2014,09:49:11,2.85,386
02/10/2014,09:49:13,2.85,100
02/10/2014,09:49:14,2.8,200
02/10/2014,09:49:15,2.7,100
02/10/2014,09:49:22,2.7,100
02/10/2014,09:49:32,2.7,100
02/10/2014,09:50:09,2.65,2500
02/10/2014,09:50:44,2.66,2500
02/10/2014,09:50:49,2.6,100
02/10/2014,09:50:53,2.7,240
02/10/2014,09:50:54,2.61,1000
02/10/2014,09:50:58,2.65,414
02/10/2014,09:55:24,2.95,100
02/10/2014,09:57:22,2.95,400
02/10/2014,10:07:21,2.95,400
02/10/2014,10:16:28,2.95,250
02/10/2014,10:21:20,2.85,300
02/10/2014,10:32:40,2.94,100
02/10/2014,10:33:18,2.95,426
02/10/2014,10:33:38,2.95,70
02/10/2014,10:33:39,2.94,1900
02/10/2014,10:43:46,2.95,4500
02/10/2014,10:44:00,2.99,200
02/10/2014,10:44:20,2.99,505
02/10/2014,10:49:30,2.96,500
02/10/2014,10:57:22,2.95,2500
02/10/2014,10:57:25,2.95,500
02/10/2014,10:57:40,2.95,500
02/10/2014,11:38:29,3,500
02/10/2014,11:38:35,3.05,500
02/10/2014,11:38:45,3.1,1000
02/10/2014,11:45:08,3.05,100
02/10/2014,11:49:55,3.01,100
02/10/2014,11:50:14,3,1900
02/10/2014,11:50:18,3,100
02/10/2014,12:07:51,3,1000
02/10/2014,12:33:26,3,400
02/10/2014,13:57:20,3.1,150
02/10/2014,13:57:34,3,42
02/10/2014,14:21:42,3.15,500
02/10/2014,14:23:35,3.15,1000
02/10/2014,14:25:40,3.05,200
02/10/2014,14:26:01,3.15,100
02/10/2014,14:50:50,3.15,100
02/10/2014,14:51:00,3.1,100
02/10/2014,14:51:09,3.1,100
02/10/2014,14:51:24,3.05,500
02/10/2014,14:51:43,3,100
02/10/2014,14:52:04,2.95,100
02/10/2014,14:52:15,2.99,25
02/10/2014,14:52:17,2.95,100
02/10/2014,14:52:33,2.9,500
02/10/2014,14:52:47,2.95,600
02/10/2014,14:52:49,2.85,100
02/10/2014,14:52:51,2.85,1000
02/10/2014,14:53:08,2.82,500
02/10/2014,14:53:24,2.85,500
02/10/2014,14:53:43,2.84,5400
02/10/2014,14:53:48,2.85,100
02/10/2014,15:00:48,2.99,64
02/10/2014,15:04:08,2.99,412
02/10/2014,15:11:42,2.99,100
02/10/2014,15:11:46,2.99,100
02/10/2014,15:12:06,2.99,100
02/10/2014,15:20:35,3.04,500
02/10/2014,15:30:28,3,500
02/10/2014,15:36:58,2.95,2000
02/10/2014,15:38:09,3,550
02/10/2014,15:39:48,2.97,2000
02/11/2014,09:30:04,3.2,100
02/11/2014,09:30:18,3.2,2000
02/11/2014,10:03:07,3.18,1000
02/11/2014,10:21:35,3.18,26
02/11/2014,10:27:09,3.15,500
02/11/2014,10:37:22,3.15,1108
02/11/2014,10:37:22,3.15,1054
02/11/2014,10:37:23,3.1,100
02/11/2014,10:42:26,3.05,1000
02/11/2014,10:42:57,3.02,1000
02/11/2014,10:43:29,3.02,1000
02/11/2014,10:48:27,3.02,100
02/11/2014,10:50:36,3.01,1000
02/11/2014,10:51:33,3.01,1000
02/11/2014,10:51:43,3.01,1000
02/11/2014,10:52:17,3.01,1000
02/11/2014,10:53:55,3.01,500
02/11/2014,10:54:31,3.05,40
02/11/2014,10:55:41,3.01,100
02/11/2014,10:55:44,3,3300
02/11/2014,10:55:44,3,100
02/11/2014,10:55:44,3,5000
02/11/2014,10:55:44,3,230
02/11/2014,10:56:21,3,100
02/11/2014,11:01:20,3,100
02/11/2014,11:01:21,3,50
02/11/2014,11:17:30,2.99,600
02/11/2014,11:17:34,3,500
02/11/2014,11:18:49,2.99,3000
02/11/2014,11:25:55,3.03,500
02/11/2014,11:29:59,2.99,400
02/11/2014,11:30:08,2.99,100
02/11/2014,11:30:18,2.99,100
02/11/2014,11:30:46,2.99,200
02/11/2014,11:38:48,2.95,100
02/11/2014,11:44:55,2.98,325
02/11/2014,12:32:09,3,500
02/11/2014,12:32:55,3,50
02/11/2014,13:15:49,3.1,1000
02/11/2014,14:16:16,3.05,350
02/11/2014,14:29:12,2.99,650
02/11/2014,14:32:23,2.99,335
02/11/2014,14:32:29,2.99,500
02/11/2014,15:25:01,3,1000
02/11/2014,15:49:37,3,500
02/11/2014,15:51:08,2.98,300
02/12/2014,08:46:23,3,1500
02/12/2014,09:10:01,3,2000
02/12/2014,09:21:31,3.1,1500
02/12/2014,09:26:33,3.2,2000
02/12/2014,09:27:58,3.2,2500
02/12/2014,09:30:00,3.2,2000
02/12/2014,09:30:00,3.2,10000
02/12/2014,09:30:01,3.2,500
02/12/2014,09:30:02,3.2,30
02/12/2014,09:30:18,3.2,30
02/12/2014,09:40:51,3.05,100
02/12/2014,09:40:52,3.05,1250
02/12/2014,09:41:01,3.05,806
02/12/2014,09:41:11,3,100
02/12/2014,09:43:48,2.98,1000
02/12/2014,09:44:22,3,4000
02/12/2014,09:44:27,2.98,1000
02/12/2014,09:44:31,2.98,2900
02/12/2014,09:47:43,2.98,110
02/12/2014,09:50:49,2.96,100
02/12/2014,09:50:51,2.8,750
02/12/2014,09:51:11,2.95,100
02/12/2014,09:55:35,2.95,1050
02/12/2014,09:55:56,2.95,100
02/12/2014,09:56:29,3,100"
Solución
Quizás este no sea el código óptimo, pero al menos supone una mejora de aproximadamente un factor 4. Sigo usando el bucle for, pero reemplacé parte de la indexación del data.frame por vectores y el aggregate por dplyr.
library(dplyr)
library(microbenchmark)
# Benchmark two implementations of the same task: assign every trade to a
# volume bin and summarise each bin as Date, Time, Open, High, Low, Close.
# Both expressions re-read the data from the string ZZ on every run.
# unit = "relative" is passed to microbenchmark(); the printed summary below
# is labelled "Unit: relative".
microbenchmark(
# ---- original: the code from the question ----
original = {
data1<-read.table(text=ZZ,sep=',',header=T)
colnames(data1)<-c("Date","Time","Price","Volume")
# add an empty column that will hold the bin index of every row
data1[,"volBinIdx"]<-NA
# volBin: current bin number; sumVol: volume accumulated in the current bin
volBin<-1
sumVol<-0
# walk the rows, reading and writing the data frame one cell at a time
for(i in 1:nrow(data1))
{
sumVol<-sumVol + data1[i,"Volume"]
if (sumVol<= 5000) {
# running volume still within 5000: row stays in the current bin
data1[i,"volBinIdx"]<-volBin
} else {
# running volume exceeded 5000: this row opens the next bin and its own
# volume becomes the new running sum
volBin<-(volBin+1)
data1[i,"volBinIdx"]<-volBin
sumVol<-data1[i,'Volume']
}
}
# aggregate by volBinIdx: first/max/min/last price, first time, first date.
# first() and last() are not base R; presumably dplyr's, attached by the
# library(dplyr) call above.
a1<-aggregate(data1$Price,list(bin=data1$volBinIdx),function(x) cbind( first(x),max(x),min(x),last(x)))
a2<-aggregate(data1$Time,list(bin=data1$volBinIdx),function(x) first(x))
a3<-aggregate(data1$Date,list(bin=data1$volBinIdx),function(x) first(x))
# bind the per-bin date, time and price columns into one data frame
x3<-cbind(a3[,2,drop=F],a2[,2,drop=F],a1[,2])
colnames(x3)<-c("Date","Time","Open","High","Low","Close")
},
# ---- beginneR: same binning rule, but the loop works on plain vectors and
# the aggregation is done with dplyr ----
beginneR = {
data1<-read.table(text=ZZ,sep=',',header=T)
colnames(data1)<-c("Date","Time","Price","Volume")
# volBin: current bin number; sumVol: volume accumulated in the current bin
volBin<-1
sumVol<-0
# copy the Volume column into a vector so the loop never indexes the data frame
Volume <- data1$Volume
# preallocated result vector, one slot per row
volBinIdx <- numeric(nrow(data1))
# assign a bin index to every row (same rule as in "original")
for(i in seq_len(nrow(data1))){
sumVol <- sumVol + Volume[i]
if (sumVol <= 5000) {
volBinIdx[i] <- volBin
} else {
# chained assignment: increment volBin and store the new value at position i
volBinIdx[i] <- volBin <- volBin + 1
sumVol <- Volume[i]
}
}
# attach the bin index, summarise each bin, then drop the index column
data1 <- data1 %>%
mutate(volBinIdx = volBinIdx) %>%
group_by(volBinIdx) %>%
# Date/Time/Open come from the first row of the bin, Close from the last row
summarize(Date = head(Date, 1),
Time = head(Time, 1),
Open = head(Price, 1),
High = max(Price),
Low = min(Price),
Close = tail(Price, 1)) %>%
select(-volBinIdx)
}, unit = "relative")
# Unit: relative
# expr min lq median uq max neval
#original 4.180704 4.24341 4.254675 4.129769 0.7706553 100
#beginneR 1.000000 1.00000 1.000000 1.000000 1.0000000 100
Licenciado bajo: CC-BY-SA con atribución
No afiliado a StackOverflow