كيفية تجميع بيانات سوق الأوراق المالية حسب حجم تداول ثابت؟

StackOverflow https://stackoverflow.com/questions/25008172

سؤال

الهدف: تقسيم بيانات سوق الأوراق المالية إلى شرائح حجمية يبلغ كل منها 5000 سهم

تنسيق البيانات: التاريخ والوقت والسعر والحجم

الكود الخاص بي بطيء حقًا على إطار بيانات مكوّن من مليون صف، فهل هناك طريقة أسرع للقيام بذلك؟ لقد قمت بتضمين الكود الخاص بي ومجموعة البيانات التي استخدمتها. شكرًا لكم على مساعدتكم!

الكود الخاص بي:

# Slice tick data into fixed-volume bins and summarise every bin as one
# Date / Time / Open / High / Low / Close row (result: `x3`).
# Expects `ZZ` (the CSV text listed under the dataset heading) to be defined
# before this script runs.

# read data; the fourth CSV column ("Size") is renamed to "Volume"
data1 <- read.table(text = ZZ, sep = ",", header = TRUE)
colnames(data1) <- c("Date", "Time", "Price", "Volume")

# number of shares that closes a volume bin
volBinSize <- 5000

# create the column that will hold the bin index of every trade
data1[, "volBinIdx"] <- NA

# running state: current bin number and the volume accumulated in it
volBin <- 1
sumVol <- 0

# assign every trade to a volume bin
# seq_len() avoids iterating over c(1, 0) when the table has no rows.
# NOTE: the row-by-row write into the data frame is kept on purpose; it is the
# slow part this question is asking about.
for (i in seq_len(nrow(data1))) {
  sumVol <- sumVol + data1[i, "Volume"]
  if (sumVol > volBinSize) {
    # this trade overflows the current bin: open a new bin that starts with it
    volBin <- volBin + 1
    sumVol <- data1[i, "Volume"]
  }
  data1[i, "volBinIdx"] <- volBin
}

# aggregate data by volBinIdx
# head(x, 1) / tail(x, 1) replace first(x) / last(x): those are not base R and
# no package providing them is attached in this script.
a1 <- aggregate(
  data1$Price,
  list(bin = data1$volBinIdx),
  function(x) cbind(head(x, 1), max(x), min(x), tail(x, 1))
)
a2 <- aggregate(data1$Time, list(bin = data1$volBinIdx), function(x) head(x, 1))
a3 <- aggregate(data1$Date, list(bin = data1$volBinIdx), function(x) head(x, 1))

# create a data frame: first Date and Time of each bin plus the four prices
x3 <- cbind(a3[, 2, drop = FALSE], a2[, 2, drop = FALSE], a1[, 2])
colnames(x3) <- c("Date", "Time", "Open", "High", "Low", "Close")

مجموعة البيانات الخاصة بي:

ZZ<-"
Date,Time,Price,Size
02/07/2014,09:30:01,3,500
02/07/2014,09:30:29,3,42
02/07/2014,09:35:56,3,100
02/07/2014,09:37:17,3,100
02/07/2014,09:37:28,3.2,900
02/07/2014,09:37:35,3.2,4900
02/07/2014,09:37:51,3.2,1000
02/07/2014,09:42:11,3.2,500
02/07/2014,10:00:31,3,2400
02/07/2014,10:00:37,3.2,500
02/07/2014,10:00:44,3.2,3347
02/07/2014,10:07:33,3.2,1000
02/07/2014,10:31:42,3.24,1000
02/07/2014,10:33:44,3.24,200
02/07/2014,10:40:28,3.25,300
02/07/2014,10:49:57,3.25,600
02/07/2014,10:53:16,3.25,100
02/07/2014,10:53:32,3.4,1000
02/07/2014,10:54:13,3.4,500
02/07/2014,11:05:37,3.35,1000
02/07/2014,11:11:29,3.25,600
02/07/2014,11:15:26,3.3,60
02/07/2014,11:19:16,3.3,23
02/07/2014,11:21:14,3.25,100
02/07/2014,11:21:22,3.25,100
02/07/2014,11:21:30,3.2,500
02/07/2014,11:21:35,3.2,500
02/07/2014,11:21:43,3.2,500
02/07/2014,11:29:58,3.1,200
02/07/2014,11:35:42,3.19,360
02/07/2014,11:39:51,3.19,1000
02/07/2014,11:52:39,3.15,200
02/07/2014,11:53:51,3.15,100
02/07/2014,11:55:11,3.2,100
02/07/2014,12:17:32,3.2,1500
02/07/2014,12:35:42,3.24,1200
02/07/2014,12:37:53,3.24,100
02/07/2014,12:38:02,3.24,3500
02/07/2014,12:53:57,3.24,400
02/07/2014,13:10:57,3.239,100
02/07/2014,13:11:35,3.24,800
02/07/2014,13:13:41,3.24,1000
02/07/2014,13:39:40,3.24,450
02/07/2014,13:56:04,3.24,500
02/07/2014,14:09:49,3.24,600
02/07/2014,14:11:25,3.24,1000
02/07/2014,14:25:53,3.24,25
02/07/2014,14:30:58,3.24,30
02/07/2014,14:31:36,3.24,30
02/07/2014,14:32:12,3.24,30
02/07/2014,14:33:00,3.24,100
02/07/2014,14:34:49,3.24,1100
02/07/2014,14:36:02,3.24,2000
02/07/2014,14:37:07,3.22,1500
02/07/2014,14:42:30,3.22,3300
02/07/2014,14:42:46,3.22,100
02/07/2014,14:42:54,3.2,1000
02/07/2014,14:53:13,3.23,240
02/07/2014,14:53:27,3.24,500
02/07/2014,14:53:59,3.24,60
02/07/2014,14:54:46,3.2,1500
02/07/2014,14:57:45,3.2,160
02/07/2014,14:57:46,3.2,125
02/07/2014,14:57:54,3.2,100
02/07/2014,15:05:56,3.19,100
02/07/2014,15:22:21,3.19,300
02/07/2014,15:22:28,3.18,150
02/07/2014,15:23:09,3.19,2000
02/07/2014,15:35:23,3.18,1500
02/07/2014,15:44:36,3.18,600
02/10/2014,09:30:02,3.25,100
02/10/2014,09:30:02,3.25,25
02/10/2014,09:30:24,3.25,150
02/10/2014,09:30:40,3.25,100
02/10/2014,09:31:11,3.25,650
02/10/2014,09:35:32,3.24,200
02/10/2014,09:37:59,3.19,100
02/10/2014,09:38:01,3.2,2000
02/10/2014,09:38:09,3.18,185
02/10/2014,09:38:36,3.18,500
02/10/2014,09:39:13,3.18,1042
02/10/2014,09:39:18,3.18,156
02/10/2014,09:39:18,3.17,20
02/10/2014,09:41:24,3.15,100
02/10/2014,09:42:28,3.15,1000
02/10/2014,09:42:28,3.15,1000
02/10/2014,09:42:41,3.15,500
02/10/2014,09:42:57,3.15,100
02/10/2014,09:43:24,3.12,500
02/10/2014,09:43:29,3.12,100
02/10/2014,09:43:32,3.1,5000
02/10/2014,09:44:02,3.1,500
02/10/2014,09:44:19,3.1,500
02/10/2014,09:44:22,3.09,100
02/10/2014,09:44:22,3.09,96
02/10/2014,09:44:55,3.05,100
02/10/2014,09:45:11,3.05,676
02/10/2014,09:45:23,3,150
02/10/2014,09:45:44,2.95,1000
02/10/2014,09:45:53,2.95,1500
02/10/2014,09:47:17,2.95,100
02/10/2014,09:47:46,2.9,100
02/10/2014,09:48:24,2.9,500
02/10/2014,09:48:50,2.9,100
02/10/2014,09:49:11,2.85,386
02/10/2014,09:49:13,2.85,100
02/10/2014,09:49:14,2.8,200
02/10/2014,09:49:15,2.7,100
02/10/2014,09:49:22,2.7,100
02/10/2014,09:49:32,2.7,100
02/10/2014,09:50:09,2.65,2500
02/10/2014,09:50:44,2.66,2500
02/10/2014,09:50:49,2.6,100
02/10/2014,09:50:53,2.7,240
02/10/2014,09:50:54,2.61,1000
02/10/2014,09:50:58,2.65,414
02/10/2014,09:55:24,2.95,100
02/10/2014,09:57:22,2.95,400
02/10/2014,10:07:21,2.95,400
02/10/2014,10:16:28,2.95,250
02/10/2014,10:21:20,2.85,300
02/10/2014,10:32:40,2.94,100
02/10/2014,10:33:18,2.95,426
02/10/2014,10:33:38,2.95,70
02/10/2014,10:33:39,2.94,1900
02/10/2014,10:43:46,2.95,4500
02/10/2014,10:44:00,2.99,200
02/10/2014,10:44:20,2.99,505
02/10/2014,10:49:30,2.96,500
02/10/2014,10:57:22,2.95,2500
02/10/2014,10:57:25,2.95,500
02/10/2014,10:57:40,2.95,500
02/10/2014,11:38:29,3,500
02/10/2014,11:38:35,3.05,500
02/10/2014,11:38:45,3.1,1000
02/10/2014,11:45:08,3.05,100
02/10/2014,11:49:55,3.01,100
02/10/2014,11:50:14,3,1900
02/10/2014,11:50:18,3,100
02/10/2014,12:07:51,3,1000
02/10/2014,12:33:26,3,400
02/10/2014,13:57:20,3.1,150
02/10/2014,13:57:34,3,42
02/10/2014,14:21:42,3.15,500
02/10/2014,14:23:35,3.15,1000
02/10/2014,14:25:40,3.05,200
02/10/2014,14:26:01,3.15,100
02/10/2014,14:50:50,3.15,100
02/10/2014,14:51:00,3.1,100
02/10/2014,14:51:09,3.1,100
02/10/2014,14:51:24,3.05,500
02/10/2014,14:51:43,3,100
02/10/2014,14:52:04,2.95,100
02/10/2014,14:52:15,2.99,25
02/10/2014,14:52:17,2.95,100
02/10/2014,14:52:33,2.9,500
02/10/2014,14:52:47,2.95,600
02/10/2014,14:52:49,2.85,100
02/10/2014,14:52:51,2.85,1000
02/10/2014,14:53:08,2.82,500
02/10/2014,14:53:24,2.85,500
02/10/2014,14:53:43,2.84,5400
02/10/2014,14:53:48,2.85,100
02/10/2014,15:00:48,2.99,64
02/10/2014,15:04:08,2.99,412
02/10/2014,15:11:42,2.99,100
02/10/2014,15:11:46,2.99,100
02/10/2014,15:12:06,2.99,100
02/10/2014,15:20:35,3.04,500
02/10/2014,15:30:28,3,500
02/10/2014,15:36:58,2.95,2000
02/10/2014,15:38:09,3,550
02/10/2014,15:39:48,2.97,2000
02/11/2014,09:30:04,3.2,100
02/11/2014,09:30:18,3.2,2000
02/11/2014,10:03:07,3.18,1000
02/11/2014,10:21:35,3.18,26
02/11/2014,10:27:09,3.15,500
02/11/2014,10:37:22,3.15,1108
02/11/2014,10:37:22,3.15,1054
02/11/2014,10:37:23,3.1,100
02/11/2014,10:42:26,3.05,1000
02/11/2014,10:42:57,3.02,1000
02/11/2014,10:43:29,3.02,1000
02/11/2014,10:48:27,3.02,100
02/11/2014,10:50:36,3.01,1000
02/11/2014,10:51:33,3.01,1000
02/11/2014,10:51:43,3.01,1000
02/11/2014,10:52:17,3.01,1000
02/11/2014,10:53:55,3.01,500
02/11/2014,10:54:31,3.05,40
02/11/2014,10:55:41,3.01,100
02/11/2014,10:55:44,3,3300
02/11/2014,10:55:44,3,100
02/11/2014,10:55:44,3,5000
02/11/2014,10:55:44,3,230
02/11/2014,10:56:21,3,100
02/11/2014,11:01:20,3,100
02/11/2014,11:01:21,3,50
02/11/2014,11:17:30,2.99,600
02/11/2014,11:17:34,3,500
02/11/2014,11:18:49,2.99,3000
02/11/2014,11:25:55,3.03,500
02/11/2014,11:29:59,2.99,400
02/11/2014,11:30:08,2.99,100
02/11/2014,11:30:18,2.99,100
02/11/2014,11:30:46,2.99,200
02/11/2014,11:38:48,2.95,100
02/11/2014,11:44:55,2.98,325
02/11/2014,12:32:09,3,500
02/11/2014,12:32:55,3,50
02/11/2014,13:15:49,3.1,1000
02/11/2014,14:16:16,3.05,350
02/11/2014,14:29:12,2.99,650
02/11/2014,14:32:23,2.99,335
02/11/2014,14:32:29,2.99,500
02/11/2014,15:25:01,3,1000
02/11/2014,15:49:37,3,500
02/11/2014,15:51:08,2.98,300
02/12/2014,08:46:23,3,1500
02/12/2014,09:10:01,3,2000
02/12/2014,09:21:31,3.1,1500
02/12/2014,09:26:33,3.2,2000
02/12/2014,09:27:58,3.2,2500
02/12/2014,09:30:00,3.2,2000
02/12/2014,09:30:00,3.2,10000
02/12/2014,09:30:01,3.2,500
02/12/2014,09:30:02,3.2,30
02/12/2014,09:30:18,3.2,30
02/12/2014,09:40:51,3.05,100
02/12/2014,09:40:52,3.05,1250
02/12/2014,09:41:01,3.05,806
02/12/2014,09:41:11,3,100
02/12/2014,09:43:48,2.98,1000
02/12/2014,09:44:22,3,4000
02/12/2014,09:44:27,2.98,1000
02/12/2014,09:44:31,2.98,2900
02/12/2014,09:47:43,2.98,110
02/12/2014,09:50:49,2.96,100
02/12/2014,09:50:51,2.8,750
02/12/2014,09:51:11,2.95,100
02/12/2014,09:55:35,2.95,1050
02/12/2014,09:55:56,2.95,100
02/12/2014,09:56:29,3,100"
هل كانت مفيدة؟

الحل

ربما لا يكون هذا هو الكود الأمثل، ولكنه على الأقل أسرع بنحو 4 مرات. ما زلت أستخدم حلقة for، ولكني استبدلت بعض عمليات الاقتطاع من data.frame بمتجهات، واستبدلت التجميع بدوال dplyr.

library(dplyr)
library(microbenchmark)

# Benchmark two implementations of the 5000-share volume-bin aggregation.
# Both arms read the same CSV text from `ZZ` (defined elsewhere in this file)
# and are timed with unit = "relative", i.e. relative rather than absolute
# timings are reported.
microbenchmark(
  # Arm 1: the bin index is written into the data frame one row at a time and
  # the OHLC table is assembled from three aggregate() calls.
  original = {
    data1<-read.table(text=ZZ,sep=',',header=T)
    colnames(data1)<-c("Date","Time","Price","Volume")

    # create the column that will receive the bin index of each row
    data1[,"volBinIdx"]<-NA

    # running state: current bin number and volume accumulated in that bin
    volBin<-1
    sumVol<-0

    # walk the rows in order; a row whose volume pushes the running sum above
    # 5000 opens a new bin, and the running sum restarts at that row's volume
    for(i in 1:nrow(data1))
    {
      sumVol<-sumVol + data1[i,"Volume"]
      if (sumVol<= 5000) {
        data1[i,"volBinIdx"]<-volBin
      } else {
        volBin<-(volBin+1)
        data1[i,"volBinIdx"]<-volBin
        sumVol<-data1[i,'Volume']
      }
    }

    # aggregate data by volBinIdx: first/max/min/last price plus the first
    # Time and Date of every bin.
    # NOTE(review): first()/last() are not base R; presumably they resolve to
    # dplyr's versions because dplyr is attached above — confirm.
    a1<-aggregate(data1$Price,list(bin=data1$volBinIdx),function(x) cbind( first(x),max(x),min(x),last(x)))
    a2<-aggregate(data1$Time,list(bin=data1$volBinIdx),function(x) first(x))
    a3<-aggregate(data1$Date,list(bin=data1$volBinIdx),function(x) first(x))

    # bind the per-bin Date, Time and the four price columns into one data
    # frame and give the columns their final names
    x3<-cbind(a3[,2,drop=F],a2[,2,drop=F],a1[,2])
    colnames(x3)<-c("Date","Time","Open","High","Low","Close")
  },

  # Arm 2: same binning rule, but the loop reads from and writes to plain
  # vectors, and the per-bin summary is done with a dplyr pipeline.
  beginneR = {

    data1<-read.table(text=ZZ,sep=',',header=T)
    colnames(data1)<-c("Date","Time","Price","Volume")

    # running state, plus the Volume column pulled out as a vector and a
    # preallocated numeric vector for the bin indices
    volBin<-1
    sumVol<-0
    Volume <- data1$Volume
    volBinIdx <- numeric(nrow(data1))

    # same cutting rule as in the first arm, applied to the vectors

    for(i in seq_len(nrow(data1))){
      sumVol <- sumVol + Volume[i]
      if (sumVol <= 5000) {
        volBinIdx[i] <- volBin
      } else {
        # chained assignment: increment volBin and store the new value for row i
        volBinIdx[i] <-  volBin <- volBin + 1
        sumVol <- Volume[i]
      }
    }

    # attach the bin index, summarise every bin (first Date/Time, first, max,
    # min and last Price), then drop the bin index column from the result
    data1 <- data1 %>%
      mutate(volBinIdx = volBinIdx) %>%
      group_by(volBinIdx) %>%
      summarize(Date = head(Date, 1),
                Time = head(Time, 1),
                Open = head(Price, 1),
                High = max(Price),
                Low = min(Price),
                Close = tail(Price, 1)) %>% 
      select(-volBinIdx)

  }, unit = "relative")

#    Unit: relative
#    expr      min      lq   median       uq       max neval
#original 4.180704 4.24341 4.254675 4.129769 0.7706553   100
#beginneR 1.000000 1.00000 1.000000 1.000000 1.0000000   100
مرخصة بموجب: CC-BY-SA مع الإسناد
غير تابع لـ StackOverflow
scroll top