Question

I have a CSV file which contains lines for each (Java GC) Event I am interested in. The object consists of a subsecond timestamp (non equidistant) and some variables. The object looks like this:

gcdata <- read.table("http://bernd.eckenfels.net/view/gc1001.ygc.csv",header=TRUE,sep=",", dec=".")
start = as.POSIXct(strptime("2012-01-01 00:00:00", format="%Y-%m-%d %H:%M:%S"))
gcdata.date = gcdata$Timestamp + start
gcdata = gcdata[,2:7] # remove old date col
gcdata=data.frame(date=gcdata.date,gcdata)
str(gcdata)

Results in

'data.frame':   2997 obs. of  7 variables:
 $ date           : POSIXct, format: "2012-01-01 00:00:06" "2012-01-01 00:00:06" "2012-01-01 00:00:18" ...
 $ Distance.s.    : num  0 0.165 11.289 9.029 11.161 ...
 $ YGUsedBefore.K.: int  1610619 20140726 20148325 20213304 20310849 20404772 20561918 21115577 21479211 21544930 ...
 $ YGUsedAfter.K. : int  7990 15589 80568 178113 272036 429182 982841 1346475 1412181 1355412 ...
 $ Promoted.K.    : int  0 0 0 0 8226 937 65429 71166 62548 143638 ...
 $ YGCapacity.K.  : int  22649280 22649280 22649280 22649280 22649280 22649280 22649280 22649280 22649280 22649280 ...
 $ Pause.s.       : num  0.0379 0.022 0.0287 0.0509 0.109 ...

In this case I care about the Pause time (in seconds). I want to plot a diagram, which will show me for each (wall clock) hour basically the mean as a line, the 2% and 98% as a grey corridor and the max value (within each hour) as a red line.

I have done some work, but using the q98 functions is ugly, having to use multiple lines statements seems to be wastefull, and I dont know how to achieve a grey area between q02 and q98:

q02 <- function(x, ...) {  x <- quantile(x,probs=c(0.2)) }
q98 <- function(x, ...) {  x <- quantile(x,probs=c(0.98)) }
hours = droplevels(cut(gcdata$date, breaks="hours")) # can I have 2 hours?
plot(aggregate(gcdata$Pause.s. ~ hours, data=gcdata, FUN=max),ylim=c(0,2), col="red", ylab="Pause(s)", xlab="Days") # Is always black?
lines(aggregate(gcdata$Pause.s. ~ hours, data=gcdata, FUN=q98),ylim=c(0,2), col="green")
lines(aggregate(gcdata$Pause.s. ~ hours, data=gcdata, FUN=q02),ylim=c(0,2), col="green")
lines(aggregate(gcdata$Pause.s. ~ hours, data=gcdata, FUN=mean),ylim=c(0,2), col="blue")

Now this results in a chart which has black dots as maximum, a blue line as the hourly average and a lower and upper 0,2 + 0,98 green line. I think it would be better readable to have a grey corridor, maybe a dashed maximum (red) line and somehow fix the axis labels. Exported Chart (png) Any suggestions? (the file is available above)

Was it helpful?

Solution

good to see fellow Debian old-timers here :) Your answer is already pretty nice. As I happen to work a lot with time series, I thought I'd throw in a variant using the excellent zoo and xts packages. The latter builds on top of the former and has, among other things, the period.apply() function we can use here along with the endpoints() function to get you two-hourly aggregates.

So at the top I'd use

library(zoo)                                # for zoo objects
library(xts)                                # for period.apply

gcdata <- read.table("http://bernd.eckenfels.net/view/gc1001.ygc.csv",
                     header=TRUE, sep=",", dec=".")
timestamps <- gcdata$Timestamp + 
              as.POSIXct(strptime("2012-01-01 00:00:00", 
                         format="%Y-%m-%d %H:%M:%S"))
gcdatazoo <- zoo(gcdata[-1], order.by=timestamps)    # as zoo object

to create a zoo object. Your function remains:

plotAreaCorridor <- function(x, y, col.poly1="lightgray", col.poly2="gray",...) {
    x.pol <- c(x, rev(x), x[1])
    y.pol <- c(y[,1], rev(y[,5]),y[,1][1])
    plot(x, y[,6]+1, type="n", ...) 
    polygon(x.pol, y.pol, col=col.poly1, lty=0)

    x.pol <- c(x, rev(x), x[1])
    y.pol <- c(y[,2], rev(y[,4]), y[,1][1])
    polygon(x.pol, y.pol, col=col.poly2, lty=0)

    lines(x, y[,3], col="blue") # median
    lines(x, y[,6], col="red")  # max

    invisible(NULL)
}

And we can then simplify a little:

agg <- period.apply(gcdatazoo[,"Pause.s."],               # to which data
                    INDEX=endpoints(gcdatazoo, "hours", k=2), # every 2 hours
                    FUN=function(x) quantile(x,               # what fun.
                                             probs=c(5,20,50,80,95,100)/100)) 

#v99 = q99(gcdata$Pause.s.)        # what is q99 ?
v99 <- mean(agg[,5])                  # mean of 95-th percentile?
plotAreaCorridor(index(agg),          # use time index as x axis
                 coredata(agg),       # and matrix part of zoo object as data
                 ylim=c(0,max(agg[,5])*1.5),
                 ylab="Quantiles of GC events",
                 main="NewPar Collection Activity")
abline(h=median(gcdatazoo[,"Pause.s."]), col="lightblue")
abline(h=v99, col="grey")
labeltxt <- paste("99%=",round(v99,digits=3),"s n=", nrow(gcdatazoo),sep="")
text(x=index(agg)[20], y=1.5*v99, labeltxt, col="grey", pos=3)  # or legend()

which gives

enter image description here

The axis is now automatic and shows weekdays only as the span is less than week; this can be override as needed.

OTHER TIPS

You have to try polygon. This code can be useful:

y98 = aggregate(gcdata$Pause.s. ~ hours, data=gcdata, FUN=q98)
y02 = aggregate(gcdata$Pause.s. ~ hours, data=gcdata, FUN=q02)
ymax = aggregate(gcdata$Pause.s. ~ hours, data=gcdata, FUN=max)
ymin = aggregate(gcdata$Pause.s. ~ hours, data=gcdata, FUN=min)
ymean = aggregate(gcdata$Pause.s. ~ hours, data=gcdata, FUN=mean)

x = ymean[,1]
y1 = cbind(y02[,2], ymean[,2], y98[,2])
y2 = cbind(ymin[,2], ymean[,2], ymax[,2])

plotAreaCI(x,y2, ylim=c(0,2), xlab="time", ylab="variable")
plotAreaCI(x,y1, ylim=c(0,2), poly.col="blue", add=TRUE)

pic1

or

plotAreaCI(x,y2, ylim=c(0,2), xlab="time", ylab="variable", nice.x = TRUE)
plotAreaCI(x,y1, ylim=c(0,2), mean.lwd=2, poly.col="blue", add=TRUE)

pic2

where the function plotAreaCI is defined by:

plotAreaCI = function(x, y, add=FALSE, nice.x = FALSE,
                          xlim=NULL, ylim=NULL,
                          mean.col="black", mean.lwd=1.5,
                          poly.col="gray", poly.lty=3,
                          xlab=NULL, ylab=NULL, main="",
                          ...) {
      isFactorX = isClass("factor", x)
      if(isFactorX) {
        x.label = x
        x = as.numeric(x)
      }
      if(is.null(xlim)) xlim=range(x, na.rm=TRUE)
      if(is.null(ylim)) ylim=range(y, na.rm=TRUE)
      x.pol = c(x, rev(x), x[1])
      y.pol = c(y[,1], rev(y[,3]), y[,1][3])
      if(!add) {
        plot.new()
        plot.window(xlim=xlim, ylim=ylim, ...)
        if(!nice.x & isFactorX) {
          axis(1, at=x, labels=x.label)
        } else {
          xticks = axTicks(1)
          if(isFactorX) {
            xticks = xticks[xticks>=1]
            axis(1, at=xticks, labels=x.label[xticks])
          } else {
            axis(1)
          }
        }
            axis(2, las=1)
        box()
        title(xlab=xlab, ylab=ylab, main=main)
      }
      polygon(x.pol, y.pol, col=poly.col, lty=poly.lty)
      lines(x, y[,2], col=mean.col, lwd=mean.lwd)
      return(invisible())
    }

This is code I use to plot temporal variation in laboratory analytes (systolic blood pressure in this instance):

 SBP.qtr.mat <- aggregate(set1HLI$SBP, 
                          list(  year(set1HLI$Drawdt)+0.25* quarter(set1HLI$Drawdt)), 
                           quantile, prob=c(0.1,0.25,0.5,0.75, 0.9,0.95, 0.975), na.rm=TRUE)
 matplot(SBP.qtr.mat[,1], SBP.qtr.mat$x, type="pl")

Shouldn't be too hard to adapt that to your problem.... or you could post a reproducible example to work with. This give the 10th, 25th, 50th, 75th, 90th, 95th and 97.5th percentiles in a single data.frame and matplot handles the plotting of such an object.

The grey area?, ... The usual approach is to plot a polygon going out on the lower bounds, "turning" at the right extreme and coming back on the high side, and connecting back at the left-hand side. The polygon arguments are set up as x, y. There is a col argument you would set to "grey".

To make '2 hour' sequences to which you could merge your dataframe or use withcut.POSIXt" as a breaks argument , there is the option of using multiples of time units withseq.POSIXt`:

> seq(ISOdate(1910,1,1), ISOdate(1999,1,1), "10 years")
[1] "1910-01-01 12:00:00 GMT" "1920-01-01 12:00:00 GMT" "1930-01-01 12:00:00 GMT" "1940-01-01 12:00:00 GMT"
[5] "1950-01-01 12:00:00 GMT" "1960-01-01 12:00:00 GMT" "1970-01-01 12:00:00 GMT" "1980-01-01 12:00:00 GMT"
[9] "1990-01-01 12:00:00 GMT"

I didn't see it documented but you can use multiples of interval with cut.POSIXt:

> str( cut( seq(ISOdate(1910,1,1), ISOdate(1999,1,1), "years"), "10 years") )
 Factor w/ 9 levels "1910-01-01","1920-01-01",..: 1 1 1 1 1 1 1 1 1 1 ...
> str( cut( seq(ISOdate(1910,1,1), ISOdate(1999,1,1), "years"), "5 years") )
 Factor w/ 18 levels "1910-01-01","1915-01-01",..: 1 1 1 1 1 2 2 2 2 2 ...

I am not currently arrived at the following script (still need to look at the more advanced answer from DWin). It now somewhat looks like I was looking for, but the code is still pretty ugly (For example I dont know how to align the label and how to get proper xlab labels):

plotAreaCorridor = function(x, y, col.poly1="lightgray", col.poly2="gray",...) {
   x.pol = c(x, rev(x), x[1])
   y.pol = c(y[,1], rev(y[,5]),y[,1][1])
   plot(x, y[,6]+1, type="n", ...) # ugly since type="n" does not work for factor
   polygon(x.pol, y.pol, col=col.poly1, lty=0)

   x.pol = c(x, rev(x), x[1])
   y.pol = c(y[,2], rev(y[,4]), y[,1][1])
   polygon(x.pol, y.pol, col=col.poly2, lty=0)

   lines(x, y[,3], col="blue") # median
   lines(x, y[,6], col="red")  # max

   return(invisible())
}
pause = gcdata$Pause.s.
hours = droplevels(cut(gcdata$date, breaks="hours")) # can I have 2 hours?
agg = aggregate(pause ~ hours, FUN=quantile, probs=c(5,20,50,80,95,100)/100)
x = agg$hours
ys = agg$pause
q99 <- function(x, ...) {  x <- quantile(x,probs=c(0.99)) }  
v99 = q99(gcdata$Pause.s.)
vmed = median(gcdata$Pause.s.)
plotAreaCorridor(x, ys,ylim=c(0,v99*1.5))
abline(h=vmed, col="lightblue")
abline(h=v99, col="grey")
label=paste("99%=",round(v99,digits=3),"s n=", length(gcdata$date),sep="")
text(x=30, y=v99, label, col="grey", pos=3)
title("NewPar Collection Activity")

New Chart Version

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top