I'm not sure I understand 100% what the objective is, but if you're looking to fit the measured points to the reference curve then using dtw
seems sensible. Fitting the 10 measured points to the 370-odd curve points does give a slightly strange result (which is just the optimization with the symmetric step.pattern). I think that's largely a function of the small number of points.
One option which may help is to use ggplot()
(or another function) to smooth the measured curve and provide some additional points for matching. But obviously it can only do so much depending on the limitation of the measured points. With so few points you might lose information in the process of fitting your data.
If you could trim curve
to be exactly contemporaneous with the first and last point of the meas
observations, that would also help since you're matching with open.begin
and open.end
FALSE
, but I'm not sure whether the exact dates are available.
This shows smoothing meas
out to 80 points, and mapping the 10-point raw data and 80-point smooth to the reference curve
require(ggplot2)
require(scales)
require(gridExtra)
require(dtw)
require(plyr)
# use ggplot default to smooth the 10 point curve
meas.plot.smooth<-ggplot(meas, aes(x = distance, y = value)) + geom_line() + labs(title = "ggplot smoothed (blue curve)")+geom_smooth()
# use ggplot_build() to get the smoothed points
meas.curve.smooth<-ggplot_build(meas.plot.smooth)$data[[2]]
orig.align<-dtw(meas$value,curve$value,keep=T,step.pattern=symmetric1)
orig.freqs<-count(orig.align$index1)
# reference the matching points (which are effectively dates)
orig.freqs$cumsum<-cumsum(orig.freqs$freq)
g.10<-ggplot() + geom_line(data = curve, aes(x = date, y = value)) +
geom_line(aes(x = curve[orig.freqs$cumsum,"date"], y = meas$value),color="red") +
geom_text(aes(x = curve[orig.freqs$cumsum,"date"], y = meas$value, label=orig.freqs$x),color="red",size=5) +
scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) +
labs(title = "Native 10 pt curve - dtw mapped")
smooth.align<-dtw(meas.curve.smooth$y,curve$value,keep=T,step.pattern=symmetric1)
smooth.freqs<-count(smooth.align$index1)
smooth.freqs$cumsum<-cumsum(smooth.freqs$freq)
g.80<-ggplot() + geom_line(data = curve, aes(x = date, y = value)) +
geom_line(aes(x = curve[smooth.freqs$cumsum,"date"], y = meas.curve.smooth$y),color="red") +
scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) +
geom_text(aes(x = curve[smooth.freqs$cumsum,"date"], y = meas.curve.smooth$y, label=smooth.freqs$x),color="red",size=3.5,position="jitter") +
labs(title = "80 point loess curve - dtw mapped")
grid.arrange(meas.plot.smooth,g.10,g.80,ncol=1)
EDIT
Obviously part of the problem is confidence intervals. I've included an example here to build a random curve within the standard error levels around the smoothed curve. As you can see, it's quite different to using the projected curve itself. I think the issue is that when you're trying to map 10 measures against a 370-point reference curve, unless they track extremely tightly, it's going to be difficult to get precise predictions.
rand.align<-dtw(meas.curve.smooth$ymin+(meas.curve.smooth$ymax-meas.curve.smooth$ymin)*runif(length(meas.curve.smooth$ymin)),curve$value,keep=T,step.pattern=symmetric1)
rand.freqs<-count(rand.align$index1)
rand.freqs$cumsum<-cumsum(rand.freqs$freq)
g.rand<-ggplot() + geom_line(data = curve, aes(x = date, y = value)) +
geom_line(aes(x = curve[rand.freqs$cumsum,"date"], y = meas.curve.smooth$y),color="red") +
scale_x_date(name = "Month", breaks = date_breaks("months"), labels = date_format("%b")) +
geom_text(aes(x = curve[rand.freqs$cumsum,"date"], y = meas.curve.smooth$y, label=rand.freqs$x),color="red",size=3.5,position="jitter") +
labs(title = "Random curve within standard CI - dtw mapped")
grid.arrange(meas.plot.smooth,g.10,g.80,g.rand,ncol=1)
EDIT updated to include simulation.
OK - this is updated to run 1000 simulations. It creates curves for mapping which are randomised from within the 95% CI. I changed n to 10 (from 80) in the geom_smooth()
function to try and preserve as much info as possible from the measured curve.
It models the cumulative growth (assuming linear growth between unmeasured days)
Not sure if it's completely useful, but provides a decent way of visualizing the uncertainty.
get_scenario<-function(i){
set.seed(i)
# create random curve within the CI
rand.align<-dtw(meas.curve.smooth$ymin+(meas.curve.smooth$ymax-meas.curve.smooth$ymin)*runif(length(meas.curve.smooth$ymin)),curve$value,keep=T,step.pattern=symmetric1)
rand.freqs<-count(rand.align$index1)
rand.freqs$cumsum<-cumsum(rand.freqs$freq)
growth.index<-data.frame(cumsum=curve$index,val=curve$value)
merged<-merge(growth.index,rand.freqs,by="cumsum")
return(data.frame(x=merged$cumsum,growth=cumsum(merged$val*merged$freq),scenario=i))
}
scenario.set <- ldply(lapply(1:1000,function(l)get_scenario(l)), data.frame)
g.s<-ggplot(scenario.set,aes(x,growth)) +
geom_line(aes(,group=scenario,color=scenario),alpha=0.25) +
scale_colour_gradient(low = "yellow", high = "orangered") +
xlab("Days from start") + ylab("Cumulative Growth")
g.xmax<-max(scenario.set$x) # get the final day (or set to another day)
g.xmin<-g.xmax-30 # thirty day window from end
b<-ggplot_build(g.s)
build.data<-b$data[[1]]
ylims<-build.data[build.data$x<=g.xmax & build.data$x>=g.xmin,]$y
g.subplot<-g.s+geom_point(aes(x,growth,color=scenario),alpha=0.25,size=5,position="jitter")+coord_cartesian(xlim=c(g.xmin,g.xmax),ylim=c(min(ylims),max(ylims)))
grid.arrange(meas.plot.smooth,g.s,g.subplot,ncol=1)
and here are some other ways of looking at the tail:
g.s<-ggplot(scenario.set,aes(x,growth)) +
geom_line(aes(,group=scenario,color=scenario),alpha=0.25) +
scale_colour_gradient(low = "yellow", high = "orangered") +
xlab("Days from start") + ylab("Cumulative Growth")
g.xmax<-max(scenario.set$x) # get the final day (or set to another day)
g.xmin<-g.xmax-50 # thirty day window from end
b<-ggplot_build(g.s)
build.data<-b$data[[1]]
ylims<-build.data[build.data$x<=g.xmax & build.data$x>=g.xmin,]$y
g.subplot<-g.s+geom_point(aes(x,growth,color=scenario),alpha=0.25,size=5,position="jitter")+coord_cartesian(xlim=c(g.xmin,g.xmax),ylim=c(min(ylims),max(ylims)))
grid.arrange(meas.plot.smooth,g.s,g.subplot,ncol=1)
g.box<-ggplot(build.data)+
geom_boxplot(aes(x,y,group=cut(x,max(x)/7),fill=cut(x,max(x)/7)),alpha=0.5)+ # bucket by group
theme(legend.position="none")+
coord_cartesian(xlim=c(g.xmin,g.xmax),ylim=c(min(ylims)-50,max(ylims)+50))
build.data.sum<-ddply(build.data,.(x),summarise,ymax=max(y),ymin=min(y),mean=mean(y))
g.spots<-ggplot(build.data)+
geom_point(aes(x,y,color=group),size=10,alpha=0.25,position="jitter")+
theme(legend.position="none")+scale_colour_gradient(low = "yellow", high = "orangered")+
geom_ribbon(data=build.data.sum,aes(x,ymax=ymax,ymin=ymin),alpha=0.25)+
coord_cartesian(xlim=c(g.xmax-50,g.xmax+1),ylim=c(min(ylims)-50,max(ylims)+50))+geom_smooth(aes(x,y),n=max(build.data$x))
grid.arrange(g.box,g.spots,ncol=1)