Calcular dentro y entre varianzas e intervalos de confianza en R

https://stackoverflow.com/questions/1401894

statistics
r

05-07-2019
|

Pregunta

Necesito calcular las variaciones dentro y entre ejecuciones de algunos datos como parte del desarrollo de un nuevo método de química analítica. También necesito intervalos de confianza de estos datos usando el lenguaje R

¿Supongo que necesito usar anova o algo?

Mis datos son como

> variance
   Run Rep Value
1    1   1  9.85
2    1   2  9.95
3    1   3 10.00
4    2   1  9.90
5    2   2  8.80
6    2   3  9.50
7    3   1 11.20
8    3   2 11.10
9    3   3  9.80
10   4   1  9.70
11   4   2 10.10
12   4   3 10.00

Solución 4

He estado viendo un problema similar. He encontrado referencias a los intervalos de confianza calculados por Burdick y Graybill (Burdick, R. y Graybill, F. 1992, Intervalos de confianza en los componentes de la varianza, CRC Press)

Al usar un código que he estado intentando, obtengo estos valores



> kiaraov = aov(Value~Run+Error(Run),data=kiar)

> summary(kiaraov)

Error: Run
    Df  Sum Sq Mean Sq
Run  3 2.57583 0.85861

Error: Within
          Df  Sum Sq Mean Sq F value Pr(>F)
Residuals  8 1.93833 0.24229               
> confint = 95

> a = (1-(confint/100))/2

> grandmean = as.vector(kiaraov#' avar Function
#' 
#' Calculate thewithin, between and total %CV of a dataset by ANOVA, and the
#' associated confidence intervals
#' 
#' @param dataf - The data frame to use, in long format 
#' @param afactor Character string representing the column in dataf that contains the factor
#' @param aresponse  Charactyer string representing the column in dataf that contains the response value
#' @param aconfidence What Confidence limits to use, default = 95%
#' @param digits  Significant Digits to report to, default = 3
#' @param debug Boolean, Should debug messages be displayed, default=FALSE
#' @returnType dataframe containing the Mean, Within, Between and Total %CV and LCB and UCB for each
#' @return 
#' @author Paul Hurley
#' @export
#' @examples 
#' #Using the BGBottles data from Burdick and Graybill Page 62
#' assayvar(dataf=BGBottles, afactor="Machine", aresponse="weight")
avar<-function(dataf, afactor, aresponse, aconfidence=95, digits=3, debug=FALSE){
    dataf<-subset(dataf,!is.na(with(dataf,get(aresponse))))
    nmissing<-function(x) sum(!is.na(x))
    n<-nrow(subset(dataf,is.numeric(with(dataf,get(aresponse)))))
    datadesc<-ddply(dataf, afactor, colwise(nmissing,aresponse))
    I<-nrow(datadesc)
    if(debug){print(datadesc)}
    if(min(datadesc[,2])==max(datadesc[,2])){
        balance<-TRUE
        J<-min(datadesc[,2])
        if(debug){message(paste("Dataset is balanced, J=",J,"I is ",I,sep=""))}
    } else {
        balance<-FALSE
        Jh<-I/(sum(1/datadesc[,2], na.rm = TRUE))
        J<-Jh
        m<-min(datadesc[,2])
        M<-max(datadesc[,2])
        if(debug){message(paste("Dataset is unbalanced, like me, I is ",I,sep=""))}
        if(debug){message(paste("Jh is ",Jh, ", m is ",m, ", M is ",M, sep=""))}
    }
    if(debug){message(paste("Call afactor=",afactor,", aresponse=",aresponse,sep=""))}
    formulatext<-paste(as.character(aresponse)," ~ 1 + Error(",as.character(afactor),")",sep="")
    if(debug){message(paste("formula text is ",formulatext,sep=""))}
    aovformula<-formula(formulatext)
    if(debug){message(paste("Formula is ",as.character(aovformula),sep=""))}
    assayaov<-aov(formula=aovformula,data=dataf)
    if(debug){
        print(assayaov)
        print(summary(assayaov))
    }
    a<-1-((1-(aconfidence/100))/2)
    if(debug){message(paste("confidence is ",aconfidence,", alpha is ",a,sep=""))}
    grandmean<-as.vector(assayaov<*>quot;(Intercept)"[[1]][1]) # Grand Mean (I think)
    if(debug){message(paste("n is",n,sep=""))}

    #This line commented out, seems to choke with an aov object built from an external formula
    #grandmean<-as.vector(model.tables(assayaov,type="means")[[1]] He estado viendo un problema similar. He encontrado referencias a los intervalos de confianza calculados por Burdick y Graybill (Burdick, R. y Graybill, F. 1992, Intervalos de confianza en los componentes de la varianza, CRC Press) 

 Al usar un código que he estado intentando, obtengo estos valores 



> kiaraov = aov(Value~Run+Error(Run),data=kiar)

> summary(kiaraov)

Error: Run
    Df  Sum Sq Mean Sq
Run  3 2.57583 0.85861

Error: Within
          Df  Sum Sq Mean Sq F value Pr(>F)
Residuals  8 1.93833 0.24229               
> confint = 95

> a = (1-(confint/100))/2

> grandmean = as.vector(kiaraovGrand mean`) # Grand Mean (I think)
    within<-summary(assayaov)[[2]][[1]]<*>quot;Mean Sq"  # d2e, S2^2 Mean Square Value for Within Machine = 0.1819
    dfRun<-summary(assayaov)[[1]][[1]]<*>quot;Df"  # DF for within = 3
    dfWithin<-summary(assayaov)[[2]][[1]]<*>quot;Df"  # DF for within = 8
    Run<-summary(assayaov)[[1]][[1]]<*>quot;Mean Sq" # S1^2Mean Square for Machine
    if(debug){message(paste("mean square for Run ?",Run,sep=""))}
    #Was between<-(Run-within)/((dfWithin/(dfRun+1))+1) but my comment suggests this should be just J, so I'll use J !
    between<-(Run-within)/J # d2a (S1^2-S2^2)/J
    if(debug){message(paste("S1^2 mean square machine is ",Run,", S2^2 mean square within is ",within))}
    total<-between+within
    between # Between Run Variance
    within # Within Run Variance
    total # Total Variance
    if(debug){message(paste("between is ",between,", within is ",within,", Total is ",total,sep=""))}

    betweenCV<-sqrt(between)/grandmean * 100 # Between Run CV%
    withinCV<-sqrt(within)/grandmean * 100 # Within Run CV%
    totalCV<-sqrt(total)/grandmean * 100 # Total CV%
    n1<-dfRun
    n2<-dfWithin
    if(debug){message(paste("n1 is ",n1,", n2 is ",n2,sep=""))}
    #within confidence intervals
    if(balance){
        withinLCB<-within/qf(a,n2,Inf) # Within LCB
        withinUCB<-within/qf(1-a,n2,Inf) # Within UCB
    } else {
        withinLCB<-within/qf(a,n2,Inf) # Within LCB
        withinUCB<-within/qf(1-a,n2,Inf) # Within UCB
    }
#Mean Confidence Intervals
    if(debug){message(paste(grandmean,"+/-(sqrt(",Run,"/",n,")*qt(",a,",df=",I-1,"))",sep=""))} 
    meanLCB<-grandmean+(sqrt(Run/n)*qt(1-a,df=I-1)) # wrong
    meanUCB<-grandmean-(sqrt(Run/n)*qt(1-a,df=I-1)) # wrong
    if(debug){message(paste("Grandmean is ",grandmean,", meanLCB = ",meanLCB,", meanUCB = ",meanUCB,aresponse,sep=""))}
    if(debug){print(summary(assayaov))}
#Between Confidence Intervals
    G1<-1-(1/qf(a,n1,Inf)) 
    G2<-1-(1/qf(a,n2,Inf))
    H1<-(1/qf(1-a,n1,Inf))-1  
    H2<-(1/qf(1-a,n2,Inf))-1
    G12<-((qf(a,n1,n2)-1)^2-(G1^2*qf(a,n1,n2)^2)-(H2^2))/qf(a,n1,n2) 
    H12<-((1-qf(1-a,n1,n2))^2-H1^2*qf(1-a,n1,n2)^2-G2^2)/qf(1-a,n1,n2) 
    if(debug){message(paste("G1 is ",G1,", G2 is ",G2,sep=""))
        message(paste("H1 is ",H1,", H2 is ",H2,sep=""))
        message(paste("G12 is ",G12,", H12 is ",H12,sep=""))
    }
    if(balance){
        Vu<-H1^2*Run^2+G2^2*within^2+H12*Run*within
        Vl<-G1^2*Run^2+H2^2*within^2+G12*within*Run
        betweenLCB<-(Run-within-sqrt(Vl))/J # Betwen LCB
        betweenUCB<-(Run-within+sqrt(Vu))/J # Between UCB
    } else {
        #Burdick and Graybill seem to suggest calculating anova of mean values to find n1S12u/Jh
        meandataf<-ddply(.data=dataf,.variable=afactor, .fun=function(df){mean(with(df, get(aresponse)), na.rm=TRUE)})
        meandataaov<-aov(formula(paste("V1~",afactor,sep="")), data=meandataf)
        sumsquare<-summary(meandataaov)[[1]] He estado viendo un problema similar. He encontrado referencias a los intervalos de confianza calculados por Burdick y Graybill (Burdick, R. y Graybill, F. 1992, Intervalos de confianza en los componentes de la varianza, CRC Press) 

 Al usar un código que he estado intentando, obtengo estos valores 



> kiaraov = aov(Value~Run+Error(Run),data=kiar)

> summary(kiaraov)

Error: Run
    Df  Sum Sq Mean Sq
Run  3 2.57583 0.85861

Error: Within
          Df  Sum Sq Mean Sq F value Pr(>F)
Residuals  8 1.93833 0.24229               
> confint = 95

> a = (1-(confint/100))/2

> grandmean = as.vector(kiaraovSum Sq`
        #so maybe S12u is just that bit ?
        Runu<-(sumsquare*Jh)/n1
        if(debug){message(paste("n1S12u/Jh is ",sumsquare,", so S12u is ",Runu,sep=""))}
        Vu<-H1^2*Runu^2+G2^2*within^2+H12*Runu*within
        Vl<-G1^2*Runu^2+H2^2*within^2+G12*within*Runu
        betweenLCB<-(Runu-within-sqrt(Vl))/Jh # Betwen LCB
        betweenUCB<-(Runu-within+sqrt(Vu))/Jh # Between UCB
        if(debug){message(paste("betweenLCB is ",betweenLCB,", between UCB is ",betweenUCB,sep=""))}
    }
#Total Confidence Intervals
    if(balance){
        y<-(Run+(J-1)*within)/J
        if(debug){message(paste("y is ",y,sep=""))}
        totalLCB<-y-(sqrt(G1^2*Run^2+G2^2*(J-1)^2*within^2)/J) # Total LCB
        totalUCB<-y+(sqrt(H1^2*Run^2+H2^2*(J-1)^2*within^2)/J) # Total UCB
    } else {
        y<-(Runu+(Jh-1)*within)/Jh
        if(debug){message(paste("y is ",y,sep=""))}
        totalLCB<-y-(sqrt(G1^2*Runu^2+G2^2*(Jh-1)^2*within^2)/Jh) # Total LCB
        totalUCB<-y+(sqrt(H1^2*Runu^2+H2^2*(Jh-1)^2*within^2)/Jh) # Total UCB
    }
    if(debug){message(paste("totalLCB is ",totalLCB,", total UCB is ",totalUCB,sep=""))}
#   result<-data.frame(Name=c("within", "between", "total"),CV=c(withinCV,betweenCV,totalCV),
#           LCB=c(sqrt(withinLCB)/grandmean*100,sqrt(betweenLCB)/grandmean*100,sqrt(totalLCB)/grandmean*100),
#           UCB=c(sqrt(withinUCB)/grandmean*100,sqrt(betweenUCB)/grandmean*100,sqrt(totalUCB)/grandmean*100))
    result<-data.frame(Mean=grandmean,MeanLCB=meanLCB, MeanUCB=meanUCB, Within=withinCV,WithinLCB=sqrt(withinLCB)/grandmean*100, WithinUCB=sqrt(withinUCB)/grandmean*100,
            Between=betweenCV, BetweenLCB=sqrt(betweenLCB)/grandmean*100, BetweenUCB=sqrt(betweenUCB)/grandmean*100,
            Total=totalCV, TotalLCB=sqrt(totalLCB)/grandmean*100, TotalUCB=sqrt(totalUCB)/grandmean*100)
    if(!digits=="NA"){
        result$Mean<-signif(result$Mean,digits=digits)
        result$MeanLCB<-signif(result$MeanLCB,digits=digits)
        result$MeanUCB<-signif(result$MeanUCB,digits=digits)
        result$Within<-signif(result$Within,digits=digits)
        result$WithinLCB<-signif(result$WithinLCB,digits=digits)
        result$WithinUCB<-signif(result$WithinUCB,digits=digits)
        result$Between<-signif(result$Between,digits=digits)
        result$BetweenLCB<-signif(result$BetweenLCB,digits=digits)
        result$BetweenUCB<-signif(result$BetweenUCB,digits=digits)
        result$Total<-signif(result$Total,digits=digits)
        result$TotalLCB<-signif(result$TotalLCB,digits=digits)
        result$TotalUCB<-signif(result$TotalUCB,digits=digits)
    }
    return(result)
}

assayvar<-function(adata, aresponse, afactor, anominal, aconfidence=95, digits=3, debug=FALSE){
    result<-ddply(adata,anominal,function(df){
                resul<-avar(dataf=df,afactor=afactor,aresponse=aresponse,aconfidence=aconfidence, digits=digits, debug=debug)
                resul$n<-nrow(subset(df, !is.na(with(df, get(aresponse)))))
                return(resul)
            })
    return(result)
}
quot;(Intercept)"[[1]][1]) # Grand Mean (I think)

> within = summary(kiaraov)<*>quot;Error: Within"[[1]]<*>quot;Mean Sq"  # S2^2Mean Square Value for Within Run

> dfRun = summary(kiaraov)<*>quot;Error: Run"[[1]]<*>quot;Df"

> dfWithin = summary(kiaraov)<*>quot;Error: Within"[[1]]<*>quot;Df"

> Run = summary(kiaraov)<*>quot;Error: Run"[[1]]<*>quot;Mean Sq" # S1^2Mean Square for between Run

> between = (Run-within)/((dfWithin/(dfRun+1))+1) # (S1^2-S2^2)/J

> total = between+within

> between # Between Run Variance
[1] 0.2054398

> within # Within Run Variance
[1] 0.2422917

> total # Total Variance
[1] 0.4477315

> betweenCV = sqrt(between)/grandmean * 100 # Between Run CV%

> withinCV = sqrt(within)/grandmean * 100 # Within Run CV%

> totalCV = sqrt(total)/grandmean * 100 # Total CV%

> #within confidence intervals

> withinLCB = within/qf(1-a,8,Inf) # Within LCB

> withinUCB = within/qf(a,8,Inf) # Within UCB

> #Between Confidence Intervals

> n1 = dfRun

> n2 = dfWithin

> G1 = 1-(1/qf(1-a,n1,Inf)) # According to Burdick and Graybill this should be a

> G2 = 1-(1/qf(1-a,n2,Inf))

> H1 = (1/qf(a,n1,Inf))-1  # and this should be 1-a, but my results don't agree

> H2 = (1/qf(a,n2,Inf))-1

> G12 = ((qf(1-a,n1,n2)-1)^2-(G1^2*qf(1-a,n1,n2)^2)-(H2^2))/qf(1-a,n1,n2) # again, should be a, not 1-a

> H12 = ((1-qf(a,n1,n2))^2-H1^2*qf(a,n1,n2)^2-G2^2)/qf(a,n1,n2) # again, should be 1-a, not a

> Vu = H1^2*Run^2+G2^2*within^2+H12*Run*within

> Vl = G1^2*Run^2+H2^2*within^2+G12*within*Run

> betweenLCB = (Run-within-sqrt(Vl))/J # Betwen LCB

> betweenUCB = (Run-within+sqrt(Vu))/J # Between UCB

> #Total Confidence Intervals

> y = (Run+(J-1)*within)/J

> totalLCB = y-(sqrt(G1^2*Run^2+G2^2*(J-1)^2*within^2)/J) # Total LCB

> totalUCB = y+(sqrt(H1^2*Run^2+H2^2*(J-1)^2*within^2)/J) # Total UCB

> result = data.frame(Name=c("within", "between", "total"),CV=c(withinCV,betweenCV,totalCV),LCB=c(sqrt(withinLCB)/grandmean*100,sqrt(betweenLCB)/grandmean*100,sqrt(totalLCB)/grandmean*100),UCB=c(sqrt(withinUCB)/grandmean*100,sqrt(betweenUCB)/grandmean*100,sqrt(totalUCB)/grandmean*100))

> result
     Name       CV      LCB      UCB
1  within 4.926418 3.327584  9.43789
2 between 4.536327      NaN 19.73568
3   total 6.696855 4.846030 20.42647


 Aquí, el intervalo de confianza más bajo entre CV de ejecución es menor que cero, por lo que se informa como NaN. 

 Me encantaría tener una mejor manera de hacer esto. Si tengo tiempo, podría intentar crear una función para hacer esto. 

 Paul. 

 - 

 Edición: eventualmente escribí una función, aquí está (caveat emptor) 

<*>



	
					
			
			


	
			


	
			
						 Otros consejos
			
			
	
		
	
	
			 Tiene cuatro grupos de tres observaciones: 

> run1 = c(9.85, 9.95, 10.00)
> run2 = c(9.90, 8.80, 9.50)
> run3 = c(11.20, 11.10, 9.80)
> run4 = c(9.70, 10.10, 10.00)
> runs = c(run1, run2, run3, run4)
> runs
 [1]  9.85  9.95 10.00  9.90  8.80  9.50 11.20 11.10  9.80  9.70 10.10 10.00


 Crea algunas etiquetas: 

> n = rep(3, 4)
> group = rep(1:4, n)
> group
 [1] 1 1 1 2 2 2 3 3 3 4 4 4


 Calcular estadísticas dentro de la ejecución: 

> withinRunStats = function(x) c(sum = sum(x), mean = mean(x), var = var(x), n = length(x))
> tapply(runs, group, withinRunStats)
 Tiene cuatro grupos de tres observaciones: 

> run1 = c(9.85, 9.95, 10.00)
> run2 = c(9.90, 8.80, 9.50)
> run3 = c(11.20, 11.10, 9.80)
> run4 = c(9.70, 10.10, 10.00)
> runs = c(run1, run2, run3, run4)
> runs
 [1]  9.85  9.95 10.00  9.90  8.80  9.50 11.20 11.10  9.80  9.70 10.10 10.00


 Crea algunas etiquetas: 

> n = rep(3, 4)
> group = rep(1:4, n)
> group
 [1] 1 1 1 2 2 2 3 3 3 4 4 4


 Calcular estadísticas dentro de la ejecución: 

1`
         sum         mean          var            n 
29.800000000  9.933333333  0.005833333  3.000000000 

 Tiene cuatro grupos de tres observaciones: 

> run1 = c(9.85, 9.95, 10.00)
> run2 = c(9.90, 8.80, 9.50)
> run3 = c(11.20, 11.10, 9.80)
> run4 = c(9.70, 10.10, 10.00)
> runs = c(run1, run2, run3, run4)
> runs
 [1]  9.85  9.95 10.00  9.90  8.80  9.50 11.20 11.10  9.80  9.70 10.10 10.00


 Crea algunas etiquetas: 

> n = rep(3, 4)
> group = rep(1:4, n)
> group
 [1] 1 1 1 2 2 2 3 3 3 4 4 4


 Calcular estadísticas dentro de la ejecución: 

2`
  sum  mean   var     n 
28.20  9.40  0.31  3.00 

 Tiene cuatro grupos de tres observaciones: 

> run1 = c(9.85, 9.95, 10.00)
> run2 = c(9.90, 8.80, 9.50)
> run3 = c(11.20, 11.10, 9.80)
> run4 = c(9.70, 10.10, 10.00)
> runs = c(run1, run2, run3, run4)
> runs
 [1]  9.85  9.95 10.00  9.90  8.80  9.50 11.20 11.10  9.80  9.70 10.10 10.00


 Crea algunas etiquetas: 

> n = rep(3, 4)
> group = rep(1:4, n)
> group
 [1] 1 1 1 2 2 2 3 3 3 4 4 4


 Calcular estadísticas dentro de la ejecución: 

3`
  sum  mean   var     n 
32.10 10.70  0.61  3.00 

 Tiene cuatro grupos de tres observaciones: 

> run1 = c(9.85, 9.95, 10.00)
> run2 = c(9.90, 8.80, 9.50)
> run3 = c(11.20, 11.10, 9.80)
> run4 = c(9.70, 10.10, 10.00)
> runs = c(run1, run2, run3, run4)
> runs
 [1]  9.85  9.95 10.00  9.90  8.80  9.50 11.20 11.10  9.80  9.70 10.10 10.00


 Crea algunas etiquetas: 

> n = rep(3, 4)
> group = rep(1:4, n)
> group
 [1] 1 1 1 2 2 2 3 3 3 4 4 4


 Calcular estadísticas dentro de la ejecución: 

4`
        sum        mean         var           n 
29.80000000  9.93333333  0.04333333  3.00000000 


 Puedes hacer algo de ANOVA aquí: 

> data = data.frame(y = runs, group = factor(group))
> data
       y group
1   9.85     1
2   9.95     1
3  10.00     1
4   9.90     2
5   8.80     2
6   9.50     2
7  11.20     3
8  11.10     3
9   9.80     3
10  9.70     4
11 10.10     4
12 10.00     4

> fit = lm(runs ~ group, data)
> fit

Call:
lm(formula = runs ~ group, data = data)

Coefficients:
(Intercept)       group2       group3       group4  
  9.933e+00   -5.333e-01    7.667e-01   -2.448e-15 

> anova(fit)
Analysis of Variance Table

Response: runs
          Df  Sum Sq Mean Sq F value  Pr(>F)  
group      3 2.57583 0.85861  3.5437 0.06769 .
Residuals  8 1.93833 0.24229                  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 

> degreesOfFreedom = anova(fit)[, "Df"]
> names(degreesOfFreedom) = c("treatment", "error")
> degreesOfFreedom
treatment     error 
        3         8


 Error o variación dentro del grupo: 

> anova(fit)["Residuals", "Mean Sq"]
[1] 0.2422917


 Tratamiento o varianza entre grupos: 

> anova(fit)["group", "Mean Sq"]
[1] 0.8586111


 Esto debería darle suficiente confianza para hacer intervalos de confianza. 
	


	
		
	
	
			 Si desea aplicar una función (como  var ) a través de un factor como  Run  o  Rep , puede usar  tapply : 

> with(variance, tapply(Value, Run, var))
          1           2           3           4 
0.005833333 0.310000000 0.610000000 0.043333333 
> with(variance, tapply(Value, Rep, var))
          1          2          3 
0.48562500 0.88729167 0.05583333 
	


	
		
	
	
			 Tomaré una grieta en esto cuando tenga más tiempo, pero mientras tanto, aquí está el  dput ()  para la estructura de datos de Kiar: 

structure(list(Run = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4), Rep = c(1, 
2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3), Value = c(9.85, 9.95, 10, 9.9, 
8.8, 9.5, 11.2, 11.1, 9.8, 9.7, 10.1, 10)), .Names = c("Run", 
"Rep", "Value"), row.names = c(NA, -12L), class = "data.frame")


 ... en caso de que te gustaría tomar una foto rápida. 
	

			
		
			


	
		
			Licenciado bajo: CC-BY-SA con atribución
			No afiliado a StackOverflow