# Six parameters are needed to start the program -
# these could be put in another file and processed
# before running this file, bootstrap.r, so one could
# control multiple processors simultaneously.
# beginhere can be used to indicate where to start a random seed
# so a job can be split over several files.
beginhere = 0
nboots = 10  # number of data realizations drawn from the
             # population specified in terms of G, n, and o1mean
             # as described below
             # should probably use 1000 or so
nboot1 = 50  # number of first-level bootstraps for each
             # realization of the data
             # should probably use 1000 or so
nboot2 = 50  # number of second-level bootstraps for each first-level one
             # should probably use 250 or so
# Small numbers are used here to generate examples in a relatively
# short time.

# Put in G (number of variables) and n (number in each group);
# o1mean = pattern of true differences
G = 10
n = 14
o1mean = (1:G)/G

# tstats is a function for relatively quickly calculating many t-statistics
# assuming a common standard deviation. It also calculates the
# mean difference and pooled standard deviation.
# Returns a matrix of dimension G x 3.
tstats <- function(data1,data2){
  nprots <- dim(data1)[2]
  samsize1 <- dim(data1)[1]
  samsize2 <- dim(data2)[1]
  sumsq1 <- rep(0,nprots)
  sumsq2 <- rep(0,nprots)
  for (i in 1:nprots){
    sumsq1[i] <- data1[,i] %*% data1[,i]
    sumsq2[i] <- data2[,i] %*% data2[,i]
  }
  xbar1 <- rep(1,samsize1) %*% data1[,1:nprots]/samsize1
  xbar2 <- rep(1,samsize2) %*% data2[,1:nprots]/samsize2
  var1 <- (sumsq1 - samsize1*xbar1*xbar1)/(samsize1-1)
  var2 <- (sumsq2 - samsize2*xbar2*xbar2)/(samsize2-1)
  tstats <- (xbar1 - xbar2)/
    ((sqrt( ( (samsize1-1)*var1+(samsize2-1)*var2 )/(samsize1+samsize2-2)) )*sqrt(1/samsize1+1/samsize2))
  esses = (sqrt( ( (samsize1-1)*var1+(samsize2-1)*var2 )/(samsize1+samsize2-2)) )
  xdiffs = xbar1 - xbar2
  cbind(t(tstats),t(esses),t(xdiffs))
}

# s = vector of true standard deviations
# (should probably have been named sigmas instead)
s = rep(1,G)

nsimul = 1000              # number of simulations used to see the 'true' pattern
whichones = rep(0,nsimul)  # will hold the index of the chosen variable
xdiffs = whichones         # will hold the mean difference
unknowns = xdiffs          # will hold the true value of mu_{r_G}
sdiffs = xdiffs            # will hold s_{r_G}
resdiffs = xdiffs          # will hold (d_{r_G}-mu_{r_G})/s_{r_G}
                           # almost like tau_{r_G} except missing sqrt(n/2)
offset = rep(0,G)          # a way to introduce other patterns in the mean differences
                           # not really used here

# These could be used for dependent data:
# rho = .5
# cvmat = matrix(rep(rho,G*G),ncol=G)
# diag(cvmat) = 1

for(i in 1:nsimul){
  set.seed(i)
  o1 = matrix(rnorm(G*n,rep(o1mean+offset,G),s),ncol=G,byrow=T)
  o2 = matrix(rnorm(G*n,rep(offset,G),s),ncol=G,byrow=T)
  # Could use the lines below to experiment with dependent data:
  # o1 = mvrnorm(n,mu=o1mean,Sigma=cvmat)
  # o2 = mvrnorm(n,mu=rep(0,length(o1mean)),Sigma=cvmat)
  meandiffs = tstats(o1,o2)
  whichone = which((meandiffs[,1]) == max((meandiffs[,1])))[1]
  resdiffs[i] = (meandiffs[whichone,3]-o1mean[whichone])/meandiffs[whichone,2]
  whichones[i] = whichone
  xdiffs[i] = meandiffs[whichone,3]
  unknowns[i] = o1mean[whichone]
  sdiffs[i] = meandiffs[whichone,2]
  if(i %% 50 == 0) print(paste(date()," i = ",i))
}

# allress = observed values of tau_{r_G} -- taken as the true values
allress = resdiffs*sqrt(n)/sqrt(2)

# gives the name of the output file
filename = paste("testdb-max-",G,"-",n,"-",max((round(o1mean,3))),"-",nsimul,"-",beginhere,"-",nboots,".out",sep="")

br = rep(0,nboot1)
whichones0 = rep(0,nboots)
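# Optional sanity check (an illustration only; x1 and x2 below are made-up
# inputs, not part of the simulation): the first column returned by tstats()
# should agree with t.test(..., var.equal = TRUE) applied variable by variable.
# x1 = matrix(rnorm(n*G), ncol = G)
# x2 = matrix(rnorm(n*G), ncol = G)
# all.equal(as.numeric(tstats(x1, x2)[,1]),
#           sapply(1:G, function(g) unname(t.test(x1[,g], x2[,g], var.equal = TRUE)$statistic)))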
for(i in 1:nboots){
  set.seed(i+beginhere)
  # initialize the same variables as above except they correspond
  # to 1st and 2nd level bootstrap versions
  resdiffs1 = rep(0,nboot1); resdiffs2 = rep(0,nboot2)
  xdiffs1 = resdiffs1; xdiffs2 = resdiffs2
  unknowns1 = xdiffs1; sdiffs1 = xdiffs1
  sdiffs2 = xdiffs2; unknowns2 = xdiffs2
  umr = resdiffs1
  o1 = matrix(rnorm(G*n,rep(o1mean+offset,G),s),ncol=G,byrow=T)
  o2 = matrix(rnorm(G*n,rep(offset,G),s),ncol=G,byrow=T)
  meandiffs = tstats(o1,o2)
  # these variables with 0 suffixes correspond to
  # realizations of the initial data
  whichone0 = which((meandiffs[,1]) == max((meandiffs[,1])))[1]
  whichones0[i] = whichone0
  resdiffs0 = (meandiffs[whichone0,3]-o1mean[whichone0])/meandiffs[whichone0,2]
  xdiffs0 = meandiffs[whichone0,3]
  for(j in 1:nboot1){
    # create first-level bootstrap versions of the data
    bo1 = o1[sample(1:n,replace=T),]
    bo2 = o2[sample(1:n,replace=T),]
    meandiffs1 = tstats(bo1,bo2)
    whichone1 = which((meandiffs1[,1]) == max((meandiffs1[,1])))[1]
    resdiffs1[j] = (meandiffs1[whichone1,3]-meandiffs[whichone1,3])/meandiffs1[whichone1,2]
    xdiffs1[j] = meandiffs1[whichone1,3]
    unknowns1[j] = meandiffs[whichone1,3]
    for(k in 1:nboot2){
      # create second-level bootstrap versions
      bbo1 = bo1[sample(1:n,replace=T),]
      bbo2 = bo2[sample(1:n,replace=T),]
      meandiffs2 = tstats(bbo1,bbo2)
      whichone2 = which((meandiffs2[,1]) == max((meandiffs2[,1])))[1]
      resdiffs2[k] = (meandiffs2[whichone2,3]-meandiffs1[whichone2,3])/meandiffs2[whichone2,2]
      xdiffs2[k] = meandiffs2[whichone2,3]
      sdiffs2[k] = meandiffs2[whichone2,2]
      unknowns2[k] = meandiffs1[whichone2,3]
    }
    sdiffs1[j] = meandiffs1[whichone1,2]
    # br is used for the bias correction
    br[j] = resdiffs1[j] - mean(resdiffs2)
    # umr is used for the nested percentile method
    umr[j] = mean(resdiffs2 <= resdiffs1[j])
  }
  print(paste(date(), " i = ", i))
  #print(paste(summary((resdiffs/sqrt(var(resdiffs1))))))

  # newdist1 is the simple bootstrap approximation, tau^{*}_{r_G}
  newdist1 = (resdiffs1)*sqrt(n)/sqrt(2)
  # newdist2 is the bias-corrected version;
  # with very small sample sizes one can run into trouble with the
  # granularity of the bootstrap, so na.rm is used -- probably okay
  # to remove this if the sample sizes are not too small
  newdist2 = (resdiffs1+mean(br,na.rm=T))*sqrt(n)/sqrt(2)
  print(paste(summary(newdist1)))
  print(paste(summary(newdist2)))
  print(paste(summary(quantile(newdist1,p=umr))))

  # quants0 gives quantiles for the 'truth' in terms of F
  quants0 = quantile(allress,p=c(.025,.05,.1,.25,.5,.75,.90,.95,.975))
  # quants1 gives quantiles for the simple bootstrap in terms of F^*
  quants1 = quantile(newdist1,p=c(.025,.05,.1,.25,.5,.75,.90,.95,.975))
  # quants2 gives quantiles for the bias-corrected version
  quants2 = quantile(newdist2,p=c(.025,.05,.1,.25,.5,.75,.90,.95,.975))
  # quants3 gives quantiles for the nested percentile method
  quants3 = quantile(newdist1,p=quantile(umr,p=c(.025,.05,.1,.25,.5,.75,.90,.95,.975)))

  # the following gives confidence intervals for mu_{r_G}
  # note: they are in backwards order, i.e.
  # 97.5, 95, 90, 75, 50, 25, 10, 5, 2.5 instead of
  # 2.5, 5, 10, 25, 50, 75, 90, 95, 97.5
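  # The intervals below invert the pivot tau_{r_G} = sqrt(n/2)*(d_{r_G}-mu_{r_G})/s_{r_G}:
  # solving for mu_{r_G} gives mu_{r_G} = d_{r_G} - tau_{r_G}*s_{r_G}*sqrt(2/n), with
  # d_{r_G} = meandiffs[whichone0,3] and s_{r_G} = meandiffs[whichone0,2], so each
  # quantile of tau_{r_G} is plugged into this formula; an upper quantile of tau_{r_G}
  # gives a lower endpoint for mu_{r_G} and vice versa, hence the reversed order.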
  cis0 = meandiffs[whichone0,3]-quants0*meandiffs[whichone0,2]*sqrt(2)/sqrt(n)
  cis1 = meandiffs[whichone0,3]-quants1*meandiffs[whichone0,2]*sqrt(2)/sqrt(n)
  cis2 = meandiffs[whichone0,3]-quants2*meandiffs[whichone0,2]*sqrt(2)/sqrt(n)
  cis3 = meandiffs[whichone0,3]-quants3*meandiffs[whichone0,2]*sqrt(2)/sqrt(n)

  # keeps track of the random seed used for each iteration in case
  # further investigation is needed
  index = i + beginhere

  # the first few elements written out are
  # index, n, G, max value of mu, d_{r_G}, s_{r_G}, mu_{r_G};
  # then the confidence interval information is written
  write.table(list(index,n,G,max(o1mean),
                   round(meandiffs[whichone0,3],2),round(meandiffs[whichone0,2],2),round(o1mean[whichone0],2),
                   round(matrix(cis0,nrow=1),2),
                   round(matrix(cis1,nrow=1),2),
                   round(matrix(cis2,nrow=1),2),
                   round(matrix(cis3,nrow=1),2)),
              file=filename,col.names=F,row.names=F,sep=",",append=T)
}

# When the loop is finished, code like the following can be used to check the
# coverage of the CIs; filename is the name of the output file given above.
res = read.table(filename,sep=",")
# For example, the empirical coverage of the nominal 50% interval from cis0:
# column 7 holds mu_{r_G}, and columns 13 and 11 hold its lower and upper
# endpoints (coming from the .75 and .25 quantiles, respectively)
mean(res[,13] <= res[,7] & res[,7] <= res[,11])
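# A sketch (assuming the column layout produced by the write.table call above)
# for checking two-sided 95% coverage of all four interval types at once:
# column 7 = mu_{r_G}; cis0, cis1, cis2, and cis3 occupy columns 8-16, 17-25,
# 26-34, and 35-43, each running from the upper 97.5% endpoint down to the
# lower 2.5% endpoint.
cover95 = sapply(c(8, 17, 26, 35), function(first)
  mean(res[, first + 8] <= res[, 7] & res[, 7] <= res[, first]))
names(cover95) = c("true F", "simple bootstrap", "bias corrected", "nested percentile")
print(cover95)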