# Six parameters are needed to start the program -
# these could be put in another file and processed
# before running this file, bootstrap.r, so one could
# control multiple processors simultaneously.
# beginhere can be used to indicate where to start a random seed
# so a job can be split over several files.
beginhere = 0
nboots = 10  # number of data realizations drawn from the
             # population specified in terms of G, n, and o1mean
             # as described below
             # should probably use 1000 or so
nboot1 = 50  # number of first-level bootstraps for each
             # realization of the data
             # should probably use 1000 or so
nboot2 = 50  # number of second-level bootstraps for each first-level one
             # should probably use 250 or so
# Small numbers are used here to generate examples in a relatively
# short time.

# Put in G (number of variables) and n (number in each group);
# o1mean = pattern of true differences
G = 10
n = 14
o1mean = (1:G)/G

# tstats is a function for relatively quickly calculating many t-statistics
# assuming a common standard deviation. It also calculates the
# mean difference and pooled standard deviation.
# Returns a matrix of dimension G x 3.
tstats <- function(data1,data2){
  nprots <- dim(data1)[2]
  samsize1 <- dim(data1)[1]
  samsize2 <- dim(data2)[1]
  sumsq1 <- rep(0,nprots)
  sumsq2 <- rep(0,nprots)
  for (i in 1:nprots){
    sumsq1[i] <- data1[,i] %*% data1[,i]
    sumsq2[i] <- data2[,i] %*% data2[,i]
  }
  xbar1 <- rep(1,samsize1) %*% data1[,1:nprots]/samsize1
  xbar2 <- rep(1,samsize2) %*% data2[,1:nprots]/samsize2
  var1 <- (sumsq1 - samsize1*xbar1*xbar1)/(samsize1-1)
  var2 <- (sumsq2 - samsize2*xbar2*xbar2)/(samsize2-1)
  tstats <- (xbar1 - xbar2)/
    ((sqrt( ( (samsize1-1)*var1+(samsize2-1)*var2 )/(samsize1+samsize2-2)) )*sqrt(1/samsize1+1/samsize2))
  esses = (sqrt( ( (samsize1-1)*var1+(samsize2-1)*var2 )/(samsize1+samsize2-2)) )
  xdiffs = xbar1 - xbar2
  cbind(t(tstats),t(esses),t(xdiffs))
}

# s = vector of true standard deviations
# (should probably have been named sigmas instead)
s = rep(1,G)

nsimul = 1000              # number of simulations used to see the 'true' pattern
whichones = rep(0,nsimul)  # will hold the index of the chosen variable
xdiffs = whichones         # will hold the mean difference
unknowns = xdiffs          # will hold the true value of mu_{r_G}
sdiffs = xdiffs            # will hold s_{r_G}
resdiffs = xdiffs          # will hold (d_{r_G}-mu_{r_G})/s_{r_G}
                           # almost like tau_{r_G} except missing sqrt(n/2)
offset = rep(0,G)          # a way to introduce other patterns in the mean differences
                           # not really used here

# These could be used for dependent data:
# rho = .5
# cvmat = matrix(rep(rho,G*G),ncol=G)
# diag(cvmat) = 1

for(i in 1:nsimul){
  set.seed(i)
  o1 = matrix(rnorm(G*n,rep(o1mean+offset,G),s),ncol=G,byrow=T)
  o2 = matrix(rnorm(G*n,rep(offset,G),s),ncol=G,byrow=T)
  # Could use the lines below to experiment with dependent data:
  # o1 = mvrnorm(n,mu=o1mean,Sigma=cvmat)
  # o2 = mvrnorm(n,mu=rep(0,length(o1mean)),Sigma=cvmat)
  meandiffs = tstats(o1,o2)
  whichone = which((meandiffs[,1]) == max((meandiffs[,1])))[1]
  resdiffs[i] = (meandiffs[whichone,3]-o1mean[whichone])/meandiffs[whichone,2]
  whichones[i] = whichone
  xdiffs[i] = meandiffs[whichone,3]
  unknowns[i] = o1mean[whichone]
  sdiffs[i] = meandiffs[whichone,2]
  if(i %% 50 == 0) print(paste(date()," i = ",i))
}

# allress = observed values of tau_{r_G} -- taken as the true values
allress = resdiffs*sqrt(n)/sqrt(2)

# gives the name of the output file
filename = paste("testdb-max-",G,"-",n,"-",max((round(o1mean,3))),"-",nsimul,"-",beginhere,"-",nboots,".out",sep="")

br = rep(0,nboot1)
whichones0 = rep(0,nboots)
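# Optional sanity check (an illustration only; x1 and x2 below are made-up
# inputs, not part of the simulation): the first column returned by tstats()
# should agree with t.test(..., var.equal = TRUE) applied variable by variable.
# x1 = matrix(rnorm(n*G), ncol = G)
# x2 = matrix(rnorm(n*G), ncol = G)
# all.equal(as.numeric(tstats(x1, x2)[,1]),
#           sapply(1:G, function(g) unname(t.test(x1[,g], x2[,g], var.equal = TRUE)$statistic)))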
for(i in 1:nboots){
  set.seed(i+beginhere)
  # initialize the same variables as above except they correspond
  # to 1st and 2nd level bootstrap versions
  resdiffs1 = rep(0,nboot1); resdiffs2 = rep(0,nboot2)
  xdiffs1 = resdiffs1; xdiffs2 = resdiffs2
  unknowns1 = xdiffs1; sdiffs1 = xdiffs1
  sdiffs2 = xdiffs2; unknowns2 = xdiffs2
  umr = resdiffs1
  o1 = matrix(rnorm(G*n,rep(o1mean+offset,G),s),ncol=G,byrow=T)
  o2 = matrix(rnorm(G*n,rep(offset,G),s),ncol=G,byrow=T)
  meandiffs = tstats(o1,o2)
  # these variables with 0 suffixes correspond to
  # realizations of the initial data
  whichone0 = which((meandiffs[,1]) == max((meandiffs[,1])))[1]
  whichones0[i] = whichone0
  resdiffs0 = (meandiffs[whichone0,3]-o1mean[whichone0])/meandiffs[whichone0,2]
  xdiffs0 = meandiffs[whichone0,3]
  for(j in 1:nboot1){
    # create first-level bootstrap versions of the data
    bo1 = o1[sample(1:n,replace=T),]
    bo2 = o2[sample(1:n,replace=T),]
    meandiffs1 = tstats(bo1,bo2)
    whichone1 = which((meandiffs1[,1]) == max((meandiffs1[,1])))[1]
    resdiffs1[j] = (meandiffs1[whichone1,3]-meandiffs[whichone1,3])/meandiffs1[whichone1,2]
    xdiffs1[j] = meandiffs1[whichone1,3]
    unknowns1[j] = meandiffs[whichone1,3]
    for(k in 1:nboot2){
      # create second-level bootstrap versions
      bbo1 = bo1[sample(1:n,replace=T),]
      bbo2 = bo2[sample(1:n,replace=T),]
      meandiffs2 = tstats(bbo1,bbo2)
      whichone2 = which((meandiffs2[,1]) == max((meandiffs2[,1])))[1]
      resdiffs2[k] = (meandiffs2[whichone2,3]-meandiffs1[whichone2,3])/meandiffs2[whichone2,2]
      xdiffs2[k] = meandiffs2[whichone2,3]
      sdiffs2[k] = meandiffs2[whichone2,2]
      unknowns2[k] = meandiffs1[whichone2,3]
    }
    sdiffs1[j] = meandiffs1[whichone1,2]
    # br is used for the bias correction
    br[j] = resdiffs1[j] - mean(resdiffs2)
    # umr is used for the nested percentile method
    umr[j] = mean(resdiffs2 <= resdiffs1[j])
  }
  print(paste(date(), " i = ", i))
  #print(paste(summary((resdiffs/sqrt(var(resdiffs1))))))

  # newdist1 is the simple bootstrap approximation, tau^{*}_{r_G}
  newdist1 = (resdiffs1)*sqrt(n)/sqrt(2)
  # newdist2 is the bias-corrected version;
  # with very small sample sizes one can run into trouble with the
  # granularity of the bootstrap, so na.rm is used -- probably okay
  # to remove this if the sample sizes are not too small
  newdist2 = (resdiffs1+mean(br,na.rm=T))*sqrt(n)/sqrt(2)
  print(paste(summary(newdist1)))
  print(paste(summary(newdist2)))
  print(paste(summary(quantile(newdist1,p=umr))))

  # quants0 gives quantiles for the 'truth' in terms of F
  quants0 = quantile(allress,p=c(.025,.05,.1,.25,.5,.75,.90,.95,.975))
  # quants1 gives quantiles for the simple bootstrap in terms of F^*
  quants1 = quantile(newdist1,p=c(.025,.05,.1,.25,.5,.75,.90,.95,.975))
  # quants2 gives quantiles for the bias-corrected version
  quants2 = quantile(newdist2,p=c(.025,.05,.1,.25,.5,.75,.90,.95,.975))
  # quants3 gives quantiles for the nested percentile method
  quants3 = quantile(newdist1,p=quantile(umr,p=c(.025,.05,.1,.25,.5,.75,.90,.95,.975)))

  # the following gives confidence intervals for mu_{r_G}
  # note: they are in backwards order, i.e.
  # 97.5, 95, 90, 75, 50, 25, 10, 5, 2.5 instead of
  # 2.5, 5, 10, 25, 50, 75, 90, 95, 97.5
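  # The intervals below invert the pivot tau_{r_G} = sqrt(n/2)*(d_{r_G}-mu_{r_G})/s_{r_G}:
  # solving for mu_{r_G} gives mu_{r_G} = d_{r_G} - tau_{r_G}*s_{r_G}*sqrt(2/n), with
  # d_{r_G} = meandiffs[whichone0,3] and s_{r_G} = meandiffs[whichone0,2], so each
  # quantile of tau_{r_G} is plugged into this formula; an upper quantile of tau_{r_G}
  # gives a lower endpoint for mu_{r_G} and vice versa, hence the reversed order.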
  cis0 = meandiffs[whichone0,3]-quants0*meandiffs[whichone0,2]*sqrt(2)/sqrt(n)
  cis1 = meandiffs[whichone0,3]-quants1*meandiffs[whichone0,2]*sqrt(2)/sqrt(n)
  cis2 = meandiffs[whichone0,3]-quants2*meandiffs[whichone0,2]*sqrt(2)/sqrt(n)
  cis3 = meandiffs[whichone0,3]-quants3*meandiffs[whichone0,2]*sqrt(2)/sqrt(n)

  # keeps track of the random seed used for each iteration in case
  # further investigation is needed
  index = i + beginhere

  # the first few elements written out are
  # index, n, G, max value of mu, d_{r_G}, s_{r_G}, mu_{r_G};
  # then the confidence interval information is written
  write.table(list(index,n,G,max(o1mean),
                   round(meandiffs[whichone0,3],2),round(meandiffs[whichone0,2],2),round(o1mean[whichone0],2),
                   round(matrix(cis0,nrow=1),2),
                   round(matrix(cis1,nrow=1),2),
                   round(matrix(cis2,nrow=1),2),
                   round(matrix(cis3,nrow=1),2)),
              file=filename,col.names=F,row.names=F,sep=",",append=T)
}

# When the loop is finished, code like the following can be used to check the
# coverage of the CIs; filename is the name of the output file given above.
res = read.table(filename,sep=",")
# For example, the empirical coverage of the nominal 50% interval from cis0:
# column 7 holds mu_{r_G}, and columns 13 and 11 hold its lower and upper
# endpoints (coming from the .75 and .25 quantiles, respectively)
mean(res[,13] <= res[,7] & res[,7] <= res[,11])
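# A sketch (assuming the column layout produced by the write.table call above)
# for checking two-sided 95% coverage of all four interval types at once:
# column 7 = mu_{r_G}; cis0, cis1, cis2, and cis3 occupy columns 8-16, 17-25,
# 26-34, and 35-43, each running from the upper 97.5% endpoint down to the
# lower 2.5% endpoint.
cover95 = sapply(c(8, 17, 26, 35), function(first)
  mean(res[, first + 8] <= res[, 7] & res[, 7] <= res[, first]))
names(cover95) = c("true F", "simple bootstrap", "bias corrected", "nested percentile")
print(cover95)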