#The parameters below should be set and comment symbols removed
#csize is the size of the cancer group in the training set
# nsize is size of the control group in the training set
# datadir holds the cancer.rda, control.rda, and wvec.rda files
# dlldir holds the findmleall.dll or .so file
# ntrials is the number of resampling
# K is the number of threshold classifiers
# that are used.  This could have been picked through
# cross-validation but 150 was the first choice and
# it seemed to work well -- see the cited references
# in the text for more information


#rm(list=ls())
#csize = 81; nsize=46;
#datadir = "/data/nealjeff/ciphergen/";
#dlldir = "/data/nealjeff/qstar/"; forext = "so"
#ntrials = 50
#lookuppt = 0;dottype="."
#K = 150

setwd(datadir)

lookup <-  function(point,data=wvec){
  lookup <-  which(abs(data - point) == min(abs(data - point)))
  lookup[1]
}


load(paste(datadir,"cancer.rda",sep=""))
load(paste(datadir,"control.rda",sep=""))
load(paste(datadir,"wvec.rda",sep=""))

dyn.load(paste(dlldir,"findmleall",dottype,forext,sep=""))

ll2 = function(cutpoint,groupclass,peakvals,weights){
  n11 = sum(weights[which(groupclass == 0 & peakvals <= cutpoint)])
  n21 = sum(weights[which(groupclass == 1 & peakvals <= cutpoint)])
  n12 = sum(weights[which(groupclass == 0 & peakvals > cutpoint)])
  n22 = sum(weights[which(groupclass == 1 & peakvals > cutpoint)])
ll =
  ifelse(n11 == 0, 0, n11*log(n11/(n11 + n21)))+
  ifelse(n21 == 0, 0, n21*log(n21/(n11 + n21)))+
  ifelse(n12 == 0, 0, n12*log(n12/(n12 + n22)))+
  ifelse(n22 == 0, 0, n22*log(n22/(n12 + n22)))
  output = c(ll,n11,n21,n12,n22)
  names(output) = c("logl","n11","n21","n12","n22") 
  return(output)
}

avnrmlz = function(vec,avval){
  avnrmlz = vec*avval/mean(vec)
}


ttestp = function(y,labels,label1,label2){
  ttestp = t.test(y[labels == label1],
                     y[labels == label2])$p.value
  ttestp
}
tteststat = function(y,labels,label1,label2){
  ttestp = t.test(y[labels == label1],
                     y[labels == label2])$statistic
  ttestp
}


leftpoint = lookup(lookuppt)
if(wvec[leftpoint] < lookuppt) leftpoint = leftpoint + 1

# holds the data of interest
outcomepreds = matrix(rep(0,4*ntrials),ncol=4)

for(z in 1:ntrials){
set.seed(z)
dimcan = dim(cancer)
dimcon = dim(control)
listcancer1 =  sample(dimcan[1],csize)
listcontrol1 = sample(dimcon[1],nsize)


cancer1 = cancer[listcancer1,leftpoint:dimcan[2]];
control1 = control[listcontrol1,leftpoint:dimcon[2]]

lc1 = length(listcancer1); #lc2 = length(listcancer2);
ln1 = length(listcontrol1);# ln2 = length(listcontrol2);
N1 = lc1+ln1

rorder = sample(1:N1)
backorder = (1:N1)[order(rorder)]
cancer2 = cancer[-listcancer1,leftpoint:dimcan[2]];
control2 = control[-listcontrol1,leftpoint:dimcon[2]]
lc2 = dim(cancer2)[1]; ln2 = dim(control2)[1] 


avval = mean(rbind(cancer1,control1))

# bset1 is training set
bset1 = t(apply(rbind(cancer1,control1),1,avnrmlz,avval))
bset1 = bset1[rorder,]
backorder = (1:N1)[order(rorder)]
classlabels = c(rep("C",lc1),rep("N",ln1))[rorder]
class01s = as.integer(ifelse(classlabels == "C",0,1))

# test set
bset2 = t(apply(rbind(cancer2,control2),1,avnrmlz,mean(rbind(cancer2,control2))))
classlabels2 = c(rep("C",lc2),rep("N",ln2))
dimbset1 = dim(bset1)

# generate a t.test at each m/z value
ttestps = apply(bset1,2,ttestp,labels=classlabels,label1="C",label2="N")


pboundary = .05/dim(bset1)[2]
# restrict attention to Bonferroni adjusted pvalues<.05
smallps = (which(ttestps < pboundary))

datax = bset1[,smallps];
groupclass = ifelse(classlabels == "C",0,1);


I = dim(datax)[1]
J = dim(datax)[2]
weights = rep(1,I)
k = 1
nps = rep(0,1000)
cps = nps; alphams = nps; matchtypes = nps
nerrsvec = nps

weightmat = matrix(rep(0,K*I),ncol=K)
while(k <= K){
weightmat[,k] = weights
loglikes = rep(0,J)
cvals = loglikes
nerrs = cvals
# for each m/z value finds the best (likelihood criteria) cutpoint
# this function returns the vector of loglikelihoods, best cutpoints
# and number of classification errors
what = .Fortran("findmleall",as.double(datax),
as.integer(I),as.integer(J),as.double(weights),
as.integer(groupclass),as.double(rep(1,J)),as.double(rep(1,J)),
as.double(rep(1,J)))
names(what) = c("means","I","J","weights","groupclass","loglikes",
"cvals","nerrs")
loglikes = what$loglikes
cvals = what$cvals
nerrs = what$nerrs

# The best one chosen is that which minimizes the number
# of errors
newpeakpos = order(abs(nerrs-floor(I/2)),decreasing=T)[1]
newcutval = cvals[newpeakpos]
peakvals = datax[,newpeakpos]
matchtype = ifelse(nerrs[newpeakpos] < .5*I,1,0)
# matchtype = 1 means few n12+n21 mismatches/counts
# 0 means few n11+n22 mismatches/counts

# see the description of this algorithm in Hastie et al.,
# section 10.1
temp = ll2(newcutval,groupclass,peakvals,weights)
if(matchtype == 1) {errm = (1/I)*(temp[[3]]+temp[[4]])} else {
errm = (1/I)*(temp[[2]]+temp[[5]])}


errm = min(errm,.999)
errm = max(errm,.001)
alpham = log((1-errm)/errm)


matches = ifelse( (groupclass == 0 & peakvals <= newcutval) |
(groupclass == 1 & peakvals > newcutval) , 1,0)
if(matchtype == 0) {mismatches = matches} else {
mismatches = 1-matches}
 weights = weights*exp(alpham*mismatches)
 weights = I*weights/sum(weights)

nps[k] = newpeakpos
cps[k] = newcutval
nerrsvec[k] = nerrs[newpeakpos]
alphams[k] = alpham
matchtypes[k] = matchtype
# periodically print results
if( k %% 25 == 0){
predvals = rep(0,I)
for(m in 1:k){
  if(matchtypes[m] == 1) {
    addons = alphams[m]*(-1+2*sign(datax[,nps[m]] > cps[m]))} else {
    addons = alphams[m]*(-1+2*sign(datax[,nps[m]] <= cps[m]))} 
  predvals = predvals + addons      
}
minpred = min(abs(predvals))
minmargin = minpred/k
print(paste(k," ",minmargin," ",date()))
}
k = k + 1
}

# evaluate test set
datax2 = bset2[,smallps]
groupclass2 = ifelse(classlabels2 == "C",0,1)


predvals = rep(0,dim(datax2)[1])
for(m in 1:(k-1)){
  if(matchtypes[m] == 1) {
    addons = alphams[m]*(-1+2*sign(datax2[,nps[m]] > cps[m]))} else {
    addons = alphams[m]*(-1+2*sign(datax2[,nps[m]] <= cps[m]))} 
  predvals = predvals + addons    
}

tableboost = table(groupclass2,sign(predvals))

outcomepreds[z,] = c(
             (tableboost[1,1])/(tableboost[1,1]+tableboost[1,2]),
(tableboost[1,1]+tableboost[1,2]),
             (tableboost[2,2])/(tableboost[2,2]+tableboost[2,1]),
(tableboost[2,2]+tableboost[2,1]))


print(paste("cancer1 =",cancer1[1,1]))
print(paste("z =",z))
print(outcomepreds[z,])
}


save(list=c("outcomepreds"),file=paste(datadir,"boostres-",as.character(lookuppt),"-",as.character(csize),"-",as.character(nsize),"-",as.character(K),".rda",sep=""))