#The parameters below should be set and comment symbols removed #csize is the size of the cancer group in the training set # nsize is size of the control group in the training set # datadir holds the cancer.rda, control.rda, and wvec.rda files # dlldir holds the findmleall.dll or .so file # ntrials is the number of resampling # K is the number of threshold classifiers # that are used. This could have been picked through # cross-validation but 150 was the first choice and # it seemed to work well -- see the cited references # in the text for more information #rm(list=ls()) #csize = 81; nsize=46; #datadir = "/data/nealjeff/ciphergen/"; #dlldir = "/data/nealjeff/qstar/"; forext = "so" #ntrials = 50 #lookuppt = 0;dottype="." #K = 150 setwd(datadir) lookup <- function(point,data=wvec){ lookup <- which(abs(data - point) == min(abs(data - point))) lookup[1] } load(paste(datadir,"cancer.rda",sep="")) load(paste(datadir,"control.rda",sep="")) load(paste(datadir,"wvec.rda",sep="")) dyn.load(paste(dlldir,"findmleall",dottype,forext,sep="")) ll2 = function(cutpoint,groupclass,peakvals,weights){ n11 = sum(weights[which(groupclass == 0 & peakvals <= cutpoint)]) n21 = sum(weights[which(groupclass == 1 & peakvals <= cutpoint)]) n12 = sum(weights[which(groupclass == 0 & peakvals > cutpoint)]) n22 = sum(weights[which(groupclass == 1 & peakvals > cutpoint)]) ll = ifelse(n11 == 0, 0, n11*log(n11/(n11 + n21)))+ ifelse(n21 == 0, 0, n21*log(n21/(n11 + n21)))+ ifelse(n12 == 0, 0, n12*log(n12/(n12 + n22)))+ ifelse(n22 == 0, 0, n22*log(n22/(n12 + n22))) output = c(ll,n11,n21,n12,n22) names(output) = c("logl","n11","n21","n12","n22") return(output) } avnrmlz = function(vec,avval){ avnrmlz = vec*avval/mean(vec) } ttestp = function(y,labels,label1,label2){ ttestp = t.test(y[labels == label1], y[labels == label2])$p.value ttestp } tteststat = function(y,labels,label1,label2){ ttestp = t.test(y[labels == label1], y[labels == label2])$statistic ttestp } leftpoint = lookup(lookuppt) if(wvec[leftpoint] < lookuppt) leftpoint = leftpoint + 1 # holds the data of interest outcomepreds = matrix(rep(0,4*ntrials),ncol=4) for(z in 1:ntrials){ set.seed(z) dimcan = dim(cancer) dimcon = dim(control) listcancer1 = sample(dimcan[1],csize) listcontrol1 = sample(dimcon[1],nsize) cancer1 = cancer[listcancer1,leftpoint:dimcan[2]]; control1 = control[listcontrol1,leftpoint:dimcon[2]] lc1 = length(listcancer1); #lc2 = length(listcancer2); ln1 = length(listcontrol1);# ln2 = length(listcontrol2); N1 = lc1+ln1 rorder = sample(1:N1) backorder = (1:N1)[order(rorder)] cancer2 = cancer[-listcancer1,leftpoint:dimcan[2]]; control2 = control[-listcontrol1,leftpoint:dimcon[2]] lc2 = dim(cancer2)[1]; ln2 = dim(control2)[1] avval = mean(rbind(cancer1,control1)) # bset1 is training set bset1 = t(apply(rbind(cancer1,control1),1,avnrmlz,avval)) bset1 = bset1[rorder,] backorder = (1:N1)[order(rorder)] classlabels = c(rep("C",lc1),rep("N",ln1))[rorder] class01s = as.integer(ifelse(classlabels == "C",0,1)) # test set bset2 = t(apply(rbind(cancer2,control2),1,avnrmlz,mean(rbind(cancer2,control2)))) classlabels2 = c(rep("C",lc2),rep("N",ln2)) dimbset1 = dim(bset1) # generate a t.test at each m/z value ttestps = apply(bset1,2,ttestp,labels=classlabels,label1="C",label2="N") pboundary = .05/dim(bset1)[2] # restrict attention to Bonferroni adjusted pvalues<.05 smallps = (which(ttestps < pboundary)) datax = bset1[,smallps]; groupclass = ifelse(classlabels == "C",0,1); I = dim(datax)[1] J = dim(datax)[2] weights = rep(1,I) k = 1 nps = rep(0,1000) cps = nps; alphams = nps; matchtypes = nps nerrsvec = nps weightmat = matrix(rep(0,K*I),ncol=K) while(k <= K){ weightmat[,k] = weights loglikes = rep(0,J) cvals = loglikes nerrs = cvals # for each m/z value finds the best (likelihood criteria) cutpoint # this function returns the vector of loglikelihoods, best cutpoints # and number of classification errors what = .Fortran("findmleall",as.double(datax), as.integer(I),as.integer(J),as.double(weights), as.integer(groupclass),as.double(rep(1,J)),as.double(rep(1,J)), as.double(rep(1,J))) names(what) = c("means","I","J","weights","groupclass","loglikes", "cvals","nerrs") loglikes = what$loglikes cvals = what$cvals nerrs = what$nerrs # The best one chosen is that which minimizes the number # of errors newpeakpos = order(abs(nerrs-floor(I/2)),decreasing=T)[1] newcutval = cvals[newpeakpos] peakvals = datax[,newpeakpos] matchtype = ifelse(nerrs[newpeakpos] < .5*I,1,0) # matchtype = 1 means few n12+n21 mismatches/counts # 0 means few n11+n22 mismatches/counts # see the description of this algorithm in Hastie et al., # section 10.1 temp = ll2(newcutval,groupclass,peakvals,weights) if(matchtype == 1) {errm = (1/I)*(temp[[3]]+temp[[4]])} else { errm = (1/I)*(temp[[2]]+temp[[5]])} errm = min(errm,.999) errm = max(errm,.001) alpham = log((1-errm)/errm) matches = ifelse( (groupclass == 0 & peakvals <= newcutval) | (groupclass == 1 & peakvals > newcutval) , 1,0) if(matchtype == 0) {mismatches = matches} else { mismatches = 1-matches} weights = weights*exp(alpham*mismatches) weights = I*weights/sum(weights) nps[k] = newpeakpos cps[k] = newcutval nerrsvec[k] = nerrs[newpeakpos] alphams[k] = alpham matchtypes[k] = matchtype # periodically print results if( k %% 25 == 0){ predvals = rep(0,I) for(m in 1:k){ if(matchtypes[m] == 1) { addons = alphams[m]*(-1+2*sign(datax[,nps[m]] > cps[m]))} else { addons = alphams[m]*(-1+2*sign(datax[,nps[m]] <= cps[m]))} predvals = predvals + addons } minpred = min(abs(predvals)) minmargin = minpred/k print(paste(k," ",minmargin," ",date())) } k = k + 1 } # evaluate test set datax2 = bset2[,smallps] groupclass2 = ifelse(classlabels2 == "C",0,1) predvals = rep(0,dim(datax2)[1]) for(m in 1:(k-1)){ if(matchtypes[m] == 1) { addons = alphams[m]*(-1+2*sign(datax2[,nps[m]] > cps[m]))} else { addons = alphams[m]*(-1+2*sign(datax2[,nps[m]] <= cps[m]))} predvals = predvals + addons } tableboost = table(groupclass2,sign(predvals)) outcomepreds[z,] = c( (tableboost[1,1])/(tableboost[1,1]+tableboost[1,2]), (tableboost[1,1]+tableboost[1,2]), (tableboost[2,2])/(tableboost[2,2]+tableboost[2,1]), (tableboost[2,2]+tableboost[2,1])) print(paste("cancer1 =",cancer1[1,1])) print(paste("z =",z)) print(outcomepreds[z,]) } save(list=c("outcomepreds"),file=paste(datadir,"boostres-",as.character(lookuppt),"-",as.character(csize),"-",as.character(nsize),"-",as.character(K),".rda",sep=""))