# The parameters below should be set and have the comment symbols # removed. # csize is size of cancer group in training set # nsize is size of control group in training set # datadir holds the wvec, cancer, and control.rda files # dlldir holds fitnessfuncf.dll or .so file # gafuncdir hold the readgafuncs.r file # z controls the random seed # nboots is the number of bootstraps done per resampling # p1/1000 is p1 in the paper, p2/1000 is p2 in the paper #rm(list=ls()) #ngens = 250; p1 = 2; p2 = 0; csize = 81; nsize=46; #datadir = "/data/nealjeff/ciphergen/"; #dlldir = "/data/nealjeff/biowulf/"; forext = "so";dottype="." #gafuncdir = "/data/nealjeff/qstar/"; #lookuppt = 0 #z = 3 #nboots = 20 setwd(datadir) source(paste(gafuncdir,"readgafuncs.r",sep="")) leftpoint = lookup(lookuppt) if(wvec[leftpoint] < lookuppt) leftpoint = leftpoint + 1 set.seed(z) penalty1 = p1/1000; penalty2 = p2/1000 # initialize the space to hold results nbres = NULL dimcan = dim(cancer) dimcon = dim(control) # the full sample -- used to evaluate bias finset = rbind(cancer[,leftpoint:dimcan[2]], control[,leftpoint:dimcon[2]]) for(nb in 1:nboots){ # a place to write output to monitor sink(paste(writedir,"highboot-", as.character(lookuppt),"-",as.character(csize),"-",as.character(nsize),"-", as.character(p1),"-",as.character(p2),"-",as.character(z),"-", as.character(ngens),".txt",sep="")) set.seed(z) # this was done so can easily check nbth iteration to verify # results listcancernew = sample(dimcan[1],replace=T) listcontrolnew = sample(dimcon[1],replace=T) listcancertrain = listcancernew[1:csize] listcontroltrain = listcontrolnew[1:nsize] listcancertest = listcancernew[(csize+1):dimcan[1]] listcontroltest = listcontrolnew[(nsize+1):dimcon[1]] lc2 = length(listcancertest);ln2 = length(listcontroltest) # training set bset1 = rbind(cancer[listcancertrain,leftpoint:dimcan[2]], control[listcontroltrain,leftpoint:dimcon[2]]) # putting set.seed here changes the order of bset # but the composition of which cases are controls and cancer is the same # (for different nb) for a given value of z set.seed(z*100+nb) N1 = csize+nsize rorder = sample(1:N1) bset1 = bset1[rorder,] backorder = (1:N1)[order(rorder)] classlabels = c(rep("C",csize),rep("N",nsize))[rorder] class01s = as.integer(ifelse(classlabels == "C",0,1)) # test set bset2 = rbind(cancer[listcancertest,leftpoint:dimcan[2]], control[listcontroltest,leftpoint:dimcon[2]]) classlabels2 = c(rep("C",lc2),rep("N",ln2)) dimbset1 = dim(bset1) nchromes = 1500 ngenes = dim(bset1)[2] maxlen=20; minlen=5; chrome.sizes = sample(c(minlen:maxlen),nchromes,replace=T) chromes = matrix(rep(0,nchromes*maxlen),nrow=nchromes) for(i in 1:nchromes){ chromes[i,1:chrome.sizes[i]] = sample(1:ngenes,chrome.sizes[i],replace=F) } tempchromes=chromes; tempsizes = chrome.sizes best = c(0,0,0,0,rep(0,maxlen)) n = 1 while(n <= ngens){ chromes = tempchromes chrome.sizes = tempsizes # mutate for(i in 1:nchromes){ chromes[i,1:chrome.sizes[i]] = mutateme(chromes[i,1:chrome.sizes[i]]) } # evaluate fitness of nchromes chromosomes fitnessmat = matrix(rep(0,nchromes*2),ncol=2) temp = .Fortran("fitnessmatf",as.double(bset1), as.double(fitnessmat),as.integer(chromes), as.integer(chrome.sizes),as.integer(nchromes),as.integer(dimbset1[1]), as.integer(dimbset1[2]),as.integer(maxlen),as.integer(class01s)) fitnessmat = matrix(temp[[2]],ncol=2) fitnessmat = signif(fitnessmat,digits=10) fpenalty = fitnessmat[,1]-penalty1*fitnessmat[,2]-penalty2*chrome.sizes fitorder = order(fpenalty,-fitnessmat[,2]) backfitorder = (1:nchromes)[order(fitorder)] phi = 2 alpha = (2*nchromes-phi*(nchromes +1))/(nchromes*(nchromes-1)) beta = 2*(phi-1)/(nchromes*(nchromes-1)) tempp = alpha+beta*c(1:nchromes) choosep = tempp[backfitorder] # create progeny tempchromes = matrix(rep(0,nchromes*maxlen),nrow=nchromes) tempsizes = rep(0,nchromes) i = 1 while(i <= nchromes/2){ sample2 = sample(1:nchromes,2,replace=F, prob=choosep) xind = sample2[1];yind=sample2[2] temp = makekids(chromes[xind,1:chrome.sizes[xind]], chromes[yind,1:chrome.sizes[yind]],maxlen) xlen = length(temp$x); ylen = length(temp$y) tempchromes[(2*(i-1)+1),1:xlen] = temp$x tempchromes[(2*i),1:ylen] = temp$y tempsizes[(2*(i-1)+1)] = xlen tempsizes[2*i] = ylen if(xlen < 2 || ylen < 2){i = i - 1} i = i + 1 } bestind = fitorder[nchromes] if(fpenalty[bestind] > best[3]){ best[1:2] = fitnessmat[bestind,] best[3] = fpenalty[bestind] best[4] = chrome.sizes[bestind] best[5:(4+best[4])] = chromes[bestind,(1:chrome.sizes[bestind])] } print(date()); print(paste(n," ",nb," ",mean(fitnessmat[,1]))); print(c(fitnessmat[bestind,],fpenalty[bestind],best[3])) print(sort(chromes[bestind,1:chrome.sizes[bestind]])) nfinal = n if(fitnessmat[bestind,1] ==1) { n = ngens print("A solution was found") best[1:2] = fitnessmat[bestind,] best[3] = fpenalty[bestind] best[4] = chrome.sizes[bestind] best[5:(4+best[4])] = chromes[bestind,(1:chrome.sizes[bestind])] } n = n + 1 } print(paste("z = ",z," nb = ", nb)) print(paste("best ")) print(best[1:4]) evalpoints = sort( best[5:(4+best[4])]) mat = t(apply(bset1[,evalpoints],1,nrmlz)) dimmat = dim(mat); i = dimmat[1]; j = dimmat[2] mat0 = matrix(rep(0,i*j),ncol=j) temp = .Fortran("fitnessfuncf",as.double(mat),as.integer(i), as.integer(j),as.double(mat0),as.integer(rep(0,i)),as.integer(0), as.integer(rep(0,i)),class01s, as.integer(rep(0,i)), as.double(0), as.double(rep(0,i)), as.integer(maxlen)) temp = list(matrix(temp[[1]],ncol=j),temp[[2]],temp[[3]],matrix(temp[[4]],ncol=j)[1:temp[[6]],],temp[[5]],temp[[6]],temp[[7]][1:temp[[6]]], temp[[8]],temp[[9]][1:temp[[6]]],temp[[10]],temp[[11]][1:temp[[6]]]) names(temp) = c("mat","i","j","centers","parent","totclusters","ninclusters","class01s","n0inclusters","purity","purities") clustertype = ifelse(temp$n0inclusters/temp$ninclusters >= .5,0,1) mat = t(apply(bset2[,evalpoints],1,nrmlz)) results = matrix(rep(0,dim(mat)[1]*4),ncol=4) cutdist = .1*sqrt(length(evalpoints)) for(k in 1:dim(mat)[1]){ distlist = apply(temp$centers,1,ndist,y=mat[k,]) results[k,1] = which(distlist==min(distlist))[1] results[k,2] = distlist[results[k,1]] results[k,3] = clustertype[results[k,1]] results[k,4] = ifelse(results[k,2] <= cutdist, results[k,3],2) } table1 = table(results[,3],classlabels2) table2 = table(results[,4],classlabels2) if(dim(table2)[1] > 1){ outcomepurities = c(nfinal, (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]), (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]), (table2[1,1])/(table1[1,1]+table1[2,1]),(table2[1,1]+table2[2,1]), (table2[2,2])/(table1[1,2]+table1[2,2]),(table2[1,2]+table2[2,2]), length(evalpoints),best[1:4]) } else { outcomepurities = c(nfinal, (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]), (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]), 0,0, 0,0, length(evalpoints),best[1:4]) } blanks = rep(0,maxlen) blanks[1:length(evalpoints)] = evalpoints nbres = rbind(nbres, c(z, nb, (sum(table1)-table1[2,1]-table1[1,2])/sum(table1), (sum(table1)-table1[2,1]-table1[1,2])/sum(table1)-penalty1*temp$totclusters-penalty2*length(evalpoints),temp$totclusters,length(evalpoints),blanks)) sink() } # Order in terms of test set error and number of clusters nborder = order(nbres[,3],-nbres[,5],decreasing=T) bestboot = nborder[1] evalpoints = nbres[bestboot,7:(7+(nbres[bestboot,6]-1))] # now reget the clusters for this best chromosome mat = t(apply(bset1[,evalpoints],1,nrmlz)) dimmat = dim(mat); i = dimmat[1]; j = dimmat[2] mat0 = matrix(rep(0,i*j),ncol=j) temp = .Fortran("fitnessfuncf",as.double(mat),as.integer(i), as.integer(j),as.double(mat0),as.integer(rep(0,i)),as.integer(0), as.integer(rep(0,i)),class01s, as.integer(rep(0,i)), as.double(0), as.double(rep(0,i)), as.integer(maxlen)) temp = list(matrix(temp[[1]],ncol=j),temp[[2]],temp[[3]],matrix(temp[[4]],ncol=j)[1:temp[[6]],],temp[[5]],temp[[6]],temp[[7]][1:temp[[6]]], temp[[8]],temp[[9]][1:temp[[6]]],temp[[10]],temp[[11]][1:temp[[6]]]) names(temp) = c("mat","i","j","centers","parent","totclusters","ninclusters","class01s","n0inclusters","purity","purities") clustertype = ifelse(temp$n0inclusters/temp$ninclusters >= .5,0,1) # now apply to whole sample, not the bootstrap test set mat = t(apply(finset[,evalpoints],1,nrmlz)) results = matrix(rep(0,dim(mat)[1]*4),ncol=4) cutdist = .1*sqrt(length(evalpoints)) for(k in 1:dim(mat)[1]){ distlist = apply(temp$centers,1,ndist,y=mat[k,]) results[k,1] = which(distlist==min(distlist))[1] results[k,2] = distlist[results[k,1]] results[k,3] = clustertype[results[k,1]] results[k,4] = ifelse(results[k,2] <= cutdist, results[k,3],2) } table1 = table(results[,3],c(rep("C",dimcan[1]),rep("N",dimcon[1]))) table2 = table(results[,4],c(rep("C",dimcan[1]),rep("N",dimcon[1]))) if(dim(table2)[1] > 1){ outcomepurities = c(nfinal, (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]), (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]), (table2[1,1])/(table1[1,1]+table1[2,1]),(table2[1,1]+table2[2,1]), (table2[2,2])/(table1[1,2]+table1[2,2]),(table2[1,2]+table2[2,2]), length(evalpoints),nbres[bestboot,1:6]) } else { outcomepurities = c(nfinal, (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]), (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]), 0,0, 0,0, length(evalpoints),nbres[bestboot,1:6]) } save(list=c("nbres","outcomepurities"),file=paste(writedir,"highboot-", as.character(lookuppt),"-",as.character(csize),"-",as.character(nsize),"-", as.character(p1),"-",as.character(p2),"-",as.character(z),"-", as.character(ngens),".rda",sep=""))