# The file below is run for each combination of p1 and p2 # discussed in the paper in Tables 2, 3, and 4. rm(list=ls()) # ngens is number of generations -- 250 in the paper's tables # p1/1000 = p1 in the paper, p2/1000 = p2 in the paper # ntrials is the number of resamplings (50). ngens = 250; p1 = 2; p2 = 1; csize = 81; nsize=46; datadir = "c:/data/ciphergen/"; ntrials = 50 dlldir = "c:/work/ciphergen/"; forext = "dll" lookuppt = 0 setwd(datadir) # A series of functions for the # genentic algorithm lookup <- function(point,data=wvec){ lookup <- which(abs(data - point) == min(abs(data - point))) lookup[1] } # read in the data load(paste(datadir,"cancer.rda",sep="")) load(paste(datadir,"control.rda",sep="")) load(paste(datadir,"wvec.rda",sep="")) ndist = function(x,y){ sqrt(sum((x-y)^2)) } nrmlz = function(rowvec){ minrow = min(rowvec); maxrow = max(rowvec); if(maxrow > minrow){ nrmlz = (rowvec-minrow)/(maxrow-minrow) } else { nrmlz = rep(.5,length(rowvec)) } nrmlz } mutateme = function(cvec,pmut=.0002){ lvec = length(cvec) changes = rbinom(n=lvec,size=1,prob=pmut) changehere = which(changes==1) numchanges = length(changehere) if(numchanges > 0){ cvec[changehere] = sample(c(1:ngenes)[-cvec],numchanges) } cvec } makekids = function(x,y,maxlen){ xlen = length(x); ylen = length(y); crossx = sample(1:(xlen-1),1); crossy = sample(1:(ylen-1),1) newx = unique(c(x[1:crossx],y[(crossy+1):ylen])) newy = unique(c(y[1:crossy],x[(crossx+1):xlen])) newx = newx[1:min(length(newx),maxlen)] newy = newy[1:min(length(newy),maxlen)] makekids = list(newx,newy) names(makekids) = c("x","y") return(makekids) } dyn.load(paste(dlldir,"fitnessfuncf.",forext,sep="")) # initialization of variables to save outcomepurities = matrix(rep(0,14*ntrials),ncol=14) outcomechromes = matrix(rep(0,20*ntrials),ncol=20) outcomeparams = c(ngens,p1,p2) # establish left endpoint of the data leftpoint = lookup(lookuppt) if(wvec[leftpoint] < lookuppt) leftpoint = leftpoint + 1 # loop for each resampling # much of what follows is the same as # runga.r -- see that program for # some comments for(z in 1:ntrials){ set.seed(z) dimcan = dim(cancer) dimcon = dim(control) listcancer1 = sample(dimcan[1],csize) listcontrol1 = sample(dimcon[1],nsize) cancer1 = cancer[listcancer1,leftpoint:dimcan[2]]; control1 = control[listcontrol1,leftpoint:dimcon[2]] lc1 = length(listcancer1); #lc2 = length(listcancer2); ln1 = length(listcontrol1);# ln2 = length(listcontrol2); N1 = lc1+ln1 rorder = sample(1:N1) backorder = (1:N1)[order(rorder)] cancer2 = cancer[-listcancer1,leftpoint:dimcan[2]]; control2 = control[-listcontrol1,leftpoint:dimcon[2]] lc2 = dim(cancer2)[1]; ln2 = dim(control2)[1] penalty1 = p1/1000 ; penalty2 = p2/1000 bset1 = rbind(cancer1, control1) bset1 = bset1[rorder,] backorder = (1:N1)[order(rorder)] classlabels = c(rep("C",lc1),rep("N",ln1))[rorder] class01s = as.integer(ifelse(classlabels == "C",0,1)) bset2 = rbind(cancer2,control2) classlabels2 = c(rep("C",lc2),rep("N",ln2)) dimbset1 = dim(bset1) nchromes = 1500 ngenes = dim(bset1)[2] maxlen=20; minlen=5; chrome.sizes = sample(c(minlen:maxlen),nchromes,replace=T) chromes = matrix(rep(0,nchromes*maxlen),nrow=nchromes) for(i in 1:nchromes){ chromes[i,1:chrome.sizes[i]] = sample(1:ngenes,chrome.sizes[i],replace=F) } tempchromes=chromes; tempsizes = chrome.sizes best = c(0,0,0,0,rep(0,maxlen)) n = 1 while(n <= ngens){ chromes = tempchromes chrome.sizes = tempsizes # mutate for(i in 1:nchromes){ chromes[i,1:chrome.sizes[i]] = mutateme(chromes[i,1:chrome.sizes[i]]) } # evaluate fitness of nchromes chromosomes fitnessmat = matrix(rep(0,nchromes*2),ncol=2) temp = .Fortran("fitnessmatf",as.double(bset1), as.double(fitnessmat),as.integer(chromes), as.integer(chrome.sizes),as.integer(nchromes),as.integer(dimbset1[1]), as.integer(dimbset1[2]),as.integer(maxlen),as.integer(class01s)) fitnessmat = matrix(temp[[2]],ncol=2) fitnessmat = signif(fitnessmat,digits=10) fpenalty = fitnessmat[,1]-penalty1*fitnessmat[,2]-penalty2*chrome.sizes fitorder = order(fpenalty,-fitnessmat[,2]) backfitorder = (1:nchromes)[order(fitorder)] phi = 2 alpha = (2*nchromes-phi*(nchromes +1))/(nchromes*(nchromes-1)) beta = 2*(phi-1)/(nchromes*(nchromes-1)) tempp = alpha+beta*c(1:nchromes) choosep = tempp[backfitorder] # create progeny tempchromes = matrix(rep(0,nchromes*maxlen),nrow=nchromes) tempsizes = rep(0,nchromes) i = 1 while(i <= nchromes/2){ sample2 = sample(1:nchromes,2,replace=F, prob=choosep) xind = sample2[1];yind=sample2[2] temp = makekids(chromes[xind,1:chrome.sizes[xind]], chromes[yind,1:chrome.sizes[yind]],maxlen) xlen = length(temp$x); ylen = length(temp$y) tempchromes[(2*(i-1)+1),1:xlen] = temp$x tempchromes[(2*i),1:ylen] = temp$y tempsizes[(2*(i-1)+1)] = xlen tempsizes[2*i] = ylen if(xlen < 2 || ylen < 2){i = i - 1} i = i + 1 } bestind = fitorder[nchromes] if(fpenalty[bestind] > best[3]){ best[1:2] = fitnessmat[bestind,] best[3] = fpenalty[bestind] best[4] = chrome.sizes[bestind] best[5:(4+best[4])] = chromes[bestind,(1:chrome.sizes[bestind])] } print(date()); print(paste(n," ",mean(fitnessmat[,1]))); print(c(fitnessmat[bestind,],fpenalty[bestind],best[3])) print(sort(chromes[bestind,1:chrome.sizes[bestind]])) nfinal = n if(fitnessmat[bestind,1] ==1) { n = ngens print("A solution was found") best[1:2] = fitnessmat[bestind,] best[3] = fpenalty[bestind] best[4] = chrome.sizes[bestind] best[5:(4+best[4])] = chromes[bestind,(1:chrome.sizes[bestind])] } n = n + 1 } print(paste("cancer1 =",cancer1[1,1])) print(paste("z = ",z)) print(paste("best ")) print(best[1:4]) evalpoints = sort( best[5:(4+best[4])]) mat = t(apply(bset1[,evalpoints],1,nrmlz)) dimmat = dim(mat); i = dimmat[1]; j = dimmat[2] mat0 = matrix(rep(0,i*j),ncol=j) temp = .Fortran("fitnessfuncf",as.double(mat),as.integer(i), as.integer(j),as.double(mat0),as.integer(rep(0,i)),as.integer(0), as.integer(rep(0,i)),class01s, as.integer(rep(0,i)), as.double(0), as.double(rep(0,i)), as.integer(maxlen)) temp = list(matrix(temp[[1]],ncol=j),temp[[2]],temp[[3]],matrix(temp[[4]],ncol=j)[1:temp[[6]],],temp[[5]],temp[[6]],temp[[7]][1:temp[[6]]], temp[[8]],temp[[9]][1:temp[[6]]],temp[[10]],temp[[11]][1:temp[[6]]]) names(temp) = c("mat","i","j","centers","parent","totclusters","ninclusters","class01s","n0inclusters","purity","purities") clustertype = ifelse(temp$n0inclusters/temp$ninclusters >= .5,0,1) mat = t(apply(bset2[,evalpoints],1,nrmlz)) results = matrix(rep(0,dim(mat)[1]*4),ncol=4) cutdist = .1*sqrt(length(evalpoints)) for(k in 1:dim(mat)[1]){ distlist = apply(temp$centers,1,ndist,y=mat[k,]) results[k,1] = which(distlist==min(distlist))[1] results[k,2] = distlist[results[k,1]] results[k,3] = clustertype[results[k,1]] results[k,4] = ifelse(results[k,2] <= cutdist, results[k,3],2) } table1 = table(results[,3],classlabels2) table2 = table(results[,4],classlabels2) if(dim(table2)[1] > 1){ outcomepurities[z,] = c(nfinal, (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]), (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]), (table2[1,1])/(table1[1,1]+table1[2,1]),(table2[1,1]+table2[2,1]), (table2[2,2])/(table1[1,2]+table1[2,2]),(table2[1,2]+table2[2,2]), length(evalpoints),best[1:4]) } else { outcomepurities[z,] = c(nfinal, (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]), (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]), 0,0, 0,0, length(evalpoints),best[1:4]) } outcomechromes[z,1:length(evalpoints)] = evalpoints save(list=c("outcomeparams","outcomepurities","outcomechromes"),file=paste(datadir,"res-",as.character(lookuppt),"-",as.character(csize),"-",as.character(nsize),"-", as.character(p1),"-",as.character(p2),"-",as.character(ngens),".rda",sep="")) } # end the resampling loop outcomeparams = c(ngens,p1,p2) save(list=c("outcomeparams","outcomepurities","outcomechromes"),file=paste(datadir,"res-",as.character(lookuppt),"-",as.character(csize),"-",as.character(nsize),"-", as.character(p1),"-",as.character(p2),"-",as.character(ngens),".rda",sep=""))