# number of chromosomes in each generation nchromes = 1500 # number of possible genes/markers to construct chromosomes ngenes = dim(bset1)[2] # limits on size of initial chromosomes maxlen=20; minlen=5; chrome.sizes = sample(c(minlen:maxlen),nchromes,replace=T) chromes = matrix(rep(0,nchromes*maxlen),nrow=nchromes) # create initial set of chromosomes for(i in 1:nchromes){ chromes[i,1:chrome.sizes[i]] = sample(1:ngenes,chrome.sizes[i],replace=F) } tempchromes=chromes; tempsizes = chrome.sizes # this will hold results of best chromosome # first four spots hold performance measures, last maxlen # hold indices for chromosomes best = c(0,0,0,0,rep(0,maxlen)) n = 1 # ngens is the number of generations - 250 usually while(n <= ngens){ chromes = tempchromes chrome.sizes = tempsizes # mutate for(i in 1:nchromes){ chromes[i,1:chrome.sizes[i]] = mutateme(chromes[i,1:chrome.sizes[i]]) } # initialize structure to hold fitness of nchromes chromosomes fitnessmat = matrix(rep(0,nchromes*2),ncol=2) # fortran code that evaluates all 1500 chromosomes # and returns for each cluster the classification accuracy # (based on training data here) and the number of clusters. temp = .Fortran("fitnessmatf",as.double(bset1), as.double(fitnessmat),as.integer(chromes), as.integer(chrome.sizes),as.integer(nchromes),as.integer(dimbset1[1]), as.integer(dimbset1[2]),as.integer(maxlen),as.integer(class01s)) # first column shows classification accuracy # second column shows number of clusters for all 1500 fitnessmat = matrix(temp[[2]],ncol=2) # rounded to make comparable -- some small numbers # have rounding issues on different processors/languages (R and FORTRAN) fitnessmat = signif(fitnessmat,digits=10) # penalized fitness fpenalty = fitnessmat[,1]-penalty1*fitnessmat[,2]-penalty2*chrome.sizes # order by penalty then number of clusters fitorder = order(fpenalty,-fitnessmat[,2]) # order backfitorder = (1:nchromes)[order(fitorder)] # alpha and beta as described in paper phi = 2 alpha = (2*nchromes-phi*(nchromes +1))/(nchromes*(nchromes-1)) beta = 2*(phi-1)/(nchromes*(nchromes-1)) tempp = alpha+beta*c(1:nchromes) # assign probabilities by the ordering choosep = tempp[backfitorder] # create progeny -- the next generation tempchromes = matrix(rep(0,nchromes*maxlen),nrow=nchromes) tempsizes = rep(0,nchromes) i = 1 while(i <= nchromes/2){ sample2 = sample(1:nchromes,2,replace=F,prob=choosep) xind = sample2[1];yind=sample2[2] temp = makekids(chromes[xind,1:chrome.sizes[xind]],chromes[yind,1:chrome.sizes[yind]],maxlen) xlen = length(temp$x); ylen = length(temp$y) tempchromes[(2*(i-1)+1),1:xlen] = temp$x tempchromes[(2*i),1:ylen] = temp$y tempsizes[(2*(i-1)+1)] = xlen tempsizes[2*i] = ylen if(xlen < 2 || ylen < 2){i = i - 1} i = i + 1 } # index of the best chromosome in this round of 1500 bestind = fitorder[nchromes] # if this best better than previous best (over previous rounds of 1500) # then make it the overall best if(fpenalty[bestind] > best[3]){ best[1:2] = fitnessmat[bestind,] best[3] = fpenalty[bestind] best[4] = chrome.sizes[bestind] best[5:(4+best[4])] = chromes[bestind,(1:chrome.sizes[bestind])] } print(date()); print(paste(n," ",mean(fitnessmat[,1]))); print(c(fitnessmat[bestind,],fpenalty[bestind],best[3])) print(sort(chromes[bestind,1:chrome.sizes[bestind]])) nfinal = n # if a perfect discriminator is found record it and # set the looping index to ngens, usually 250 if(fitnessmat[bestind,1] ==1) { n = ngens print("A solution was found") best[1:2] = fitnessmat[bestind,] best[3] = fpenalty[bestind] best[4] = chrome.sizes[bestind] best[5:(4+best[4])] = chromes[bestind,(1:chrome.sizes[bestind])] } n = n + 1 # end the loop of 250 } # print first element to check that different algorithms # are looking at the same data print(paste("cancer1 =",cancer1[1,1])) print(paste("z = ",z)) print(paste("best ")) print(best[1:4]) evalpoints = sort( best[5:(4+best[4])]) # re-evaluate the best chromosome mat = t(apply(bset1[,evalpoints],1,nrmlz)) dimmat = dim(mat); i = dimmat[1]; j = dimmat[2] mat0 = matrix(rep(0,i*j),ncol=j) temp = .Fortran("fitnessfuncf",as.double(mat),as.integer(i), as.integer(j),as.double(mat0),as.integer(rep(0,i)),as.integer(0), as.integer(rep(0,i)),class01s, as.integer(rep(0,i)), as.double(0), as.double(rep(0,i)), as.integer(maxlen)) temp = list(matrix(temp[[1]],ncol=j),temp[[2]],temp[[3]],matrix(temp[[4]],ncol=j)[1:temp[[6]],],temp[[5]],temp[[6]],temp[[7]][1:temp[[6]]], temp[[8]],temp[[9]][1:temp[[6]]],temp[[10]],temp[[11]][1:temp[[6]]]) names(temp) = c("mat","i","j","centers","parent","totclusters","ninclusters","class01s","n0inclusters","purity","purities") clustertype = ifelse(temp$n0inclusters/temp$ninclusters >= .5,0,1) # see how that chromosome works on the test set mat = t(apply(bset2[,evalpoints],1,nrmlz)) results = matrix(rep(0,dim(mat)[1]*4),ncol=4) cutdist = .1*sqrt(length(evalpoints)) for(k in 1:dim(mat)[1]){ distlist = apply(temp$centers,1,ndist,y=mat[k,]) results[k,1] = which(distlist==min(distlist))[1] results[k,2] = distlist[results[k,1]] results[k,3] = clustertype[results[k,1]] results[k,4] = ifelse(results[k,2] <= cutdist, results[k,3],2) } # table 1 is usual error rate table1 = table(results[,3],classlabels2) # table 2 is error rate with third category if the # distance criteria of .1*sqrt(chromosome length) is not met # - this was discussed in paper table2 = table(results[,4],classlabels2) if(dim(table2)[1] > 1){ outcomepurities = c(nfinal, (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]), (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]), (table2[1,1])/(table1[1,1]+table1[2,1]),(table2[1,1]+table2[2,1]), (table2[2,2])/(table1[1,2]+table1[2,2]),(table2[1,2]+table2[2,2]), length(evalpoints),best[1:4]) } else { # this branch used if the rare case when all cancer and control cases were # lumped into the same class outcomepurities = c(nfinal, (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]), (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]), 0,0, 0,0, length(evalpoints),best[1:4]) }