# number of chromosomes in each generation
nchromes = 1500
# number of possible genes/markers to construct chromosomes
ngenes = dim(bset1)[2]
# limits on size of initial chromosomes
maxlen=20; minlen=5;
chrome.sizes = sample(c(minlen:maxlen),nchromes,replace=T)


chromes = matrix(rep(0,nchromes*maxlen),nrow=nchromes)

# create initial set of chromosomes
for(i in 1:nchromes){
  chromes[i,1:chrome.sizes[i]] = sample(1:ngenes,chrome.sizes[i],replace=F)
}

tempchromes=chromes; tempsizes = chrome.sizes
# this will hold results of best chromosome
# first four spots hold performance measures, last maxlen
# hold indices for chromosomes
best = c(0,0,0,0,rep(0,maxlen))
n = 1
# ngens is the number of generations - 250 usually
while(n <= ngens){
chromes = tempchromes
chrome.sizes = tempsizes

# mutate
for(i in 1:nchromes){
  chromes[i,1:chrome.sizes[i]] = 
         mutateme(chromes[i,1:chrome.sizes[i]])
}


# initialize structure to hold  fitness of nchromes chromosomes
fitnessmat = matrix(rep(0,nchromes*2),ncol=2)

# fortran code that evaluates all 1500 chromosomes
# and returns for each cluster the classification accuracy
# (based on training data here) and  the number of clusters.

temp = .Fortran("fitnessmatf",as.double(bset1),
  as.double(fitnessmat),as.integer(chromes),
  as.integer(chrome.sizes),as.integer(nchromes),as.integer(dimbset1[1]),
  as.integer(dimbset1[2]),as.integer(maxlen),as.integer(class01s))

# first column shows classification accuracy
# second column shows number of clusters for all 1500
fitnessmat = matrix(temp[[2]],ncol=2)

# rounded to make comparable -- some small numbers
# have rounding issues on different processors/languages (R and FORTRAN)
fitnessmat = signif(fitnessmat,digits=10)

# penalized fitness
fpenalty = fitnessmat[,1]-penalty1*fitnessmat[,2]-penalty2*chrome.sizes


# order by penalty then number of clusters
fitorder = order(fpenalty,-fitnessmat[,2])

# order
backfitorder = (1:nchromes)[order(fitorder)]

# alpha and beta as described in paper
phi = 2
alpha = (2*nchromes-phi*(nchromes +1))/(nchromes*(nchromes-1))
beta = 2*(phi-1)/(nchromes*(nchromes-1))
tempp = alpha+beta*c(1:nchromes)

# assign probabilities by the ordering
choosep = tempp[backfitorder]


# create progeny -- the next generation

tempchromes =  matrix(rep(0,nchromes*maxlen),nrow=nchromes)
tempsizes = rep(0,nchromes)
i = 1
while(i <= nchromes/2){
  sample2 = sample(1:nchromes,2,replace=F,prob=choosep)
  xind = sample2[1];yind=sample2[2]
  temp = makekids(chromes[xind,1:chrome.sizes[xind]],chromes[yind,1:chrome.sizes[yind]],maxlen)
  xlen = length(temp$x); ylen = length(temp$y)
  tempchromes[(2*(i-1)+1),1:xlen] = temp$x
  tempchromes[(2*i),1:ylen] = temp$y
  tempsizes[(2*(i-1)+1)] = xlen
  tempsizes[2*i] = ylen
  if(xlen < 2 || ylen < 2){i = i - 1}
  i = i + 1
}

# index of the best chromosome in this round of 1500
bestind = fitorder[nchromes]

# if this best better than previous best (over previous rounds of 1500)
# then make it the overall best
if(fpenalty[bestind] > best[3]){
  best[1:2] = fitnessmat[bestind,]
  best[3] = fpenalty[bestind]
  best[4] = chrome.sizes[bestind]
  best[5:(4+best[4])] = chromes[bestind,(1:chrome.sizes[bestind])]
}

print(date());
print(paste(n," ",mean(fitnessmat[,1])));
print(c(fitnessmat[bestind,],fpenalty[bestind],best[3]))
print(sort(chromes[bestind,1:chrome.sizes[bestind]]))
nfinal = n
# if a perfect discriminator is found record it and
# set the looping index to ngens, usually 250
if(fitnessmat[bestind,1] ==1) {
  n = ngens
  print("A solution was found")
  best[1:2] = fitnessmat[bestind,]
  best[3] = fpenalty[bestind]
  best[4] = chrome.sizes[bestind]
  best[5:(4+best[4])] = chromes[bestind,(1:chrome.sizes[bestind])]
}
n = n + 1
# end the loop of 250
}

# print first element to check that different algorithms
# are looking at the same data
print(paste("cancer1 =",cancer1[1,1]))
print(paste("z = ",z))
print(paste("best "))
print(best[1:4])


evalpoints = sort( best[5:(4+best[4])])


# re-evaluate the best chromosome
mat = t(apply(bset1[,evalpoints],1,nrmlz))
dimmat = dim(mat); i = dimmat[1]; j = dimmat[2]
mat0 = matrix(rep(0,i*j),ncol=j)
temp = .Fortran("fitnessfuncf",as.double(mat),as.integer(i),
  as.integer(j),as.double(mat0),as.integer(rep(0,i)),as.integer(0),  as.integer(rep(0,i)),class01s,  as.integer(rep(0,i)), as.double(0), as.double(rep(0,i)),
as.integer(maxlen))
  temp = list(matrix(temp[[1]],ncol=j),temp[[2]],temp[[3]],matrix(temp[[4]],ncol=j)[1:temp[[6]],],temp[[5]],temp[[6]],temp[[7]][1:temp[[6]]],
    temp[[8]],temp[[9]][1:temp[[6]]],temp[[10]],temp[[11]][1:temp[[6]]])
  names(temp) = c("mat","i","j","centers","parent","totclusters","ninclusters","class01s","n0inclusters","purity","purities")
clustertype = ifelse(temp$n0inclusters/temp$ninclusters >= .5,0,1)

# see how that chromosome works on the test set
mat = t(apply(bset2[,evalpoints],1,nrmlz))
results = matrix(rep(0,dim(mat)[1]*4),ncol=4)
cutdist = .1*sqrt(length(evalpoints))
for(k in 1:dim(mat)[1]){
  distlist = apply(temp$centers,1,ndist,y=mat[k,])
  results[k,1] = which(distlist==min(distlist))[1]
  results[k,2] = distlist[results[k,1]]
  results[k,3] = clustertype[results[k,1]]
  results[k,4] = ifelse(results[k,2] <= cutdist,
                 results[k,3],2)
}

# table 1 is usual error rate
table1 = table(results[,3],classlabels2)
# table 2 is error rate with third category if the
# distance criteria of .1*sqrt(chromosome length) is not met
# - this was discussed in paper
table2 = table(results[,4],classlabels2)

if(dim(table2)[1] > 1){
outcomepurities = c(nfinal,
  (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]),
  (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]),
  (table2[1,1])/(table1[1,1]+table1[2,1]),(table2[1,1]+table2[2,1]),
  (table2[2,2])/(table1[1,2]+table1[2,2]),(table2[1,2]+table2[2,2]),
  length(evalpoints),best[1:4])
} else {
# this branch used if the rare case when all cancer and control cases were
# lumped into the same class
outcomepurities = c(nfinal,
  (table1[1,1])/(table1[1,1]+table1[2,1]),(table1[1,1]+table1[2,1]),
  (table1[2,2])/(table1[1,2]+table1[2,2]),(table1[1,2]+table1[2,2]),
  0,0,
  0,0,
  length(evalpoints),best[1:4])
}