# function finds the index of the wvec value nearest to a desired
# point.
lookup <-  function(point,data=wvec){
  lookup <-  which(abs(data - point) == min(abs(data - point)))
  lookup[1]
}


load(paste(datadir,"cancer.rda",sep=""))
load(paste(datadir,"control.rda",sep=""))
load(paste(datadir,"wvec.rda",sep=""))


# Euclidean distance
ndist = function(x,y){
  sqrt(sum((x-y)^2))
}

# The normalization used in the genetic algorithm
# this scales a chromosome's values between 0 and 1
nrmlz = function(rowvec){
  minrow = min(rowvec); maxrow = max(rowvec);
  if(maxrow > minrow){
    nrmlz = (rowvec-minrow)/(maxrow-minrow)
  } else {
    nrmlz = rep(.5,length(rowvec))
  }
  nrmlz
}

# Given a chromosome and everyone's not yet normalized values for that
# chromosome, this
# function returns 1) the centers of the clusters, 2) how many indivuals lie
# in that cluster, and 3) the cluster assignment for each spectrum
clusterfunc = function(mat){
#  matrix has number of columns given by length of chromosome
   dimmat = dim(mat)
#  normalization occurs here
   mat =  t(apply(mat,1,nrmlz))
   maxnumclusters = dimmat[1]; Nspace = dimmat[2];
   clustercenters = matrix(rep(0,Nspace*maxnumclusters),ncol=Nspace)
   n.in.clusters = rep(0,maxnumclusters)
   clustercenters[1,] = mat[1,]; n.in.clusters[1] = 1
   totclusters = 1
   dlimit = sqrt(Nspace)*.1
   parent.cluster = rep(0,maxnumclusters)
   parent.cluster[1] = 1
   for(j in 2:maxnumclusters){
     distances = rep(0,totclusters)
     for(k in 1: totclusters){      
       distances[k] = ndist(mat[j,],clustercenters[k,])
     }
     k = which(distances==min(distances))
     if(distances[k] < dlimit){
       # add to cluster k
       n.in.clusters[k] = n.in.clusters[k]+1
       clustercenters[k,] = (clustercenters[k,]*(n.in.clusters[k]-1)+
                             mat[j,])/n.in.clusters[k]
       parent.cluster[j] = k
     } else {
       # make new cluster and increment totclusters
       totclusters = totclusters+1
       clustercenters[totclusters,]=mat[j,]
       n.in.clusters[totclusters]=1
       parent.cluster[j] = totclusters
     }
   } 
   clusterfunc = list(clustercenters[1:totclusters,],
     n.in.clusters[1:totclusters],parent.cluster)
   names(clusterfunc) = c("centers","n","parent")
   return(clusterfunc)
}


# evaluate classification accuracy of a given chromosome
# rset already exists
# this function superceded by FORTRAN functions
fitnessfunc = function(evalpoints,classlabels){
  mat = rset1[,evalpoints]
  clusters = clusterfunc(mat)
  num.clusters = length(clusters$n)
  case.clusters = clusters[[3]]  

  cluster.purity = rep(0,num.clusters)
  for(i in 1:num.clusters){
    cluster.purity[i] = 1- min(
    sum(case.clusters == i & classlabels == "C")/clusters$n[i],
    sum(case.clusters == i & classlabels == "N")/clusters$n[i])
  }
  purity = weighted.mean(cluster.purity,clusters$n/sum(clusters$n))
  fitness = list(purity,num.clusters)
  names(fitness) = c("purity","num.clusters")
  return(fitness)
}

# superceded by FORTRAN functions
fitnessfunc2 = function(evalpoints,fitset,classlabels){
  mat = rset1[,evalpoints]
  mat2 = t(apply(fitset[,evalpoints],1,nrmlz))
  clusters = clusterfunc(mat)

  num.clusters = length(clusters$n)
  num.cases = dim(mat2)[1]
  case.clusters = rep(0,num.cases)
  distances = rep(0,num.clusters)
  closest.distance = rep(0,num.cases)

  cluster.totals = rep(0,num.clusters)
  cluster.cancers = rep(0,num.clusters)  
  for(i in 1:num.cases){
    for(k in 1:num.clusters){
      distances[k] = ndist(mat2[i,],clusters$centers[k,])
    }
    pickme = which(distances==min(distances))
    case.clusters[i] = pickme
    closest.distance[i] = distances[pickme]
    cluster.totals[pickme] = cluster.totals[pickme]+1
    if(classlabels[i] == "C"){ cluster.cancers[pickme] =
                               cluster.cancers[pickme]+1}
  }
  fitness2 = list(case.clusters,cluster.totals,cluster.cancers)
  names(fitness2) = c("clusters","totals","cancers")
  return(fitness2)
}

  
# changes markers within a chromosome with probability .0002
# in some documentation this probability should be .002
mutateme = function(cvec,pmut=.0002){
  lvec = length(cvec)
  changes = rbinom(n=lvec,size=1,prob=pmut)
  changehere = which(changes==1)
  numchanges = length(changehere)
  if(numchanges > 0){
    cvec[changehere] = sample(c(1:ngenes)[-cvec],numchanges)
  }
  cvec
}

# function that crosses-over two chromosomes
makekids = function(x,y,maxlen){
  xlen = length(x); ylen = length(y);
  crossx = sample(1:(xlen-1),1); crossy = sample(1:(ylen-1),1)
  newx = unique(c(x[1:crossx],y[(crossy+1):ylen]))
  newy = unique(c(y[1:crossy],x[(crossx+1):xlen]))
  newx = newx[1:min(length(newx),maxlen)]
  newy = newy[1:min(length(newy),maxlen)]
  makekids = list(newx,newy)
  names(makekids) = c("x","y")
  return(makekids)
}

# load the dll/so file
dyn.load(paste(dlldir,"fitnessfuncf",dottype,forext,sep=""))

# sets bottom of range for analysis
# usually this is 0 but is sometimes set to 1500 to
# look only at data above 1500 m/z

leftpoint = lookup(lookuppt)
if(wvec[leftpoint] < lookuppt) leftpoint = leftpoint + 1