# function finds the index of the wvec value nearest to a desired # point. lookup <- function(point,data=wvec){ lookup <- which(abs(data - point) == min(abs(data - point))) lookup[1] } load(paste(datadir,"cancer.rda",sep="")) load(paste(datadir,"control.rda",sep="")) load(paste(datadir,"wvec.rda",sep="")) # Euclidean distance ndist = function(x,y){ sqrt(sum((x-y)^2)) } # The normalization used in the genetic algorithm # this scales a chromosome's values between 0 and 1 nrmlz = function(rowvec){ minrow = min(rowvec); maxrow = max(rowvec); if(maxrow > minrow){ nrmlz = (rowvec-minrow)/(maxrow-minrow) } else { nrmlz = rep(.5,length(rowvec)) } nrmlz } # Given a chromosome and everyone's not yet normalized values for that # chromosome, this # function returns 1) the centers of the clusters, 2) how many indivuals lie # in that cluster, and 3) the cluster assignment for each spectrum clusterfunc = function(mat){ # matrix has number of columns given by length of chromosome dimmat = dim(mat) # normalization occurs here mat = t(apply(mat,1,nrmlz)) maxnumclusters = dimmat[1]; Nspace = dimmat[2]; clustercenters = matrix(rep(0,Nspace*maxnumclusters),ncol=Nspace) n.in.clusters = rep(0,maxnumclusters) clustercenters[1,] = mat[1,]; n.in.clusters[1] = 1 totclusters = 1 dlimit = sqrt(Nspace)*.1 parent.cluster = rep(0,maxnumclusters) parent.cluster[1] = 1 for(j in 2:maxnumclusters){ distances = rep(0,totclusters) for(k in 1: totclusters){ distances[k] = ndist(mat[j,],clustercenters[k,]) } k = which(distances==min(distances)) if(distances[k] < dlimit){ # add to cluster k n.in.clusters[k] = n.in.clusters[k]+1 clustercenters[k,] = (clustercenters[k,]*(n.in.clusters[k]-1)+ mat[j,])/n.in.clusters[k] parent.cluster[j] = k } else { # make new cluster and increment totclusters totclusters = totclusters+1 clustercenters[totclusters,]=mat[j,] n.in.clusters[totclusters]=1 parent.cluster[j] = totclusters } } clusterfunc = list(clustercenters[1:totclusters,], n.in.clusters[1:totclusters],parent.cluster) names(clusterfunc) = c("centers","n","parent") return(clusterfunc) } # evaluate classification accuracy of a given chromosome # rset already exists # this function superceded by FORTRAN functions fitnessfunc = function(evalpoints,classlabels){ mat = rset1[,evalpoints] clusters = clusterfunc(mat) num.clusters = length(clusters$n) case.clusters = clusters[[3]] cluster.purity = rep(0,num.clusters) for(i in 1:num.clusters){ cluster.purity[i] = 1- min( sum(case.clusters == i & classlabels == "C")/clusters$n[i], sum(case.clusters == i & classlabels == "N")/clusters$n[i]) } purity = weighted.mean(cluster.purity,clusters$n/sum(clusters$n)) fitness = list(purity,num.clusters) names(fitness) = c("purity","num.clusters") return(fitness) } # superceded by FORTRAN functions fitnessfunc2 = function(evalpoints,fitset,classlabels){ mat = rset1[,evalpoints] mat2 = t(apply(fitset[,evalpoints],1,nrmlz)) clusters = clusterfunc(mat) num.clusters = length(clusters$n) num.cases = dim(mat2)[1] case.clusters = rep(0,num.cases) distances = rep(0,num.clusters) closest.distance = rep(0,num.cases) cluster.totals = rep(0,num.clusters) cluster.cancers = rep(0,num.clusters) for(i in 1:num.cases){ for(k in 1:num.clusters){ distances[k] = ndist(mat2[i,],clusters$centers[k,]) } pickme = which(distances==min(distances)) case.clusters[i] = pickme closest.distance[i] = distances[pickme] cluster.totals[pickme] = cluster.totals[pickme]+1 if(classlabels[i] == "C"){ cluster.cancers[pickme] = cluster.cancers[pickme]+1} } fitness2 = list(case.clusters,cluster.totals,cluster.cancers) names(fitness2) = c("clusters","totals","cancers") return(fitness2) } # changes markers within a chromosome with probability .0002 # in some documentation this probability should be .002 mutateme = function(cvec,pmut=.0002){ lvec = length(cvec) changes = rbinom(n=lvec,size=1,prob=pmut) changehere = which(changes==1) numchanges = length(changehere) if(numchanges > 0){ cvec[changehere] = sample(c(1:ngenes)[-cvec],numchanges) } cvec } # function that crosses-over two chromosomes makekids = function(x,y,maxlen){ xlen = length(x); ylen = length(y); crossx = sample(1:(xlen-1),1); crossy = sample(1:(ylen-1),1) newx = unique(c(x[1:crossx],y[(crossy+1):ylen])) newy = unique(c(y[1:crossy],x[(crossx+1):xlen])) newx = newx[1:min(length(newx),maxlen)] newy = newy[1:min(length(newy),maxlen)] makekids = list(newx,newy) names(makekids) = c("x","y") return(makekids) } # load the dll/so file dyn.load(paste(dlldir,"fitnessfuncf",dottype,forext,sep="")) # sets bottom of range for analysis # usually this is 0 but is sometimes set to 1500 to # look only at data above 1500 m/z leftpoint = lookup(lookuppt) if(wvec[leftpoint] < lookuppt) leftpoint = leftpoint + 1