rm(list=ls()) # directory holding relevant cancer.rda, control.rda, and wvec.rda datadir = "/data/nealjeff/ciphergen/" # size of cancer and control training sets csize = 81; nsize = 46 # location of the fitnessfuncf .dll or .so file dlldir = "/data/nealjeff/biowulf/" # should always be "." dottype = "." # should be "dll" if using Windows OS forext = "so" # starting point for data -- sometimes 1500 used # It is fine to use 0 here for the qstar data from 700 to 12000 lookuppt = 0 # location of readgafuncs.r gafuncdir = "/data/nealjeff/qstar/" # location of runga.r rungadir = "/data/nealjeff/qstar/" # a collection of functions that are necessary to run the genetic algorithm source(paste(gafuncdir,"readgafuncs.r",sep="")) # the values on NCI-FDA website that segregate ciphergen data perfectly targets = c(435.46452, 465.56916, 2760.6685, 3497.5508, 6631.7043, 14051.976, 19643.409) tpoints = as.numeric(lapply(targets,lookup,data=wvec)) cbind(targets,wvec[tpoints]) set.seed(1) listcancer1 = sample(dim(cancer)[1],csize) listcontrol1 = sample(dim(control)[1],nsize) listcancer2 = c(1:dim(cancer)[1])[-listcancer1] listcontrol2 = c(1:dim(control)[1])[-listcontrol1] cancer1 = cancer[listcancer1,]; control1 = control[listcontrol1,] cancer2 = cancer[-listcancer1,]; control2 = control[-listcontrol1,] lc1 = length(listcancer1); lc2 = length(listcancer2); ln1 = length(listcontrol1); ln2 = length(listcontrol2); N1 = lc1+ln1 rorder = sample(1:N1) #rorder = 1:N1 rset1 = rbind(cancer1,control1)[rorder,] rset2 = rbind(cancer2,control2) backorder = (1:N1)[order(rorder)] dimrset1 = dim(rset1) classlabels = c(rep("C",lc1),rep("N",ln1))[rorder] class01s = as.integer(ifelse(classlabels == "C",0,1)) classlabels2 = c(rep("C",lc2),rep("N",ln2)) maxlen=20 evalpoints = tpoints mat = t(apply(rset1[,evalpoints],1,nrmlz)) dimmat = dim(mat); i = dimmat[1]; j = dimmat[2] mat0 = matrix(rep(0,i*j),ncol=j) temp = .Fortran("fitnessfuncf",as.double(mat),as.integer(i), as.integer(j),as.double(mat0),as.integer(rep(0,i)),as.integer(0), as.integer(rep(0,i)),class01s, as.integer(rep(0,i)), as.double(0), as.double(rep(0,i)), as.integer(maxlen)) temp = list(matrix(temp[[1]],ncol=j),temp[[2]],temp[[3]],matrix(temp[[4]],ncol=j)[1:temp[[6]],],temp[[5]],temp[[6]],temp[[7]][1:temp[[6]]], temp[[8]],temp[[9]][1:temp[[6]]],temp[[10]],temp[[11]][1:temp[[6]]]) names(temp) = c("mat","i","j","centers","parent","totclusters","ninclusters","class01s","n0inclusters","purity","purities") clustertype = ifelse(temp$n0inclusters/temp$ninclusters >= .5,0,1) mat = t(apply(rset2[,evalpoints],1,nrmlz)) results = matrix(rep(0,dim(mat)[1]*4),ncol=4) cutdist = .1*sqrt(length(evalpoints)) for(k in 1:dim(mat)[1]){ distlist = apply(temp$centers,1,ndist,y=mat[k,]) results[k,1] = which(distlist==min(distlist)) results[k,2] = distlist[results[k,1]] results[k,3] = clustertype[results[k,1]] results[k,4] = ifelse(results[k,2] <= cutdist, results[k,3],2) } table1 = table(results[,3],classlabels2) # table2 uses the second definition of cluster assignment, i.e. # if no cluster is within .1*sqrt(length(chromosome)) then # the case is assigned 'new' or 'unknown' status -- results # in lower accuracy table2 = table(results[,4],classlabels2) postscript("figure1.eps",onefile=FALSE,horizontal=FALSE,height=6,width=6) matplot(1:7,t(mat),type="l",col=(ifelse(classlabels2=="C","black","grey")), lty=(ifelse(classlabels2=="C",1,1)),xlab="Marker", ylab="Relative Intensity") legend(x=4,y=.6,legend=c("Cancer ","Control "),lty=c(1,1),col=c("black","grey")) dev.off() # This produces the sequence 1,0,0,0,0,0,0,0,5,1 # when run right after that which is above for(z in 1:10){ listcancer1 = sample(dim(cancer)[1],csize) listcontrol1 = sample(dim(control)[1],nsize) listcancer2 = c(1:dim(cancer)[1])[-listcancer1] listcontrol2 = c(1:dim(control)[1])[-listcontrol1] cancer1 = cancer[listcancer1,]; control1 = control[listcontrol1,] cancer2 = cancer[-listcancer1,]; control2 = control[-listcontrol1,] lc1 = length(listcancer1); lc2 = length(listcancer2); ln1 = length(listcontrol1); ln2 = length(listcontrol2); N1 = lc1+ln1 rorder = sample(1:N1) #rorder = 1:N1 rset1 = rbind(cancer1,control1)[rorder,] rset2 = rbind(cancer2,control2) backorder = (1:N1)[order(rorder)] dimrset1 = dim(rset1) classlabels = c(rep("C",lc1),rep("N",ln1))[rorder] class01s = as.integer(ifelse(classlabels == "C",0,1)) classlabels2 = c(rep("C",lc2),rep("N",ln2)) maxlen=20 evalpoints = tpoints mat = t(apply(rset1[,evalpoints],1,nrmlz)) dimmat = dim(mat); i = dimmat[1]; j = dimmat[2] mat0 = matrix(rep(0,i*j),ncol=j) temp = .Fortran("fitnessfuncf",as.double(mat),as.integer(i), as.integer(j),as.double(mat0),as.integer(rep(0,i)),as.integer(0), as.integer(rep(0,i)),class01s, as.integer(rep(0,i)), as.double(0), as.double(rep(0,i)), as.integer(maxlen)) temp = list(matrix(temp[[1]],ncol=j),temp[[2]],temp[[3]],matrix(temp[[4]],ncol=j)[1:temp[[6]],],temp[[5]],temp[[6]],temp[[7]][1:temp[[6]]], temp[[8]],temp[[9]][1:temp[[6]]],temp[[10]],temp[[11]][1:temp[[6]]]) names(temp) = c("mat","i","j","centers","parent","totclusters","ninclusters","class01s","n0inclusters","purity","purities") clustertype = ifelse(temp$n0inclusters/temp$ninclusters >= .5,0,1) matplot(1:7,t(mat),type="l",col=(class01s+2)) mat = t(apply(rset2[,evalpoints],1,nrmlz)) results = matrix(rep(0,dim(mat)[1]*4),ncol=4) cutdist = .1*sqrt(length(evalpoints)) for(k in 1:dim(mat)[1]){ distlist = apply(temp$centers,1,ndist,y=mat[k,]) results[k,1] = which(distlist==min(distlist)) results[k,2] = distlist[results[k,1]] results[k,3] = clustertype[results[k,1]] results[k,4] = ifelse(results[k,2] <= cutdist, results[k,3],2) } table1 = table(results[,3],classlabels2) table2 = table(results[,4],classlabels2) print(table1) } # End of section examining 10 consecutive fittings of # the 7 marker chromosome # a function to produce t-tests ttestp = function(y,labels,label1,label2){ ttestp = t.test(y[labels == label1], y[labels == label2])$p.value ttestp } # Run the following section of code for z = 1 through 10 # to produce table 1 z = 10 set.seed(1) # this ensures the same ordering and training/test split is # used each time listcancer1 = sample(dim(cancer)[1],csize) listcontrol1 = sample(dim(control)[1],nsize) listcancer2 = c(1:dim(cancer)[1])[-listcancer1] listcontrol2 = c(1:dim(control)[1])[-listcontrol1] cancer1 = cancer[listcancer1,]; control1 = control[listcontrol1,] cancer2 = cancer[-listcancer1,]; control2 = control[-listcontrol1,] lc1 = length(listcancer1); lc2 = length(listcancer2); ln1 = length(listcontrol1); ln2 = length(listcontrol2); N1 = lc1+ln1 rorder = sample(1:N1) #rorder mixes the order in a random way rset1 = rbind(cancer1,control1)[rorder,] rset2 = rbind(cancer2,control2) # how to get back to the original order backorder = (1:N1)[order(rorder)] dimrset1 = dim(rset1) classlabels = c(rep("C",lc1),rep("N",ln1))[rorder] class01s = as.integer(ifelse(classlabels == "C",0,1)) classlabels2 = c(rep("C",lc2),rep("N",ln2)) bset1 = rset1 bset2 = rset2 ngens = 250; penalty1 = 0; penalty2 = 0 dimbset1 = dim(bset1) set.seed(z) # runga.r essentially runs the genetic algorithm source(paste(rungadir,"runga.r",sep="")) print(table1) ttestps = apply(mat,2,ttestp,labels=classlabels2,label1="C",label2="N") evalpoints2 = evalpoints[!is.na(ttestps)] ttestps2 = ttestps[!is.na(ttestps)] print(wvec[evalpoints2[which(ttestps2 == min(ttestps2))]]) print(which(ttestps2 == min(ttestps2))) matplot(1:length(evalpoints),t(mat),type="l",col=(ifelse(classlabels2=="C","black","grey")), lty=(ifelse(classlabels2=="C",1,1)),xlab="Marker",ylab="Relative Intensity") legend(x=6.75,y=.8,legend=c("Cancer ","Control "),lty=c(1,1),col=c("black","grey")) # Here ends the section for table 1 # Figure 2 corresponds to z=1 in the code above postscript("figure2.eps",onefile=FALSE,horizontal=FALSE,height=6,width=6) matplot(1:length(evalpoints),t(mat),type="l",col=(ifelse(classlabels2=="C","black","grey")), lty=(ifelse(classlabels2=="C",1,1)),xlab="Marker",ylab="Relative Intensity") legend(x=6.75,y=.8,legend=c("Cancer ","Control "),lty=c(1,1),col=c("black","grey")) dev.off()