rm(list=ls())
# directory holding relevant cancer.rda, control.rda, and wvec.rda
datadir = "/data/nealjeff/ciphergen/"
# size of cancer and control training sets
csize = 81; nsize = 46
# location of the fitnessfuncf .dll or .so file
dlldir = "/data/nealjeff/biowulf/"
# should always be "."
dottype = "."
# should be "dll" if using Windows OS
forext = "so"
# starting point for data -- sometimes 1500 used
# It is fine to use 0 here for the qstar data from 700 to 12000
lookuppt = 0
# location of readgafuncs.r
gafuncdir = "/data/nealjeff/qstar/"
# location of runga.r
rungadir = "/data/nealjeff/qstar/"

# a collection of functions that are necessary to run the genetic algorithm
source(paste(gafuncdir,"readgafuncs.r",sep=""))


# the values on NCI-FDA website that segregate ciphergen data perfectly
targets = c(435.46452, 465.56916, 2760.6685, 3497.5508, 6631.7043, 
14051.976, 19643.409)


tpoints = as.numeric(lapply(targets,lookup,data=wvec))
cbind(targets,wvec[tpoints])


set.seed(1)
listcancer1 = sample(dim(cancer)[1],csize)
listcontrol1 = sample(dim(control)[1],nsize)
listcancer2 = c(1:dim(cancer)[1])[-listcancer1]
listcontrol2 = c(1:dim(control)[1])[-listcontrol1]

cancer1 = cancer[listcancer1,]; control1 = control[listcontrol1,]
cancer2 = cancer[-listcancer1,]; control2 = control[-listcontrol1,]


lc1 = length(listcancer1); lc2 = length(listcancer2);
ln1 = length(listcontrol1); ln2 = length(listcontrol2);
N1 = lc1+ln1

rorder = sample(1:N1)
#rorder = 1:N1
rset1 = rbind(cancer1,control1)[rorder,]
rset2 = rbind(cancer2,control2)
backorder = (1:N1)[order(rorder)]
dimrset1 = dim(rset1)

classlabels = c(rep("C",lc1),rep("N",ln1))[rorder]
class01s = as.integer(ifelse(classlabels == "C",0,1))
classlabels2 = c(rep("C",lc2),rep("N",ln2))

maxlen=20
evalpoints = tpoints
mat = t(apply(rset1[,evalpoints],1,nrmlz))
dimmat = dim(mat); i = dimmat[1]; j = dimmat[2]
mat0 = matrix(rep(0,i*j),ncol=j)
temp = .Fortran("fitnessfuncf",as.double(mat),as.integer(i),
  as.integer(j),as.double(mat0),as.integer(rep(0,i)),as.integer(0),  as.integer(rep(0,i)),class01s,  as.integer(rep(0,i)), as.double(0), as.double(rep(0,i)),
as.integer(maxlen))
  temp = list(matrix(temp[[1]],ncol=j),temp[[2]],temp[[3]],matrix(temp[[4]],ncol=j)[1:temp[[6]],],temp[[5]],temp[[6]],temp[[7]][1:temp[[6]]],
    temp[[8]],temp[[9]][1:temp[[6]]],temp[[10]],temp[[11]][1:temp[[6]]])
  names(temp) = c("mat","i","j","centers","parent","totclusters","ninclusters","class01s","n0inclusters","purity","purities")
clustertype = ifelse(temp$n0inclusters/temp$ninclusters >= .5,0,1)


mat = t(apply(rset2[,evalpoints],1,nrmlz))
results = matrix(rep(0,dim(mat)[1]*4),ncol=4)
cutdist = .1*sqrt(length(evalpoints))
for(k in 1:dim(mat)[1]){
  distlist = apply(temp$centers,1,ndist,y=mat[k,])
  results[k,1] = which(distlist==min(distlist))
  results[k,2] = distlist[results[k,1]]
  results[k,3] = clustertype[results[k,1]]
  results[k,4] = ifelse(results[k,2] <= cutdist,
                 results[k,3],2)
}

table1 = table(results[,3],classlabels2)
# table2 uses the second definition of cluster assignment, i.e.
# if no cluster is within .1*sqrt(length(chromosome)) then
# the case is assigned 'new' or 'unknown' status -- results
# in lower accuracy
table2 = table(results[,4],classlabels2)

postscript("figure1.eps",onefile=FALSE,horizontal=FALSE,height=6,width=6)
matplot(1:7,t(mat),type="l",col=(ifelse(classlabels2=="C","black","grey")),
lty=(ifelse(classlabels2=="C",1,1)),xlab="Marker",
ylab="Relative Intensity")
legend(x=4,y=.6,legend=c("Cancer     ","Control     "),lty=c(1,1),col=c("black","grey"))
dev.off()


# This produces the sequence 1,0,0,0,0,0,0,0,5,1
# when run right after that which is above
for(z in 1:10){
listcancer1 = sample(dim(cancer)[1],csize)
listcontrol1 = sample(dim(control)[1],nsize)
listcancer2 = c(1:dim(cancer)[1])[-listcancer1]
listcontrol2 = c(1:dim(control)[1])[-listcontrol1]

cancer1 = cancer[listcancer1,]; control1 = control[listcontrol1,]
cancer2 = cancer[-listcancer1,]; control2 = control[-listcontrol1,]


lc1 = length(listcancer1); lc2 = length(listcancer2);
ln1 = length(listcontrol1); ln2 = length(listcontrol2);
N1 = lc1+ln1

rorder = sample(1:N1)
#rorder = 1:N1
rset1 = rbind(cancer1,control1)[rorder,]
rset2 = rbind(cancer2,control2)
backorder = (1:N1)[order(rorder)]
dimrset1 = dim(rset1)

classlabels = c(rep("C",lc1),rep("N",ln1))[rorder]
class01s = as.integer(ifelse(classlabels == "C",0,1))
classlabels2 = c(rep("C",lc2),rep("N",ln2))

maxlen=20
evalpoints = tpoints
mat = t(apply(rset1[,evalpoints],1,nrmlz))
dimmat = dim(mat); i = dimmat[1]; j = dimmat[2]
mat0 = matrix(rep(0,i*j),ncol=j)
temp = .Fortran("fitnessfuncf",as.double(mat),as.integer(i),
  as.integer(j),as.double(mat0),as.integer(rep(0,i)),as.integer(0),  as.integer(rep(0,i)),class01s,  as.integer(rep(0,i)), as.double(0), as.double(rep(0,i)),
as.integer(maxlen))
  temp = list(matrix(temp[[1]],ncol=j),temp[[2]],temp[[3]],matrix(temp[[4]],ncol=j)[1:temp[[6]],],temp[[5]],temp[[6]],temp[[7]][1:temp[[6]]],
    temp[[8]],temp[[9]][1:temp[[6]]],temp[[10]],temp[[11]][1:temp[[6]]])
  names(temp) = c("mat","i","j","centers","parent","totclusters","ninclusters","class01s","n0inclusters","purity","purities")
clustertype = ifelse(temp$n0inclusters/temp$ninclusters >= .5,0,1)

matplot(1:7,t(mat),type="l",col=(class01s+2))


mat = t(apply(rset2[,evalpoints],1,nrmlz))
results = matrix(rep(0,dim(mat)[1]*4),ncol=4)
cutdist = .1*sqrt(length(evalpoints))
for(k in 1:dim(mat)[1]){
  distlist = apply(temp$centers,1,ndist,y=mat[k,])
  results[k,1] = which(distlist==min(distlist))
  results[k,2] = distlist[results[k,1]]
  results[k,3] = clustertype[results[k,1]]
  results[k,4] = ifelse(results[k,2] <= cutdist,
                 results[k,3],2)
}

table1 = table(results[,3],classlabels2)
table2 = table(results[,4],classlabels2)
print(table1)
}
# End of section examining 10 consecutive fittings of
# the 7 marker chromosome

# a function to produce t-tests
ttestp = function(y,labels,label1,label2){
  ttestp = t.test(y[labels == label1],
                     y[labels == label2])$p.value
  ttestp
}


# Run the following section of code for z = 1 through 10
# to produce table 1

z = 10

set.seed(1)
# this ensures the same ordering and training/test split is
# used each time
listcancer1 = sample(dim(cancer)[1],csize)
listcontrol1 = sample(dim(control)[1],nsize)
listcancer2 = c(1:dim(cancer)[1])[-listcancer1]
listcontrol2 = c(1:dim(control)[1])[-listcontrol1]

cancer1 = cancer[listcancer1,]; control1 = control[listcontrol1,]
cancer2 = cancer[-listcancer1,]; control2 = control[-listcontrol1,]


lc1 = length(listcancer1); lc2 = length(listcancer2);
ln1 = length(listcontrol1); ln2 = length(listcontrol2);
N1 = lc1+ln1

rorder = sample(1:N1)
#rorder mixes the order in a random way
rset1 = rbind(cancer1,control1)[rorder,]
rset2 = rbind(cancer2,control2)
# how to get back to the original order
backorder = (1:N1)[order(rorder)]
dimrset1 = dim(rset1)

classlabels = c(rep("C",lc1),rep("N",ln1))[rorder]
class01s = as.integer(ifelse(classlabels == "C",0,1))
classlabels2 = c(rep("C",lc2),rep("N",ln2))


bset1 = rset1
bset2 = rset2
ngens = 250; penalty1 = 0; penalty2 = 0
dimbset1 = dim(bset1)
set.seed(z)
# runga.r essentially runs the genetic algorithm
source(paste(rungadir,"runga.r",sep=""))
print(table1)
ttestps = apply(mat,2,ttestp,labels=classlabels2,label1="C",label2="N")
evalpoints2 = evalpoints[!is.na(ttestps)]
ttestps2 = ttestps[!is.na(ttestps)]


print(wvec[evalpoints2[which(ttestps2 == min(ttestps2))]])
print(which(ttestps2 == min(ttestps2)))


matplot(1:length(evalpoints),t(mat),type="l",col=(ifelse(classlabels2=="C","black","grey")),
lty=(ifelse(classlabels2=="C",1,1)),xlab="Marker",ylab="Relative Intensity")
legend(x=6.75,y=.8,legend=c("Cancer     ","Control     "),lty=c(1,1),col=c("black","grey"))

# Here ends the section for table 1

# Figure 2 corresponds to z=1 in the code above
postscript("figure2.eps",onefile=FALSE,horizontal=FALSE,height=6,width=6)

matplot(1:length(evalpoints),t(mat),type="l",col=(ifelse(classlabels2=="C","black","grey")),
lty=(ifelse(classlabels2=="C",1,1)),xlab="Marker",ylab="Relative Intensity")
legend(x=6.75,y=.8,legend=c("Cancer     ","Control     "),lty=c(1,1),col=c("black","grey"))

dev.off()