# The directory where raw .csv files are held datadir = "/data/nealjeff/qstar/" # The location of the .dll or .so files dlldir = "/data/nealjeff/biowulf/" # for .so files forext = "so", for .dll it should equal "dll" # dottype should stay "." dottype = "."; forext = "so" # This creates binspots -- designates the binning to aggregate the # data as described on my website a = 0; b = 400/1000000 x0 = 700 xs = x0 while(x0 < 12000){ xdiff = a+b*x0 x0 = xdiff+x0 xs = c(xs,x0) } binspots = xs xlen = length(binspots) # This loads the library to execute the fortran subtourine dyn.load(paste(dlldir,"binspot",dottype,forext,sep="")) # Number of expected cancer files rangeis = 121 cancerfilelist = rep(0,rangeis) maxbins = xlen ucancer = matrix(rep(0,maxbins*rangeis),nrow=rangeis) ucancercount = ucancer j = 1 for(i in 601:780){ label = paste(datadir,"daf-0",i,".txt",sep="") xold = try(read.table(file=label,header=F),silent=TRUE) if(length(xold) > 1) { xold = as.matrix(cbind(xold[,1],xold[,2])) x = matrix(rep(0,2*xlen),ncol=2) xoldlen = dim(xold)[1] temp = .Fortran("binspotf",as.double(xold),as.double(binspots), as.double(x),as.integer(2),as.integer(xoldlen),as.integer(xlen)) # binspotf just counts up the sum of values within each bin # and the number of values within each bin xnew = matrix(temp[[3]],ncol=2) for(w in 1:xlen){ if(xnew[w,1] > 0) xnew[w,2] = xnew[w,2]/xnew[w,1] } ucancer[j,] = xnew[,2] ucancercount[j,] = xnew[,1] cancerfilelist[j] = i print(paste("count is ",j," ",date())); j = j + 1 } } # ucancer has average value within the bin range # ucancercount says how many raw values went into that bin # Parallel construction for the control cases rangeis = 95 controlfilelist = rep(0,rangeis) maxbins = xlen ucontrol = matrix(rep(0,maxbins*rangeis),nrow=rangeis) ucontrolcount = ucontrol j = 1 for(i in 180:281){ label = paste(datadir,"daf-0",i,".txt",sep="") xold = try(read.table(file=label,header=F),silent=TRUE) if(length(xold) > 1) { xold = as.matrix(cbind(xold[,1],xold[,2])) x = matrix(rep(0,2*xlen),ncol=2) xoldlen = dim(xold)[1] temp = .Fortran("binspotf",as.double(xold),as.double(binspots), as.double(x),as.integer(2),as.integer(xoldlen),as.integer(xlen)) xnew = matrix(temp[[3]],ncol=2) for(w in 1:xlen){ if(xnew[w,1] > 0) xnew[w,2] = xnew[w,2]/xnew[w,1] } ucontrol[j,] = xnew[,2] ucontrolcount[j,] = xnew[,1] controlfilelist[j] = i print(paste("count is ",j," ",date())); j = j + 1 } } # drop the first element as this corresponds # to values less than 700 - there are a few wvec = binspots[-1] ucancer = ucancer[,-1] ucontrol = ucontrol[,-1] ucancercount = ucancercount[,-1] ucontrolcount = ucontrolcount[,-1] save(ucancer,file=paste(datadir,"ucancer.rda",sep="")) save(ucontrol,file=paste(datadir,"ucontrol.rda",sep="")) save(ucancercount,file=paste(datadir,"ucancercount.rda",sep="")) save(ucontrolcount,file=paste(datadir,"ucontrolcount.rda",sep="")) save(wvec,file=paste(datadir,"wvec.rda",sep="")) load(paste(datadir,"ucancer.rda",sep="")) load(paste(datadir,"ucontrol.rda",sep="")) load(paste(datadir,"ucancercount.rda",sep="")) load(paste(datadir,"ucontrolcount.rda",sep="")) load(paste(datadir,"wvec.rda",sep="")) avval = mean(rbind(ucancer,ucontrol)) avnrmlz = function(vec,avval){ avnrmlz = vec*avval/mean(vec) } # Data are normalized to have same overall intensity cancer = t(apply(ucancer,1,avnrmlz,avval)) control = t(apply(ucontrol,1,avnrmlz,avval)) save(cancer,file=paste(datadir,"cancer.rda",sep="")) save(control,file=paste(datadir,"control.rda",sep=""))