#------------------------------------------------------------------------------- # Written by: Kory R Johnson # Email: johnsonko@mail.nih.gov # R version: R x64 3.3.1 #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- # CODE START #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- # Environment Declarations #------------------------------------------------------------------------------- rm(list=ls()) options(object.size=Inf) memory.size(max = TRUE) #------------------------------------------------------------------------------- # Function Declarations #------------------------------------------------------------------------------- source("http://bioconductor.org/biocLite.R") biocLite() install.packages("polycor") library(polycor) install.packages("gplots") library(gplots) install.packages("gmodels") library(gmodels) install.packages("class") library(class) library(car) library(stats) library(rgl) image.scale <- function (z, col, x, y = NULL, size = NULL, digits = 2, labels = c("breaks", "ranges")) { # sort out the location n <- length(col) usr <- par("usr") mx <- mean(usr[1:2]); my <- mean(usr[3:4]) dx <- diff(usr[1:2]); dy <- diff(usr[3:4]) if (missing(x)) x <- mx + 1.05*dx/2 # default x to right of image else if (is.list(x)) { if (length(x$x) == 2) size <- c(diff(x$x), -diff(x$y)/n) y <- x$y[1] x <- x$x[1] } else x <- x[1] if (is.null(size)) if (is.null(y)) { size <- 0.618*dy/n # default size, golden ratio y <- my + 0.618*dy/2 # default y to give centred scale } else size <- (y-my)*2/n if (length(size)==1) size <- rep(size, 2) # default square boxes if (is.null(y)) y <- my + n*size[2]/2 # draw the image scale i <- seq(along = col) rect(x, y - i * size[2], x + size[1], y - (i - 1) * size[2], col = rev(col), xpd = TRUE) # sort out the labels rng <- range(z, na.rm = TRUE) bks <- seq(from = rng[2], to = rng[1], length = n + 1) bks <- formatC(bks, format="f", digits=digits) labels <- match.arg(labels) if (labels == "breaks") ypts <- y - c(0, i) * size[2] else { bks <- paste(bks[-1], bks[-(n+1)], sep = " - ") ypts <- y - (i - 0.5) * size[2] } text(x = x + 1.2 * size[1], y = ypts, labels = bks, adj = ifelse(size[1]>0, 0, 1), xpd = TRUE) } #------------------------------------------------------------------------------- # Import Data #------------------------------------------------------------------------------- working.data <- read.table("head_sample_data.txt",row.names=1,header=TRUE) sample.info <- read.table("head_sample_info.txt",row.names=1,header=TRUE) working.data <- working.data[,dimnames(sample.info)[[1]]] #------------------------------------------------------------------------------- # Generate Box Plot #------------------------------------------------------------------------------- dev.new() palette(rainbow(20)) WorkingList <- vector("list", dim(working.data)[2]) for ( i in 1:dim(working.data)[2]) { WorkingList[[i]] <- as.numeric(working.data[,i]) } col.2.use <- as.numeric(unlist(sample.info$Color)) boxplot(WorkingList,col=col.2.use) #------------------------------------------------------------------------------- # Generate PCA #------------------------------------------------------------------------------- dev.new() par(mfrow=c(1,2)) col.2.use <- as.numeric(unlist(sample.info$Color)) shape.2.use <- as.numeric(unlist(sample.info$Shape)) pca.results <- princomp(working.data,cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,1],pca.loadings[,2],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) x.range <- range(pca.loadings[,2]) y.range <- range(pca.loadings[,3]) xlab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") ylab <- paste(c("Principal Component 3 (",variancePer[3],"%)"),collapse="") plot(pca.loadings[,2],pca.loadings[,3],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,2],pca.loadings[,3],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) #------------------------------------------------------------------------------- # Generate Heat Map #------------------------------------------------------------------------------- dev.new() cdat <- working.data cdat <- cor(cdat) par("mar" = c(5, 4, 4, 10) + 0.1) # wide righthand margin greys <- rgb(r=(3:0)/3, g=(3:0)/3, b=(3:0)/3, names=paste("greys",3:0,sep=".")) image(cdat,col=greys) image.scale(as.numeric(cdat),col=greys,digits = 3) box() #------------------------------------------------------------------------------- # Remove Outlier #------------------------------------------------------------------------------- sample.info <- sample.info[-c(3),] working.data <- working.data[,-c(3)] #------------------------------------------------------------------------------- # Repeat Generate Box Plot #------------------------------------------------------------------------------- dev.new() palette(rainbow(20)) WorkingList <- vector("list", dim(working.data)[2]) for ( i in 1:dim(working.data)[2]) { WorkingList[[i]] <- as.numeric(working.data[,i]) } col.2.use <- as.numeric(unlist(sample.info$Color)) boxplot(WorkingList,col=col.2.use) #------------------------------------------------------------------------------- # Repeat Generate PCA #------------------------------------------------------------------------------- dev.new() par(mfrow=c(1,2)) col.2.use <- as.numeric(unlist(sample.info$Color)) shape.2.use <- as.numeric(unlist(sample.info$Shape)) pca.results <- princomp(working.data,cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,1],pca.loadings[,2],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) x.range <- range(pca.loadings[,2]) y.range <- range(pca.loadings[,3]) xlab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") ylab <- paste(c("Principal Component 3 (",variancePer[3],"%)"),collapse="") plot(pca.loadings[,2],pca.loadings[,3],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,2],pca.loadings[,3],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) #------------------------------------------------------------------------------- # Repeat Generate Heat Map #------------------------------------------------------------------------------- dev.new() cdat <- working.data cdat <- cor(cdat) par("mar" = c(5, 4, 4, 10) + 0.1) # wide righthand margin greys <- rgb(r=(3:0)/3, g=(3:0)/3, b=(3:0)/3, names=paste("greys",3:0,sep=".")) image(cdat,col=greys) image.scale(as.numeric(cdat),col=greys,digits = 3) box() #------------------------------------------------------------------------------- # Perform Baseline Subtraction #------------------------------------------------------------------------------- dev.new() par(mfrow=c(1,2)) baseline.sample <- working.data[,sample.info$Baseline_Sample=="Y"] names(baseline.sample) <- dimnames(working.data)[[1]] old.wt.mean <- apply(working.data[,sample.info$Group==2],1,mean) plot(old.wt.mean,baseline.sample) diff.old.new <- baseline.sample-old.wt.mean old.data <- working.data[,sample.info$Data_Source=="OLD"] new.data <- working.data[,sample.info$Data_Source=="NEW"] updated.new.data <- NULL for (i in 1:dim(new.data)[2]) { print(i) temp <- as.numeric(new.data[,i])-as.numeric(diff.old.new) updated.new.data <- cbind(updated.new.data,temp) } dimnames(updated.new.data)[[1]] <- dimnames(new.data)[[1]] dimnames(updated.new.data)[[2]] <- dimnames(new.data)[[2]] updated.working.data <- cbind(old.data,updated.new.data) plot(old.wt.mean,updated.new.data[,1]) #------------------------------------------------------------------------------- # Repeat Box Plot #------------------------------------------------------------------------------- dev.new() palette(rainbow(20)) WorkingList <- vector("list", dim(updated.working.data)[2]) for ( i in 1:dim(updated.working.data)[2]) { WorkingList[[i]] <- as.numeric(updated.working.data[,i]) } col.2.use <- as.numeric(unlist(sample.info$Color)) boxplot(WorkingList,col=col.2.use) #------------------------------------------------------------------------------- # Repeat PCA #------------------------------------------------------------------------------- dev.new() par(mfrow=c(1,2)) col.2.use <- as.numeric(unlist(sample.info$Color)) shape.2.use <- as.numeric(unlist(sample.info$Shape)) pca.results <- princomp(updated.working.data,cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,1],pca.loadings[,2],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) x.range <- range(pca.loadings[,2]) y.range <- range(pca.loadings[,3]) xlab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") ylab <- paste(c("Principal Component 3 (",variancePer[3],"%)"),collapse="") plot(pca.loadings[,2],pca.loadings[,3],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,2],pca.loadings[,3],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) #------------------------------------------------------------------------------- # Repeat Generate Heat Map #------------------------------------------------------------------------------- dev.new() cdat <- updated.working.data cdat <- cor(cdat) par("mar" = c(5, 4, 4, 10) + 0.1) # wide righthand margin greys <- rgb(r=(3:0)/3, g=(3:0)/3, b=(3:0)/3, names=paste("greys",3:0,sep=".")) image(cdat,col=greys) image.scale(as.numeric(cdat),col=greys,digits = 3) box() #------------------------------------------------------------------------------- # Perform Mean Subtraction #------------------------------------------------------------------------------- dev.new() par(mfrow=c(1,2)) new.wt.mean <- apply(updated.working.data[,sample.info$Group==8],1,mean) old.wt.mean <- apply(updated.working.data[,sample.info$Group==2],1,mean) plot(old.wt.mean,new.wt.mean) diff.old.new <- new.wt.mean-old.wt.mean old.data <- updated.working.data[,sample.info$Data_Source=="OLD"] new.data <- updated.working.data[,sample.info$Data_Source=="NEW"] updated.new.data <- NULL for (i in 1:dim(new.data)[2]) { print(i) temp <- as.numeric(new.data[,i])-as.numeric(diff.old.new) updated.new.data <- cbind(updated.new.data,temp) } dimnames(updated.new.data)[[1]] <- dimnames(new.data)[[1]] dimnames(updated.new.data)[[2]] <- dimnames(new.data)[[2]] Final.working.data <- cbind(old.data,updated.new.data) plot(old.wt.mean,apply(Final.working.data[,sample.info$Group==8],1,mean)) #------------------------------------------------------------------------------- # Repeat Box Plot #------------------------------------------------------------------------------- dev.new() palette(rainbow(20)) WorkingList <- vector("list", dim(Final.working.data)[2]) for ( i in 1:dim(Final.working.data)[2]) { WorkingList[[i]] <- as.numeric(Final.working.data[,i]) } col.2.use <- as.numeric(unlist(sample.info$Color)) boxplot(WorkingList,col=col.2.use) #------------------------------------------------------------------------------- # Quantile Polish #------------------------------------------------------------------------------- dev.new() working.data <- Final.working.data plot(sort(working.data[,1]),type="l",ylab="Expression") running.sort <- NULL for (i in 1:dim(working.data)[2]) { temp <- sort(working.data[,i]) running.sort <- cbind(running.sort,temp) lines(temp,col=col.2.use[i]) } running.median <- apply(running.sort,1,median) running.normalized <- NULL for (i in 1:dim(working.data)[2]) { temp <- working.data[,i] temp <- sort(temp,index.return = TRUE) temp <- temp$ix names(temp) <- as.numeric(unlist(running.median)) temp <- sort(temp) temp <- as.numeric(unlist(names(temp))) running.normalized <- cbind(running.normalized,temp) rm(temp) } dimnames(running.normalized) <- dimnames(working.data) write.table(running.normalized,"quanitle_polished_sample_data.txt",sep="\t") #------------------------------------------------------------------------------- # Repeat Box Plot #------------------------------------------------------------------------------- dev.new() palette(rainbow(20)) WorkingList <- vector("list", dim(running.normalized)[2]) for ( i in 1:dim(running.normalized)[2]) { WorkingList[[i]] <- as.numeric(running.normalized[,i]) } col.2.use <- as.numeric(unlist(sample.info$Color)) boxplot(WorkingList,col=col.2.use) #------------------------------------------------------------------------------- # Repeat PCA #------------------------------------------------------------------------------- dev.new() par(mfrow=c(1,2)) col.2.use <- as.numeric(unlist(sample.info$Color)) shape.2.use <- as.numeric(unlist(sample.info$Shape)) pca.results <- princomp(running.normalized,cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,1],pca.loadings[,2],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) x.range <- range(pca.loadings[,2]) y.range <- range(pca.loadings[,3]) xlab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") ylab <- paste(c("Principal Component 3 (",variancePer[3],"%)"),collapse="") plot(pca.loadings[,2],pca.loadings[,3],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,2],pca.loadings[,3],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) #------------------------------------------------------------------------------- # Repeat Generate Heat Map #------------------------------------------------------------------------------- dev.new() cdat <- running.normalized cdat <- cor(cdat) par("mar" = c(5, 4, 4, 10) + 0.1) # wide righthand margin greys <- rgb(r=(3:0)/3, g=(3:0)/3, b=(3:0)/3, names=paste("greys",3:0,sep=".")) image(cdat,col=greys) image.scale(as.numeric(cdat),col=greys,digits = 3) box() #------------------------------------------------------------------------------- # Noise Analysis #------------------------------------------------------------------------------- dev.new() working.data <- running.normalized group1 <- working.data[,sample.info$Group==1] group2 <- working.data[,sample.info$Group==2] group3 <- working.data[,sample.info$Group==3] group4 <- working.data[,sample.info$Group==4] group5 <- working.data[,sample.info$Group==5] group6 <- working.data[,sample.info$Group==6] group7 <- working.data[,sample.info$Group==8] group8 <- working.data[,sample.info$Group==9] g1.mean <- apply(group1,1,mean) g2.mean <- apply(group2,1,mean) g3.mean <- apply(group3,1,mean) g4.mean <- apply(group4,1,mean) g5.mean <- apply(group5,1,mean) g6.mean <- apply(group6,1,mean) g7.mean <- apply(group7,1,mean) g8.mean <- apply(group8,1,mean) g1.sd <- apply(group1,1,sd) g2.sd <- apply(group2,1,sd) g3.sd <- apply(group3,1,sd) g4.sd <- apply(group4,1,sd) g5.sd <- apply(group5,1,sd) g6.sd <- apply(group6,1,sd) g7.sd <- apply(group7,1,sd) g8.sd <- apply(group8,1,sd) g1.cv <- (g1.sd/g1.mean)*100 g2.cv <- (g2.sd/g2.mean)*100 g3.cv <- (g3.sd/g3.mean)*100 g4.cv <- (g4.sd/g4.mean)*100 g5.cv <- (g5.sd/g5.mean)*100 g6.cv <- (g6.sd/g6.mean)*100 g7.cv <- (g7.sd/g7.mean)*100 g8.cv <- (g8.sd/g8.mean)*100 lowess.g1 <- lowess(g1.cv~g1.mean) lowess.g2 <- lowess(g2.cv~g2.mean) lowess.g3 <- lowess(g3.cv~g3.mean) lowess.g4 <- lowess(g4.cv~g4.mean) lowess.g5 <- lowess(g5.cv~g5.mean) lowess.g6 <- lowess(g6.cv~g6.mean) lowess.g7 <- lowess(g7.cv~g7.mean) lowess.g8 <- lowess(g8.cv~g8.mean) xrange <- range(c(2:14)) yrange <- range(c(0:20)) plot(xrange,yrange,type='n',xlab="Mean Expression",ylab="C.V.") palette("default") points(g1.mean,g1.cv,type='p',col=8) points(g2.mean,g2.cv,type='p',col=8) points(g3.mean,g3.cv,type='p',col=8) points(g4.mean,g4.cv,type='p',col=8) points(g5.mean,g5.cv,type='p',col=8) points(g6.mean,g6.cv,type='p',col=8) points(g7.mean,g7.cv,type='p',col=8) points(g8.mean,g8.cv,type='p',col=8) palette(rainbow(20)) lines(lowess.g1,col=5,lwd=2) lines(lowess.g2,col=4,lwd=2) lines(lowess.g3,col=11,lwd=2) lines(lowess.g4,col=14,lwd=2) lines(lowess.g5,col=3,lwd=2) lines(lowess.g6,col=2,lwd=2) lines(lowess.g7,col=5,lwd=2) lines(lowess.g8,col=17,lwd=2) abline(v=7.5) abline(h=0) #------------------------------------------------------------------------------- # Remove Noise #------------------------------------------------------------------------------- noise.pick <- 7.5 temp <- as.numeric(unlist(working.data)) temp <- ifelse(temp=1.5,1,0) #------------------------------------------------------------------------------- # Changing Gene Selection #------------------------------------------------------------------------------- cx <- coded.pvs+coded.fcs output.changers <- ifelse(cx==2,1,0) output.changers.summary <- apply(output.changers,2,sum) dev.new() barplot(output.changers.summary,col="black") #text(barplot(output.changers.summary),c(output.changers.summary-15), as.character(output.changers.summary)) abline(h=0) keepers <- dimnames(output.changers[apply(output.changers,1,max)>0,])[[1]] #------------------------------------------------------------------------------- # PCA Post Selection #------------------------------------------------------------------------------- dev.new() par(mfrow=c(1,2)) palette(rainbow(20)) col.2.use <- as.numeric(unlist(sample.info$Color)) shape.2.use <- as.numeric(unlist(sample.info$Shape)) pca.results <- princomp(posthoc.data[keepers,],cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,1],pca.loadings[,2],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) x.range <- range(pca.loadings[,2]) y.range <- range(pca.loadings[,3]) xlab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") ylab <- paste(c("Principal Component 3 (",variancePer[3],"%)"),collapse="") plot(pca.loadings[,2],pca.loadings[,3],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,2],pca.loadings[,3],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) text(pca.loadings[,2],pca.loadings[,3],dimnames(posthoc.data)[[2]]) #------------------------------------------------------------------------------- # Repeat Generate Heat Map #------------------------------------------------------------------------------- dev.new() cdat <- posthoc.data[keepers,] cdat <- cor(cdat) par("mar" = c(5, 4, 4, 10) + 0.1) # wide righthand margin greys <- rgb(r=(3:0)/3, g=(3:0)/3, b=(3:0)/3, names=paste("greys",3:0,sep=".")) image(cdat,col=greys) image.scale(as.numeric(cdat),col=greys,digits = 3) box() #------------------------------------------------------------------------------- # Generate Clustered Heat Map #------------------------------------------------------------------------------- dev.off() cc <- greenred(64) dd <- posthoc.data[keepers,] dimnames(dd)[[2]] <- sample.info$Group heatmap.2(as.matrix(dd),trace="none",density.info="none",col=cc) #------------------------------------------------------------------------------- # Output Results #------------------------------------------------------------------------------- output.means <- output.means[keepers,] output.anova.results <- output.anova.results[keepers,] output.posthoc <- output.posthoc[keepers,] output.fcs <- output.fcs[keepers,] output.changers <- output.changers[keepers,] cx <- cbind(output.means,output.anova.results[,2]) cx <- cbind(cx,output.posthoc) cx <- cbind(cx,output.fcs) cx <- cbind(cx,output.changers) write.table(cx,"FINAL_FINAL_results_table.txt",sep="\t") #------------------------------------------------------------------------------ # KNN # https://www.datacamp.com/community/tutorials/machine-learning-in-r # http://blog.echen.me/2011/04/27/choosing-a-machine-learning-classifier/ # https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm #------------------------------------------------------------------------------ # Mean_OLD_D3_WT 1 # Mean_OLD_D10_WT 2 # Mean_OLD_D10_P35_KO 3 # Mean_OLD_D10_P35_OE 4 # Mean_OLD_D30_WT 5 # Mean_OLD_D45_WT 6 # Mean_NEW_D10_WT 8 # Mean_NEW_D10_P35_TG 9 temp1 <- sample.info$Group==1 #D3 temp2 <- sample.info$Group==2 #D10 temp3 <- sample.info$Group==5 #D30 temp4 <- sample.info$Group==6 #D45 temptemp <- temp1 temptemp <- temptemp+temp2 temptemp <- temptemp+temp3 temptemp <- temptemp+temp4 training.set <- Final.working.data[,temptemp==1] palette(rainbow(20)) col.2.use <- as.numeric(unlist(sample.info$Color))[temptemp==1] shape.2.use <- as.numeric(unlist(sample.info$Shape))[temptemp==1] pca.results <- princomp(training.set,cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,1],pca.loadings[,2],lwd=2,bg=col.2.use,cex=3,pch=shape.2.use) set.seed(1234) Class <- as.factor(as.numeric(sample.info$Group[temptemp==1])) true.labels <- as.factor(as.numeric(sample.info$Group)) running.keepers <- NULL init.ctab <- rep(0,32) running.ctab <- matrix(init.ctab,8,4) for (r in 1:dim(training.set)[2]) { print(r) fff <- training.set[,-c(r)] check.fff <- apply(fff,1,max)>7.5 fff <- fff[check.fff,] running.pvs <- NULL for (i in 1:dim(fff)[1]) { temp1 <- as.numeric(fff[i,]) temp2 <- data.frame(DATA=temp1,CLASS=factor(Class[-c(r)])) temp3 <- aov(DATA~CLASS,data=temp2) temp4 <- summary(temp3) temp5 <- temp4[[1]] temp6 <- temp5[,5][[1]] running.pvs <- c(running.pvs,temp6) } names(running.pvs) <- dimnames(fff)[[1]] running.anova.pvs <- running.pvs temp1 <- p.adjust(running.anova.pvs,method="BH") output.anova.results <- as.data.frame(cbind(running.anova.pvs,temp1)) dimnames(output.anova.results)[[2]] <- c("uncorrected_pvalue","corrected_pvalue") pass.anova <- dimnames(output.anova.results[output.anova.results[,2]<0.05,])[[1]] posthoc.data <- fff[pass.anova,] running.posthoc <- NULL running.diffs <- NULL for (i in 1:dim(posthoc.data)[1]) { temp1 <- as.numeric(posthoc.data[i,]) temp2 <- data.frame(DATA=temp1,Class=factor(Class[-c(r)])) temp3 <- aov(DATA~Class,data=temp2) temp4 <- TukeyHSD(temp3) temp5 <- temp4[[1]] temp6 <- temp5[,4] running.posthoc <- rbind(running.posthoc,temp6) temp7 <- temp5[,1] running.diffs <- rbind(running.diffs,temp7) } running.posthoc <- as.data.frame(running.posthoc) dimnames(running.posthoc)[[1]] <- dimnames(posthoc.data)[[1]] output.posthoc <- running.posthoc dimnames(running.diffs)[[1]] <- dimnames(posthoc.data)[[1]] coded.pvs <- ifelse(running.posthoc<0.05,1,0) output.fcs <- 2^running.diffs output.fcs <- ifelse(output.fcs<1,-1/output.fcs,output.fcs) output.fcm <- abs(output.fcs) coded.fcs <- ifelse(output.fcm>=1.5,1,0) cx <- coded.pvs+coded.fcs output.changers <- ifelse(cx==2,1,0) output.changers.summary <- apply(output.changers,2,sum) keepers <- dimnames(output.changers[apply(output.changers,1,max)>0,])[[1]] running.keepers <- c(running.keepers,keepers) ccc <- Class[-c(r)] ttt <- t(fff[keepers,]) eee <- t(Final.working.data[keepers,]) print(length(keepers)) fff_pred <- knn(train=ttt,test=eee,cl=Class[-c(r)],k=4,prob=TRUE) ctab <- CrossTable(x=true.labels,y=fff_pred,prop.chisq=FALSE)$t running.ctab <- running.ctab+ctab } kkk <- table(running.keepers) kkkk <- kkk/19*100 hist(kkkk,col="grey") kkk <- kkk[kkk==19] kkk <- names(kkk) #------------------------------------------------------------------------------- # Age Modeling #------------------------------------------------------------------------------- keepers1 <- sample.info$Color==5 keepers2 <- sample.info$Color==4 keepers3 <- sample.info$Color==3 keepers4 <- sample.info$Color==2 final.keep <- keepers1+keepers2+keepers3+keepers4 final.keep <- final.keep==1 temp <- sample.info[final.keep,] temp <- temp[temp$Data_Source=="OLD",] final.keep <- dimnames(temp)[[1]] final.sample.info <- sample.info[final.keep,] palette(rainbow(20)) col.2.use1 <- as.numeric(unlist(final.sample.info$Color)) dat <- t(Final.working.data[kkk,final.keep]) new.dat <- t(Final.working.data[kkk,sample.info$Data_Source=="OLD"]) nnew.dat <- t(Final.working.data[kkk,sample.info$Group!=8]) pca <- prcomp(dat, retx=T, center=T, scale=T) summary(pca) sum.pca <- summary(pca)[[6]][2,] dev.new() barplot(summary(pca)[[6]][2,]*100,ylab="% Total Variance Explained") abline(h=0) abline(h=1,lty=2) pc <- c(1,2,3) write.table(summary(pca)[[6]],"head_pca_stats.txt",sep="\t") write.table(pca$x,"head_sample_scores.txt",sep="\t") write.table(pca$rotation,"head_gene_loadings.txt",sep="\t") dev.new() plot(pca$x[,1], pca$x[,2], col=col.2.use1, cex=2, pch=19, xlab=paste0("PC ", pc[1], " (", round(pca$sdev[pc[1]]/sum(pca$sdev)*100,0), "%)"), ylab=paste0("PC ", pc[2], " (", round(pca$sdev[pc[2]]/sum(pca$sdev)*100,0), "%)")) points(pca$x[,1], pca$x[,2], col="black", cex=2, pch=21) p.new.dat <- predict(pca,newdata=new.dat) pp.new.dat <- predict(pca,newdata=nnew.dat) col.2.use2 <- as.numeric(unlist(sample.info$Color))[sample.info$Data_Source=="OLD"] ccol.2.use2 <- as.numeric(unlist(sample.info$Color))[sample.info$Group!=8] dev.new() plot(p.new.dat[,pc[1]],p.new.dat[,pc[2]], col=col.2.use2, cex=2, pch=19, xlab=paste0("PC", pc[1], " (", round(sum.pca[pc[1]]*100,2), "%)"), ylab=paste0("PC", pc[2], " (", round(sum.pca[pc[2]]*100,2), "%)")) points(p.new.dat[,pc[1]],p.new.dat[,pc[2]],col="black", cex=2, pch=21) dev.new() plot(pp.new.dat[,pc[1]],pp.new.dat[,pc[2]], col=ccol.2.use2, cex=2, pch=19, xlab=paste0("PC", pc[1], " (", round(sum.pca[pc[1]]*100,2), "%)"), ylab=paste0("PC", pc[2], " (", round(sum.pca[pc[2]]*100,2), "%)")) points(pp.new.dat[,pc[1]],pp.new.dat[,pc[2]],col="black", cex=2, pch=21) ages <- c(rep(3,4),rep(10,5),rep(30,5),rep(45,5)) loadings2.work.with <- cbind(pca$x,ages) ages.2.fit <- as.data.frame(loadings2.work.with) age.model <- lm(ages~PC1*PC2*PC3*PC4,ages.2.fit) age.step <- step(age.model) age.keep <- summary(age.step) age.keep Anova(age.step,type="III") ppp.age.model1 <- predict(age.step,ages.2.fit) names(ppp.age.model1) <- c(rep("D3",4),rep("D10",5),rep("D30",5),rep("D45",5)) dev.new() plot(c(rep(3,4),rep(10,5),rep(30,5),rep(45,5)),as.numeric(ppp.age.model1),col=col.2.use1,cex=2,pch=19,ylab="Predicted Age (days)",xlab="Actual Age (days)") points(c(rep(3,4),rep(10,5),rep(30,5),rep(45,5)),as.numeric(ppp.age.model1),col="black",pch=21,cex=2) abline(0,1) final.2.predict <- as.data.frame(p.new.dat) ppp.age.model2 <- predict(age.step,final.2.predict) names(ppp.age.model2) <- c(rep("D3",4),rep("D10",5),rep("D10_KO",5),rep("D10_OE",5),rep("D30",5),rep("D45",5)) write.table(ppp.age.model2,"head_predictions.txt",sep="\t") dev.new() plot(c(rep(3,4),rep(10,15),rep(30,5),rep(45,5)),as.numeric(ppp.age.model2),col=col.2.use2,cex=2,pch=19,ylab="Predicted Age (days)",xlab="Actual Age (days)") points(c(rep(3,4),rep(10,15),rep(30,5),rep(45,5)),as.numeric(ppp.age.model2),col="black",cex=2,pch=21) abline(0,1) ffinal.2.predict <- as.data.frame(pp.new.dat) pppp.age.model2 <- predict(age.step,ffinal.2.predict) names(pppp.age.model2) <- c(rep("D3",4),rep("D10",5),rep("D10_KO",5),rep("D10_OE",5),rep("D30",5),rep("D45",5),rep("D10_RSCU",5)) write.table(pppp.age.model2,"head_predictions.txt",sep="\t") dev.new() plot(c(rep(3,4),rep(10,15),rep(30,5),rep(45,5),rep(10,5)),as.numeric(pppp.age.model2),col=ccol.2.use2,cex=2,pch=19,ylab="Predicted Age (days)",xlab="Actual Age (days)") points(c(rep(3,4),rep(10,15),rep(30,5),rep(45,5),rep(10,5)),as.numeric(pppp.age.model2),col="black",cex=2,pch=21) abline(0,1) pc <- c(1,2,3) xlab <- paste0("PC ", pc[1], " (", round(pca$sdev[pc[1]]/sum(pca$sdev)*100,0), "%)") ylab <- paste0("PC ", pc[2], " (", round(pca$sdev[pc[2]]/sum(pca$sdev)*100,0), "%)") zlab <- paste0("PC ", pc[3], " (", round(pca$sdev[pc[3]]/sum(pca$sdev)*100,0), "%)") plot3d(pp.new.dat[,c(1:3)], col=ccol.2.use2,size=2,type="s",xlab=xlab,ylab=ylab,zlab=zlab) #------------------------------------------------------------------------------- # Test Model Predictions & Export Stats #------------------------------------------------------------------------------- temp2 <- data.frame(DATA=as.numeric(pppp.age.model2),CLASS=factor(names(pppp.age.model2))) temp3 <- aov(DATA~CLASS,data=temp2) temp4 <- TukeyHSD(temp3) unadj.means <- model.tables(temp3, which="CLASS", type="means")[[1]][2] posthoc.on.unadj.means <- TukeyHSD(temp3, which="CLASS")[[1]] write.table(posthoc.on.unadj.means,"posthoc.on.unadj.means.txt",sep="\t") running.out.stats <- NULL #D3 -> MEAN & SE m <- mean(temp2$DATA[temp2$CLASS=="D3"]) s <- s <- sd(temp2$DATA[temp2$CLASS=="D3"])/sqrt(length(temp2$DATA[temp2$CLASS=="D3"])) running.out.stats <- rbind(running.out.stats,c(m,s)) #D10 -> MEAN & SE m <- mean(temp2$DATA[temp2$CLASS=="D10"]) s <- sd(temp2$DATA[temp2$CLASS=="D10"])/sqrt(length(temp2$DATA[temp2$CLASS=="D10"])) running.out.stats <- rbind(running.out.stats,c(m,s)) #D10_KO -> MEAN & SE m <- mean(temp2$DATA[temp2$CLASS=="D10_KO"]) s <- sd(temp2$DATA[temp2$CLASS=="D10_KO"])/sqrt(length(temp2$DATA[temp2$CLASS=="D10_KO"])) running.out.stats <- rbind(running.out.stats,c(m,s)) #D10_OE -> MEAN & SE m <- mean(temp2$DATA[temp2$CLASS=="D10_OE"]) s <- sd(temp2$DATA[temp2$CLASS=="D10_OE"])/sqrt(length(temp2$DATA[temp2$CLASS=="D10_OE"])) running.out.stats <- rbind(running.out.stats,c(m,s)) #D30 -> MEAN & SE m <- mean(temp2$DATA[temp2$CLASS=="D30"]) s <- sd(temp2$DATA[temp2$CLASS=="D30"])/sqrt(length(temp2$DATA[temp2$CLASS=="D30"])) running.out.stats <- rbind(running.out.stats,c(m,s)) #D45 -> MEAN & SE m <- mean(temp2$DATA[temp2$CLASS=="D45"]) s <- sd(temp2$DATA[temp2$CLASS=="D45"])/sqrt(length(temp2$DATA[temp2$CLASS=="D45"])) running.out.stats <- rbind(running.out.stats,c(m,s)) #D10_RSCU -> MEAN & SE m <- mean(temp2$DATA[temp2$CLASS=="D10_RSCU"]) s <- sd(temp2$DATA[temp2$CLASS=="D10_RSCU"])/sqrt(length(temp2$DATA[temp2$CLASS=="D10_RSCU"])) running.out.stats <- rbind(running.out.stats,c(m,s)) write.table(running.out.stats,"running.out.stats.txt",sep="\t") #------------------------------------------------------------------------------- # CODE END #-------------------------------------------------------------------------------