#------------------------------------------------------------------------------- # Environment Declarations #------------------------------------------------------------------------------- rm(list=ls()) options(object.size=Inf) memory.size(max = TRUE) #------------------------------------------------------------------------------- # Function Declarations #------------------------------------------------------------------------------- source("http://bioconductor.org/biocLite.R") biocLite() biocLite("multtest") library(multtest) install.packages("polycor") library(polycor) install.packages("gplots") library(gplots) image.scale <- function (z, col, x, y = NULL, size = NULL, digits = 2, labels = c("breaks", "ranges")) { # sort out the location n <- length(col) usr <- par("usr") mx <- mean(usr[1:2]); my <- mean(usr[3:4]) dx <- diff(usr[1:2]); dy <- diff(usr[3:4]) if (missing(x)) x <- mx + 1.05*dx/2 # default x to right of image else if (is.list(x)) { if (length(x$x) == 2) size <- c(diff(x$x), -diff(x$y)/n) y <- x$y[1] x <- x$x[1] } else x <- x[1] if (is.null(size)) if (is.null(y)) { size <- 0.618*dy/n # default size, golden ratio y <- my + 0.618*dy/2 # default y to give centred scale } else size <- (y-my)*2/n if (length(size)==1) size <- rep(size, 2) # default square boxes if (is.null(y)) y <- my + n*size[2]/2 # draw the image scale i <- seq(along = col) rect(x, y - i * size[2], x + size[1], y - (i - 1) * size[2], col = rev(col), xpd = TRUE) # sort out the labels rng <- range(z, na.rm = TRUE) bks <- seq(from = rng[2], to = rng[1], length = n + 1) bks <- formatC(bks, format="f", digits=digits) labels <- match.arg(labels) if (labels == "breaks") ypts <- y - c(0, i) * size[2] else { bks <- paste(bks[-1], bks[-(n+1)], sep = " - ") ypts <- y - (i - 0.5) * size[2] } text(x = x + 1.2 * size[1], y = ypts, labels = bks, adj = ifelse(size[1]>0, 0, 1), xpd = TRUE) } #------------------------------------------------------------------------------- # Import Data #------------------------------------------------------------------------------- working.data <- read.table("ExpressionData.txt",row.names=1,header=TRUE) sample.info <- read.table("SampleInfo.txt",row.names=1,header=TRUE) working.data <- working.data[,dimnames(sample.info)[[1]]] #------------------------------------------------------------------------------- # Generate Box Plot #------------------------------------------------------------------------------- WorkingList <- vector("list", dim(working.data)[2]) for ( i in 1:dim(working.data)[2]) { WorkingList[[i]] <- as.numeric(working.data[,i]) } col.2.use <- as.numeric(unlist(sample.info$Color)) boxplot(WorkingList,col=col.2.use) #------------------------------------------------------------------------------- # Generate PCA #------------------------------------------------------------------------------- pca.results <- princomp(working.data,cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points(pca.loadings[,1],pca.loadings[,2],lwd=2,col=col.2.use,cex=3,pch=19) points(pca.loadings[,1],pca.loadings[,2],lwd=2,col=1,cex=3,pch=21) #------------------------------------------------------------------------------- # Generate Heat Map #------------------------------------------------------------------------------- cdat <- working.data cdat <- cor(cdat) par("mar" = c(5, 4, 4, 10) + 0.1) # wide righthand margin greys <- rgb(r=(3:0)/3, g=(3:0)/3, b=(3:0)/3, names=paste("greys",3:0,sep=".")) image(cdat,col=greys) image.scale(as.numeric(cdat),col=greys,digits = 3) box() #------------------------------------------------------------------------------- # Noise Analysis #------------------------------------------------------------------------------- group1 <- working.data[,col.2.use==4] group2 <- working.data[,col.2.use==2] group3 <- working.data[,col.2.use==5] group4 <- working.data[,col.2.use==6] g1.mean <- apply(group1,1,mean) g2.mean <- apply(group2,1,mean) g3.mean <- apply(group3,1,mean) g4.mean <- apply(group4,1,mean) g1.sd <- apply(group1,1,sd) g2.sd <- apply(group2,1,sd) g3.sd <- apply(group3,1,sd) g4.sd <- apply(group4,1,sd) g1.cv <- (g1.sd/g1.mean)*100 g2.cv <- (g2.sd/g2.mean)*100 g3.cv <- (g3.sd/g3.mean)*100 g4.cv <- (g4.sd/g4.mean)*100 lowess.g1 <- lowess(g1.cv~g1.mean) lowess.g2 <- lowess(g2.cv~g2.mean) lowess.g3 <- lowess(g3.cv~g3.mean) lowess.g4 <- lowess(g4.cv~g4.mean) xrange <- range(c(2:14)) yrange <- range(c(0:20)) plot(xrange,yrange,type='n',xlab="Mean Expression",ylab="C.V.") points(g2.mean,g2.cv,type='p',col=8) lines(lowess.g1,col=4,lwd=2) lines(lowess.g2,col=2,lwd=2) lines(lowess.g3,col=5,lwd=2) lines(lowess.g4,col=6,lwd=2) abline(v=5) #------------------------------------------------------------------------------- # Remove Noise #------------------------------------------------------------------------------- noise.pick <- 5 temp <- as.numeric(unlist(working.data)) temp <- ifelse(temp BH FDR alpha = 0.05 #------------------------------------------------------------------------------- temp1 <- mt.rawp2adjp(running.pvs, proc="BH") temp2 <- temp1$adj[order(temp1$index),] temp2 <- as.data.frame(temp2) dimnames(temp2)[[1]] <- names(running.pvs) final.testing.results.table <- temp2 dimnames(final.testing.results.table)[[2]] <- c("uncorrected_pvalue","corrected_pvalue") write.table(final.testing.results.table,"Results_Post_Statistical_Testing.txt",sep="\t") barplot(c(dim(final.testing.results.table)[1],dim(final.testing.results.table)[1]-sum(final.testing.results.table[,2]<0.05),sum(final.testing.results.table[,2]<0.05)),col=1,ylab="Number of Gene Fragments") c(dim(final.testing.results.table)[1],dim(final.testing.results.table)[1]-sum(final.testing.results.table[,2]<0.05),sum(final.testing.results.table[,2]<0.05)) abline(h=0) pass.anova <- dimnames(final.testing.results.table)[[1]][final.testing.results.table[,2]<0.05] #------------------------------------------------------------------------------- # Repeat Visualizations Post FDR MCC #------------------------------------------------------------------------------- Final.working.data <- Final.working.data[pass.anova,] pca.results <- princomp(Final.working.data,cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points.2.use <- as.numeric(unlist(sample.info$Shape)) points(pca.loadings[,1],pca.loadings[,2],lwd=2,col=col.2.use,cex=2,pch=19) points(pca.loadings[,1],pca.loadings[,2],lwd=2,col=1,cex=2,pch=21) x.range <- range(pca.loadings[,2]) y.range <- range(pca.loadings[,3]) xlab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") ylab <- paste(c("Principal Component 3 (",variancePer[3],"%)"),collapse="") plot(pca.loadings[,2],pca.loadings[,3],type='n',xlab=xlab,ylab=ylab) points.2.use <- as.numeric(unlist(sample.info$Shape)) points(pca.loadings[,2],pca.loadings[,3],lwd=2,col=col.2.use,cex=2,pch=19) points(pca.loadings[,2],pca.loadings[,3],lwd=2,col=1,cex=2,pch=21) cdat <- Final.working.data[pass.anova,] cdat <- cor(cdat) par("mar" = c(5, 4, 4, 10) + 0.1) # wide righthand margin greys <- rgb(r=(3:0)/3, g=(3:0)/3, b=(3:0)/3, names=paste("greys",3:0,sep=".")) image(cdat,col=greys) image.scale(as.numeric(cdat),col=greys,digits = 3) box() #------------------------------------------------------------------------------- # Perform Post Hoc Testing #------------------------------------------------------------------------------- curated.Final.working.data <- Final.working.data[pass.anova,] Class <- as.factor(as.numeric(sample.info$Color)) running.pvs <- NULL for (i in 1:dim(curated.Final.working.data)[1]) { print(i) temp1 <- as.numeric(unlist(curated.Final.working.data[i,])) temp2 <- data.frame(DATA=temp1,CLASS=factor(Class)) temp3 <- aov(DATA~CLASS,data=temp2) temp4 <- TukeyHSD(temp3) temp5 <- temp4[[1]] temp6 <- temp5[,4] running.pvs <- rbind(running.pvs,temp6) } dimnames(running.pvs)[[1]] <- dimnames(curated.Final.working.data)[[1]] dimnames(running.pvs)[[2]] <- dimnames(temp5)[[1]] #------------------------------------------------------------------------------- # Perform Fold Change Analysis #------------------------------------------------------------------------------- g1.mean <- apply(curated.Final.working.data[,col.2.use==4],1,mean) g2.mean <- apply(curated.Final.working.data[,col.2.use==2],1,mean) g3.mean <- apply(curated.Final.working.data[,col.2.use==5],1,mean) g4.mean <- apply(curated.Final.working.data[,col.2.use==6],1,mean) g4 <- g1.mean g2 <- g2.mean g5 <- g3.mean g6 <- g4.mean running.fcs <- NULL # 4 vs 2 fc <- g4-g2 fc <- 2^fc fc <- ifelse(fc<1,-1/fc,fc) running.fcs <- cbind(running.fcs,fc) # 5 vs 2 fc <- g5-g2 fc <- 2^fc fc <- ifelse(fc<1,-1/fc,fc) running.fcs <- cbind(running.fcs,fc) # 6 vs 2 fc <- g6-g2 fc <- 2^fc fc <- ifelse(fc<1,-1/fc,fc) running.fcs <- cbind(running.fcs,fc) # 5 vs 4 fc <- g5-g4 fc <- 2^fc fc <- ifelse(fc<1,-1/fc,fc) running.fcs <- cbind(running.fcs,fc) # 6 vs 4 fc <- g6-g4 fc <- 2^fc fc <- ifelse(fc<1,-1/fc,fc) running.fcs <- cbind(running.fcs,fc) # 6 vs 5 fc <- g6-g5 fc <- 2^fc fc <- ifelse(fc<1,-1/fc,fc) running.fcs <- cbind(running.fcs,fc) dimnames(running.fcs)[[2]] <- c("4_vs_2", "5_vs_2", "6_vs_2", "5_vs_4", "6_vs_4", "6_vs_5") coded.pvs <- ifelse(running.pvs<0.05,1,0) coded.fcs <- ifelse(abs(running.fcs)>=1.5,1,0) cx.coded <- coded.pvs+coded.fcs cx.coded <- ifelse(cx.coded==2,1,0) apply(cx.coded,2,sum) sum(apply(cx.coded,1,max)) selected.gene.probes <- dimnames(cx.coded)[[1]][apply(cx.coded,1,max)==1] write.table(running.fcs,"fc.results.txt",sep="\t") write.table(running.pvs,"pv.results.txt",sep="\t") write.table(cx.coded,"cx.coded.txt",sep="\t") #------------------------------------------------------------------------------- # Repeat Visualizations Post Selection #------------------------------------------------------------------------------- pca.results <- princomp(Final.working.data[selected.gene.probes,],cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points.2.use <- as.numeric(unlist(sample.info$Shape)) points(pca.loadings[,1],pca.loadings[,2],lwd=2,col=col.2.use,cex=2,pch=19) points(pca.loadings[,1],pca.loadings[,2],lwd=2,col=1,cex=2,pch=21) x.range <- range(pca.loadings[,2]) y.range <- range(pca.loadings[,3]) xlab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") ylab <- paste(c("Principal Component 3 (",variancePer[3],"%)"),collapse="") plot(pca.loadings[,2],pca.loadings[,3],type='n',xlab=xlab,ylab=ylab) points.2.use <- as.numeric(unlist(sample.info$Shape)) points(pca.loadings[,2],pca.loadings[,3],lwd=2,col=col.2.use,cex=2,pch=19) points(pca.loadings[,2],pca.loadings[,3],lwd=2,col=1,cex=2,pch=21) cdat <- Final.working.data[selected.gene.probes,] cdat <- cor(cdat) par("mar" = c(5, 4, 4, 10) + 0.1) # wide righthand margin greys <- rgb(r=(3:0)/3, g=(3:0)/3, b=(3:0)/3, names=paste("greys",3:0,sep=".")) image(cdat,col=greys) image.scale(as.numeric(cdat),col=greys,digits = 3) box() cc <- redgreen(9) heatmap.2(as.matrix(Final.working.data[selected.gene.probes,]),trace="none", Rowv=TRUE, Colv=TRUE,density.info="none",dendrogram="both",col=cc) Final.Final.data <- Final.working.data[selected.gene.probes,] #------------------------------------------------------------------------------- # Age Correlation Analysis #------------------------------------------------------------------------------- age <- sample.info$Time[-c(1:5)] age.data <- post.noise.filtered.data[,-c(1:5)] running.cor <- NULL running.random <- NULL for (i in 1:dim(age.data)[1]) { print(i) print(dim(age.data)[1]) if(sum(as.numeric(age.data[i,])!=7.5)>0) { temp <- polyserial(as.numeric(age.data[i,]),age) running.cor <- c(running.cor,temp) temp <- polyserial(sample(as.numeric(age.data[i,]),length(age)),age) running.random <- c(running.random,temp) } else { running.cor <- c(running.cor,0) running.random <- c(running.random,0) } } names(running.cor) <- dimnames(age.data)[[1]] names(running.random) <- dimnames(age.data)[[1]] a <- density(running.cor) b <- density(running.random) xrange <- range(c(-1,0,1)) yrange <- range(c(0,1.5)) plot(xrange,yrange,type='n',xlab="Polyserial Correlation Rho",ylab="Density") lines(b,col=2,lwd=2,lty=2) lines(a,col=3,lwd=2,lty=2) r.sd <- sd(running.random) r.mean <- mean(running.random) r.real <- pnorm(abs(running.cor),mean=r.mean,sd=r.sd,lower.tail=FALSE) r.random <- pnorm(abs(running.random),mean=r.mean,sd=r.sd,lower.tail=FALSE) #points(running.random,r.random,col=2,cex=0.75,pch=20) points(running.cor,r.real,col=3,cex=0.75,pch=20) grab.pass <- r.real[r.real<0.05] length(grab.pass) grab.pass.up <- running.cor[names(grab.pass)] grab.pass.up <- grab.pass.up[grab.pass.up>0] length(grab.pass.up) grab.pass.down <- running.cor[names(grab.pass)] grab.pass.down <- grab.pass.down[grab.pass.down<0] length(grab.pass.down) names(r.random) <- names(running.cor) grab.05 <- sort(abs(r.random-0.05))[1] grab.05 <- names(r.random[names(grab.05)]) abline(v=running.random[grab.05],col=2,lwd=4) abline(v=-running.random[grab.05],col=2,lwd=4) cx.cor <- cbind(running.random,r.random,running.cor,r.real) write.table(cx.cor,"cx.cor.txt",sep="\t") write.table(grab.pass,"grab.pass.txt",sep="\t") pca.results <- princomp(age.data[names(grab.pass),],cor=F) pca.loadings <- loadings(pca.results) totalvar <- (pca.results$sdev^2) variancePer <- round(totalvar/sum(totalvar)*100,1) variancePer <- variancePer[1:3] x.range <- range(pca.loadings[,1]) y.range <- range(pca.loadings[,2]) xlab <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="") ylab <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="") plot(pca.loadings[,1],pca.loadings[,2],type='n',xlab=xlab,ylab=ylab) points.2.use <- as.numeric(unlist(sample.info$Shape)) points(pca.loadings[,1],pca.loadings[,2],lwd=2,col=col.2.use[-c(1:5)],cex=2,pch=19) points(pca.loadings[,1],pca.loadings[,2],lwd=2,col=1,cex=2,pch=21) temp <- pca.results[[6]] temp.1 <- temp[,1] plot(sort(temp.1),ylab="PCA Weighting",xlab="Age Correlated Genes") abline(h=0) plot(density(temp.1)) write.table(temp[,c(1,2)],"PCA.txt",sep="\t")