## ## ## ## This code the supplementary material of the paper ## http://bib.oxfordjournals.org/cgi/content/full/8/6/415 ## doi:10.1093/bib/bbm043 following clean up and additional commenting by ## Phillip Lord (phillip.lord@newcastle.ac.uk). ## ## ## Code performs analysis of differential gene expression in Soybean. ## ## "SPLIT" comments are to enable easy HTML publication of code snippets. ##SPLIT: install ## install the basic bioconductor packages source( "http://bioconductor.org/biocLite.R") biocLite() biocLite('soybeanprobe') ## ensure installation before carrying on. ##SPLIT: reading_and_investigating ### Load the appropriate library for reading affy data, and inspect the data. library( 'affy' ) soy.ab <- ReadAffy( 'geo_data/GSM209576.CEL.gz', 'geo_data/GSM209585.CEL.gz', 'geo_data/GSM209594.CEL.gz', 'geo_data/GSM209577.CEL.gz', 'geo_data/GSM209586.CEL.gz', 'geo_data/GSM209595.CEL.gz', ## we have gz files which R can read in. compress=TRUE) ## Inspect the loaded data. This will make a network connection first time ## around and can be slow. soy.ab ## Check out the names of the samples. sampleNames( soy.ab ) ## as the current sample names refer to the original files, we change this for ## something more, er, easy to remember. new.sampleNames <- c('hr.a3.12','hr.b3.12','hr.c3.12', 'ts.a4.12','ts.b4.12','ts.c4.12') sampleNames(soy.ab) <- new.sampleNames ## and check that it has worked sampleNames( soy.ab ) ##SPLIT: subsetting ## ## We are trying to do some subsetting because not all of the probes on the ## chip are from soy ## read in another data frame called Species.Affy.ID. ## this links species names to affy ids. Species.Affy.ID <- read.table('SpeciesAffyID.txt', header = T, sep = "") dim(Species.Affy.ID) load( 'SoybeanCutObjects.RData' ) tv.for.glycine.max <- Species.Affy.ID$species == 'Glycine max' table( tv.for.glycine.max ) listOutProbeSets <- Species.Affy.ID$affyID[ tv.for.glycine.max==FALSE ] length( listOutProbeSets ) is.factor( listOutProbeSets ) ## Create a character vector for listOutProbeSets ## One way: rename listOutProbeSets as a character vector listOutProbeSets <- as.character(listOutProbeSets) ## Confirm that listOutProbeSets is a character vector is.character(listOutProbeSets) ## check object soy.ab ## this is the bit which actually removes the stuff we are not intereste RemoveProbes(listOutProbes=NULL, listOutProbeSets, cdfpackagename, probepackagename) ## Check that the object has less IDs now. There should be 37444. soy.ab ##SPLIT: pheno_names # Start preparation for phenoData slot in AffyBatch object pd <- data.frame(population = c(1,1,1,2,2,2), replicate = c(1,2,3,1,2,3)) # Display contents of pd pd # Assign the sampleNames(soy.ab) to the rownames of pd rownames(pd) <- sampleNames(soy.ab) # Display contents of pd again, notice change in rownames pd ## Continue preparation for phenoData slot metaData <- data.frame(labelDescription = c( 'population', 'replicate' )) ## Establish new phenoData slot phenoData(soy.ab) <- new( 'AnnotatedDataFrame', data = pd, varMetadata = metaData) ## Display pData(soy.ab) pData(soy.ab) ## Display phenoData(soy.ab) phenoData(soy.ab) ##SPLIT: image_setup palette.gray <- c(rep(gray(0:10/10), times = seq(1,41, by = 4))) library('RColorBrewer') brewer.cols <- brewer.pal(6, 'Set1') ##SPLIT: basic_quality_images ## Now we want to look at the images of the chips. Basically, we are looking ## for something unusual here; one chip too bright for example. ## Construct image plots for quality control assessment ## Note: that these plots appear dark; they are not log transformed par(oma = c(1,1,3,1)) par(mfrow = c(2,3)) palette.gray <- c(rep(gray(0:10/10), times = seq(1,41, by = 4))) image(soy.ab[,1], transfo = function(x) x, col = palette.gray) image(soy.ab[,2], transfo = function(x) x, col = palette.gray) image(soy.ab[,3], transfo = function(x) x, col = palette.gray) image(soy.ab[,4], transfo = function(x) x, col = palette.gray) image(soy.ab[,5], transfo = function(x) x, col = palette.gray) image(soy.ab[,6], transfo = function(x) x, col = palette.gray) mtext ("> image(soy.ab[,i=1:6], transfo = function(x) x, col = palette.gray)", side = 3, outer = T, cex = .8) ## Construct individual image plots using log intensities ## Note: that these plots appear lighter; they are log transformed ## ## This allows us to see a single microarray chip. par(mfrow = c(1,1)) palette.gray <- c(rep(gray(0:10/10), times = seq(1,41, by = 4))) image(soy.ab[,1], col = palette.gray) title(sub = "> image(soy.ab[,1], col = palette.gray)") ## Repeat above commands with .. soy.ab[,i], i = 2-6 ## Display all six log-transformed image plots in one plot par(oma = c(3,1,3,1)) par(mfrow = c(2,3)) image(soy.ab[,1], col = palette.gray) image(soy.ab[,2], col = palette.gray) image(soy.ab[,3], col = palette.gray) image(soy.ab[,4], col = palette.gray) image(soy.ab[,5], col = palette.gray) image(soy.ab[,6], col = palette.gray) mtext("Image Plots - Hawaii/Resistant vs. Taiwan/Susceptible", side = 3, outer = T) mtext("Top Row: Hawaii/Resistant - Bottom Row: Taiwan/Susceptible", side = 1, outer = T) ## Reset to single screen. par(mfrow = c(1,1)) ##SPLIT: boxplots_and_others ## Construct color boxplots boxplot(soy.ab, col = brewer.cols, ylab = 'Unprocessed log (base 2) scale Probe Intensities', xlab = 'Array Names') title('Box Plots of soy Probe Level Data') ## and density plots hist(soy.ab, col = brewer.cols, lty = 1, xlab = 'Log (base 2) Intensities', lwd = 3) samp.leg.names <- new.sampleNames legend(10, .4, legend = samp.leg.names, lty = 1, col = brewer.cols, lwd = 3) title('Density plots - probe level data for six soy arrays', cex = .6) # Construct MAplots par(oma = c(1,1,3,1)) par(mfrow = c(2,3)) MAplot(soy.ab, cex = .5) mtext('M', side = 2, outer = TRUE) mtext('A', side = 1, outer = TRUE) mtext('Six MA-Plots vs. the Median Mock Array', side = 3, outer = TRUE) par(mfrow = c(1,1)) # Construct single (first) MAplot to reproduce figure in paper par(oma = c(1,1,1,1)) MAplot(soy.ab, which=1) ##SPLIT: affy_quality_assement library( 'simpleaffy' ) ## TODO this section is broken ## # Load (source in) function for quality control metrics revised ## # by W. Gregory Alvord and Octavio Quinones to work# with ## # soybeancdf source('soybean.mod.qc.affy.r') ## # Execute soybean.mod.qc.affy() on object soy.ab ##soy.ab.qc <- soybean.mod.qc.affy(soy.ab) ## # Assign qc metrics to objects ## avbg.soy.ab.qc <- avbg(soy.ab.qc) ## minbg.soy.ab.qc <- minbg(soy.ab.qc) ## maxbg.soy.ab.qc <- maxbg(soy.ab.qc) ## sfs.soy.ab.qc <- sfs(soy.ab.qc) ## percent.present.soy.ab.qc <- percent.present(soy.ab.qc) ## # Display objects (here, rounded to 2 decimal places) ## round(minbg.soy.ab.qc ,2) ## round(maxbg.soy.ab.qc ,2) ## round(avbg.soy.ab.qc ,2) ## round(sfs.soy.ab.qc ,2) ## round(percent.present.soy.ab.qc,2) ##SPLIT: rna_degredation ## Construct RNA degradation plots and summary library('affy') par(mfrow = c(1,1)) RNAdeg <- AffyRNAdeg(soy.ab) plotAffyRNAdeg(RNAdeg, col = c(rep('blue',3), rep('red', 3))) summaryAffyRNAdeg(RNAdeg) ##SPLIT: probe_level_models ## this section is also current broken... ## # Fit a probe-level model to the soy.ab probe-level data ## library('affyPLM') ## Pset1 <- fitPLM(soy.ab) ## # Display probe-level quality diagnostics for array # 1 ## par(mfrow = c(2,2)) ## par(oma = c(3,1,3,1)) ## image(soy.ab[,1], col = palette.gray) ## image(Pset1, type = 'weights', which = 1) ## image(Pset1, type = 'resids', which = 1) ## image(Pset1, type = 'sign.resids', which = 1) ## mtext('Probe Level Models - QC Checks', side = 3, outer = T) ## par(mfrow = c(1,1)) ## # Repeat above commands with .. soy.ab[,i], i = 2-6; which = i ## # Construct Relative Log Expression (RLE) Plot ## library(affyPLM) ## Mbox(Pset1, col = brewer.cols, names = NULL, main = 'Relative Log Expression Plot - H/R vs. T/S') ## # Add ylim = c(-.5, .5) after observing previous plot ## Mbox(Pset1, col = brewer.cols, ylim = c(-.5,.5), main = 'Relative Log Expression Plot - H/R vs. T/S') ## # Construct Normalized Unscaled Standard Error (NUSE) Plot ## boxplot(Pset1, col = brewer.cols, main = 'NUSE Plot', ylab = 'NUSE - Normalized Unscaled Standard Error') ## # Add ylim = c(.9, 1.1) after observing previous plot ## boxplot(Pset1, ylim = c(.9, 1.1), col = brewer.cols, main = 'NUSE Plot', ylab = 'NUSE - Normalized Unscaled Standard Error', las = 2) ##SPLIT: normalisation # Execute RMA procedure on soy.ab object; assign to object eset eset <- rma(soy.ab) # Display eset eset # Create an object that contains exprs(eset), named exprs.eset exprs.eset <- exprs(eset) # Create index values – Index1 for Hawaii/Resistant, # Index2 for Taiwan/Susceptible Index1 <- 1:3 Index2 <- 4:6 # Compute Difference vector for rowMeans between H/R and T/S Difference <- rowMeans(exprs.eset[,Index1]) -rowMeans(exprs.eset[,Index2]) # Also compute the Average for each row Average <- rowMeans(exprs.eset) # Create data frame for matrix exprs.eset exprs.eset.df <- data.frame(exprs.eset) ##SPLIT: normalisation_checks # Construct expression set boxplots following RMA par(oma = c(1,1,3,1)) boxplot(exprs.eset.df, col = brewer.cols) mtext('Boxplots Soybean(Glycine max subset) RMA Gene Expression Data', side = 3, outer = T) # Construct MA-Plot, Difference vs. Average plot(Average, Difference) lines(lowess(Average, Difference), col = 'red', lwd = 4) abline( h = -2) abline( h = 2) title(sub = "> lines(lowess(Average, Difference), col = 'red', lwd = 4)") mtext('MA-Plot, Difference vs. Average, Soybean (H/R & T/S)', outer = T, side = 3) ##SPLIT: over_expressed # Compute ordinary t statistics library('genefilter') tt <- rowttests(exprs.eset, factor(eset$population)) names(tt) # Check that length of tt$statistic is 37744 length(tt$statistic) ##SPLIT: volcano_prepare ## Start the construction of the Volcano Plot ## Construct the lod scores... lod <- -log10(tt$p.value) o1 <- order(abs(Difference), decreasing = TRUE)[1:50] o2 <- order(abs(tt$statistic), decreasing = TRUE)[1:50] # Construct union and intersection of sets o <- union(o1, o2) i <- intersect(o1, o2) # Display i; note, we have 4 intersects i ##SPLIT: volcano # Construct simple Volcano plot to get an idea of the range # of the Difference values and lod values plot(Difference, lod) # Construct more sophisticated set of commands for a nicer Volcano plot par(mfrow = c(1,1)) plot(Difference[-o], lod[-o], cex = .25, xlim = c(-3,3), ylim = range(lod)) points(Difference[o1], lod[o1], pch = 18, col = 'blue', cex = 1.5) points(Difference[o2], lod[o2], pch = 1, col = 'red', cex = 2, lwd = 2) abline(h = 3) text(-2, 3.8, 'Red circles: 50 genes') text(-2, 3.6, 'with lowest p-values') text(-2, 3.4, '=====>>>') text(-2, 3.15, 'p < 0.001') title('Volcano Plot: lowest p values and largest fold differences') ##SPLIT: limma library(limma) # Construct population.groups for model design population.groups <- factor(c(rep('Taiwan/Susceptible',3), rep('Hawaii/Resistant',3))) design <- model.matrix( ~ population.groups ) # Display design design # Fit linear model fit <- lmFit(eset, design) # Check the dimension of fit$coeff; should be 37744 x 2 dim(fit$coeff) # Execute eBayes to obtain attenuated t statistics fit.eBayes <- eBayes(fit) # Check dimension of fit.eBayes$t; should be 37744 x 2 dim(fit.eBayes$t) # Compute statistics necessary for Volcano Plot # using attenuated t statistics lodd <- -log10(fit.eBayes$p.value[,2]) oo2 <- order(abs(fit.eBayes$t[,2]), decreasing = TRUE)[1:50] oo <- union(o1, oo2) ii <- intersect(o1, oo2) # Display ii ii # Construct Volcano plot using attenuated t statistics plot(Difference[-oo], lodd[-oo], cex = .25, xlim = c(-3,3), ylim = range(lodd), xlab = 'Average (log) Fold-change', ylab = 'LOD score - Negative log10 of P-value') points(Difference[o1], lodd[o1], pch = 18, col = 'blue', cex = 1.5) points(Difference[oo2], lodd[oo2], pch = 1, col ='red',cex = 2,lwd = 2) abline(h = 3) title('Volcano Plot with moderated t statistics') text(-2, 3.2, 'p < 0.001') text(1, 4, 'Nine intersects') ##SPLIT: heatmaps # Construct the appropriate matrices/data frames for # construction of the heat map. ii.mat <- exprs.eset[ii,] ii.df <- data.frame(ii.mat) # Dislplay ii.df ii.df # Construct heatmap using reds and blues library('RColorBrewer') library('genefilter') hmcol <- colorRampPalette(brewer.pal(10, 'RdBu'))(256) tv.Hawaii.Resistant <- dimnames(ii.mat)[[2]][1:3] spcol <- ifelse(dimnames(ii.mat)[[2]] == tv.Hawaii.Resistant, 'skyblue', 'goldenrod') heatmap(ii.mat, col = hmcol, ColSideColors = spcol, margins = c(10,15)) # Construct heatmap using a grey (gray) scale library('RColorBrewer') library('genefilter') hmcol <- colorRampPalette(brewer.pal(9, 'Greys'))(256) tv.Hawaii.Resistant <- dimnames(ii.mat)[[2]][1:3] spcol <- ifelse(dimnames(ii.mat)[[2]] == tv.Hawaii.Resistant, 'grey10', 'grey80') heatmap(ii.mat, col = hmcol, ColSideColors = spcol, margins = c(10,15)) ##SPLIT: end