CSC8309 -- Gene Expression and Proteomics

Here is the all the code that you will have used in the practical.
##
##
## 
## This code the supplementary material of the paper
## http://bib.oxfordjournals.org/cgi/content/full/8/6/415
## doi:10.1093/bib/bbm043 following clean up and additional commenting by
## Phillip Lord (phillip.lord@newcastle.ac.uk).
##
##
## Code performs analysis of differential gene expression in Soybean.
##
## "SPLIT" comments are to enable easy HTML publication of code snippets.


##SPLIT: install

## install the basic bioconductor packages
source( "http://bioconductor.org/biocLite.R")
biocLite()
biocLite('soybeanprobe') 


## ensure installation before carrying on.


##SPLIT: reading_and_investigating

### Load the appropriate library for reading affy data, and inspect the data. 
library( 'affy' )
soy.ab <- ReadAffy( 'geo_data/GSM209576.CEL.gz',
                   'geo_data/GSM209585.CEL.gz',
                   'geo_data/GSM209594.CEL.gz',
                   'geo_data/GSM209577.CEL.gz',
                   'geo_data/GSM209586.CEL.gz',
                   'geo_data/GSM209595.CEL.gz',

                   ## we have gz files which R can read in. 
                   compress=TRUE)
                   


## Inspect the loaded data. This will make a network connection first time
## around and can be slow.
soy.ab

## Check out the names of the samples.
sampleNames( soy.ab )



## as the current sample names refer to the original files, we change this for
## something more, er, easy to remember.
new.sampleNames <- c('hr.a3.12','hr.b3.12','hr.c3.12',
                     'ts.a4.12','ts.b4.12','ts.c4.12')
sampleNames(soy.ab) <- new.sampleNames  

## and check that it has worked
sampleNames( soy.ab )



##SPLIT: subsetting
##
## We are trying to do some subsetting because not all of the probes on the
## chip are from soy


## read in another data frame called Species.Affy.ID.
## this links species names to affy ids. 
Species.Affy.ID <- read.table('SpeciesAffyID.txt', header = T, sep = "")
dim(Species.Affy.ID)



load( 'SoybeanCutObjects.RData' )

tv.for.glycine.max <- Species.Affy.ID$species == 'Glycine max'
table( tv.for.glycine.max )
listOutProbeSets <- Species.Affy.ID$affyID[ tv.for.glycine.max==FALSE ]

length( listOutProbeSets )
is.factor( listOutProbeSets )

## Create a character vector for listOutProbeSets
## One way: rename listOutProbeSets as a character vector
listOutProbeSets <- as.character(listOutProbeSets) 

## Confirm that listOutProbeSets is a character vector
is.character(listOutProbeSets) 

## check object
soy.ab


## this is the bit which actually removes the stuff we are not intereste
RemoveProbes(listOutProbes=NULL, listOutProbeSets, cdfpackagename, probepackagename) 

## Check that the object has less IDs now. There should be 37444. 
soy.ab

##SPLIT: pheno_names
# Start preparation for phenoData slot in AffyBatch object 
pd <- data.frame(population = c(1,1,1,2,2,2), replicate = c(1,2,3,1,2,3))

# Display contents of pd
pd

# Assign the sampleNames(soy.ab) to the rownames of pd
rownames(pd) <- sampleNames(soy.ab) 

# Display contents of pd again, notice change in rownames
pd

## Continue preparation for phenoData slot
metaData <- data.frame(labelDescription = c( 'population', 'replicate' ))

## Establish new phenoData slot  
phenoData(soy.ab) <- new( 'AnnotatedDataFrame', data = pd, varMetadata = metaData)

## Display pData(soy.ab)
pData(soy.ab) 

## Display phenoData(soy.ab)
phenoData(soy.ab) 

##SPLIT: image_setup
palette.gray <- c(rep(gray(0:10/10), times = seq(1,41, by = 4)))

library('RColorBrewer') 
brewer.cols <- brewer.pal(6, 'Set1')  

##SPLIT: basic_quality_images

## Now we want to look at the images of the chips. Basically, we are looking
## for something unusual here; one chip too bright for example. 

## Construct image plots for quality control assessment 
## Note: that these plots appear dark; they are not log transformed
par(oma = c(1,1,3,1))
par(mfrow = c(2,3)) 


palette.gray <- c(rep(gray(0:10/10), times = seq(1,41, by = 4)))
image(soy.ab[,1], transfo = function(x) x, col = palette.gray)
image(soy.ab[,2], transfo = function(x) x, col = palette.gray)
image(soy.ab[,3], transfo = function(x) x, col = palette.gray)
image(soy.ab[,4], transfo = function(x) x, col = palette.gray)
image(soy.ab[,5], transfo = function(x) x, col = palette.gray)
image(soy.ab[,6], transfo = function(x) x, col = palette.gray)
mtext ("> image(soy.ab[,i=1:6], transfo = function(x) x, col = palette.gray)", side = 3, outer = T, cex = .8)


## Construct individual image plots using log intensities
## Note: that these plots appear lighter; they are log transformed 
##
## This allows us to see a single microarray chip. 
par(mfrow = c(1,1))

palette.gray <- c(rep(gray(0:10/10), times = seq(1,41, by = 4))) 
image(soy.ab[,1],  col = palette.gray)
title(sub = "> image(soy.ab[,1],  col = palette.gray)")

## Repeat above commands with .. soy.ab[,i], i = 2-6

## Display all six log-transformed image plots in one plot
par(oma = c(3,1,3,1))
par(mfrow = c(2,3))
image(soy.ab[,1],  col = palette.gray)
image(soy.ab[,2],  col = palette.gray)
image(soy.ab[,3],  col = palette.gray)
image(soy.ab[,4],  col = palette.gray)
image(soy.ab[,5],  col = palette.gray)
image(soy.ab[,6],  col = palette.gray)
mtext("Image Plots - Hawaii/Resistant vs. Taiwan/Susceptible", side = 3, outer = T) 
mtext("Top Row: Hawaii/Resistant - Bottom Row: Taiwan/Susceptible", side = 1, outer = T) 

## Reset to single screen.
par(mfrow = c(1,1))


##SPLIT: boxplots_and_others

## Construct color boxplots
boxplot(soy.ab, col = brewer.cols, ylab = 'Unprocessed log (base 2) scale Probe Intensities', xlab = 'Array Names') 
title('Box Plots of soy Probe Level Data')

## and density plots
hist(soy.ab, col = brewer.cols, lty = 1, xlab = 'Log (base 2) Intensities', lwd = 3)
samp.leg.names <- new.sampleNames
legend(10, .4, legend = samp.leg.names, lty = 1, col = brewer.cols, lwd = 3) 
title('Density plots - probe level data for six soy arrays', cex = .6) 

# Construct MAplots
par(oma = c(1,1,3,1)) 
par(mfrow = c(2,3)) 
MAplot(soy.ab, cex = .5)
mtext('M', side = 2, outer = TRUE) 
mtext('A', side = 1, outer = TRUE) 
mtext('Six MA-Plots vs. the Median Mock Array', side = 3, outer = TRUE)
par(mfrow = c(1,1))  

# Construct single (first) MAplot to reproduce figure in paper 
par(oma = c(1,1,1,1))  
MAplot(soy.ab, which=1)


##SPLIT: affy_quality_assement
library( 'simpleaffy' )

## TODO this section is broken

## # Load (source in) function for quality control metrics revised 
## # by W. Gregory Alvord and Octavio Quinones to work# with 
## # soybeancdf
source('soybean.mod.qc.affy.r')

## # Execute soybean.mod.qc.affy() on object soy.ab
##soy.ab.qc <- soybean.mod.qc.affy(soy.ab) 

## # Assign qc metrics to objects
## avbg.soy.ab.qc <- avbg(soy.ab.qc) 
## minbg.soy.ab.qc <- minbg(soy.ab.qc) 
## maxbg.soy.ab.qc <- maxbg(soy.ab.qc) 
## sfs.soy.ab.qc <- sfs(soy.ab.qc) 
## percent.present.soy.ab.qc <- percent.present(soy.ab.qc)

## # Display objects (here, rounded to 2 decimal places)
## round(minbg.soy.ab.qc ,2)
## round(maxbg.soy.ab.qc ,2)
## round(avbg.soy.ab.qc ,2)
## round(sfs.soy.ab.qc ,2) 
## round(percent.present.soy.ab.qc,2)


##SPLIT: rna_degredation

## Construct RNA degradation plots and summary
library('affy')
par(mfrow = c(1,1))  
RNAdeg <- AffyRNAdeg(soy.ab) 
plotAffyRNAdeg(RNAdeg, col = c(rep('blue',3), rep('red', 3)))
summaryAffyRNAdeg(RNAdeg) 


##SPLIT: probe_level_models
# Fit a probe-level model to the soy.ab probe-level data
library('affyPLM')
Pset1 <- fitPLM(soy.ab)

# Display probe-level quality diagnostics for array # 1
par(mfrow = c(2,2))
par(oma = c(3,1,3,1)) 
image(soy.ab[,1], col = palette.gray)
image(Pset1, type = 'weights', which = 1) 
image(Pset1, type = 'resids', which = 1) 
image(Pset1, type = 'sign.resids', which = 1)
mtext('Probe Level Models - QC Checks', side = 3, outer = T) 
par(mfrow = c(1,1))
# Repeat above commands with .. soy.ab[,i], i = 2-6; which = i

# Construct Relative Log Expression (RLE) Plot
library(affyPLM)
Mbox(Pset1, col = brewer.cols, names = NULL, main = 'Relative Log Expression Plot - H/R vs. T/S') 

# Add ylim = c(-.5, .5) after observing previous plot  
Mbox(Pset1, col = brewer.cols, ylim = c(-.5,.5),  main = 'Relative Log Expression Plot - H/R vs. T/S') 

 
# Construct Normalized Unscaled Standard Error (NUSE) Plot
boxplot(Pset1, col = brewer.cols, main = 'NUSE Plot', ylab = 'NUSE - Normalized Unscaled Standard Error') 

# Add ylim = c(.9, 1.1) after observing previous plot 
boxplot(Pset1, ylim = c(.9, 1.1), col = brewer.cols, main = 'NUSE Plot', ylab = 'NUSE - Normalized Unscaled Standard Error', las = 2)

##SPLIT: normalisation
# Execute RMA procedure on soy.ab object; assign to object eset
eset <- rma(soy.ab) 

# Display eset
eset

# Create an object that contains exprs(eset), named exprs.eset
exprs.eset <- exprs(eset) 

# Create index values – Index1 for Hawaii/Resistant, 
# Index2 for Taiwan/Susceptible
Index1 <- 1:3
Index2 <- 4:6

# Compute Difference vector for rowMeans between H/R and T/S
Difference <- rowMeans(exprs.eset[,Index1]) -rowMeans(exprs.eset[,Index2]) 

# Also compute the Average for each row
Average <- rowMeans(exprs.eset) 

# Create data frame for matrix exprs.eset 
exprs.eset.df <- data.frame(exprs.eset)

##SPLIT: normalisation_checks

# Construct expression set boxplots following RMA 
par(oma = c(1,1,3,1))  
boxplot(exprs.eset.df, col = brewer.cols)
mtext('Boxplots Soybean(Glycine max subset) RMA Gene Expression Data', side = 3, outer = T)   

# Construct MA-Plot, Difference vs. Average 
plot(Average, Difference) 
lines(lowess(Average, Difference), col = 'red', lwd = 4) 
abline( h = -2)
abline( h = 2)
title(sub = "> lines(lowess(Average, Difference), col = 'red', lwd = 4)") 
mtext('MA-Plot, Difference vs. Average, Soybean (H/R & T/S)', outer = T, side = 3) 


##SPLIT: over_expressed

# Compute ordinary t statistics 
library('genefilter')
tt <- rowttests(exprs.eset, factor(eset$population)) 
names(tt) 

# Check that length of tt$statistic is 37744 
length(tt$statistic)


##SPLIT: volcano_prepare
## Start the construction of the Volcano Plot
## Construct the lod scores...
lod <- -log10(tt$p.value)
o1 <- order(abs(Difference), decreasing = TRUE)[1:50]
o2 <- order(abs(tt$statistic), decreasing = TRUE)[1:50]

# Construct union and intersection of sets
o <- union(o1, o2) 
i <- intersect(o1, o2)

# Display i; note, we have 4 intersects 
i

##SPLIT: volcano

# Construct simple Volcano plot to get an idea of the range 
# of the Difference values and lod values
plot(Difference, lod)   

# Construct more sophisticated set of commands for a nicer Volcano plot
par(mfrow = c(1,1))
plot(Difference[-o], lod[-o], cex = .25, xlim = c(-3,3), ylim = range(lod)) 
points(Difference[o1], lod[o1], pch = 18, col = 'blue', cex = 1.5) 
points(Difference[o2], lod[o2], pch = 1, col = 'red', cex = 2, lwd = 2) 
abline(h = 3) 
text(-2, 3.8, 'Red circles: 50 genes') 
text(-2, 3.6, 'with lowest p-values') 
text(-2, 3.4, '=====>>>') 
text(-2, 3.15, 'p < 0.001') 
title('Volcano Plot: lowest p values and largest fold differences') 


##SPLIT: limma
library(limma) 

# Construct population.groups for model design
population.groups <- factor(c(rep('Taiwan/Susceptible',3), rep('Hawaii/Resistant',3)))
design <- model.matrix( ~ population.groups ) 

# Display design
design

# Fit linear model 
fit <- lmFit(eset, design)
 
# Check the dimension of fit$coeff; should be 37744 x 2
dim(fit$coeff) 

# Execute eBayes to obtain attenuated t statistics 
fit.eBayes <- eBayes(fit)

# Check dimension of fit.eBayes$t; should be 37744 x 2
dim(fit.eBayes$t) 

# Compute statistics necessary for Volcano Plot 
# using attenuated t statistics 
lodd <- -log10(fit.eBayes$p.value[,2])
oo2 <- order(abs(fit.eBayes$t[,2]), decreasing = TRUE)[1:50] 
oo <- union(o1, oo2) 
ii <- intersect(o1, oo2) 

# Display ii
ii

# Construct Volcano plot using attenuated t statistics
plot(Difference[-oo], lodd[-oo], cex = .25, xlim = c(-3,3), ylim = range(lodd), xlab = 'Average (log) Fold-change', ylab = 'LOD score - Negative log10 of P-value') 
points(Difference[o1], lodd[o1], pch = 18, col = 'blue', cex = 1.5) 
points(Difference[oo2], lodd[oo2], pch = 1, col ='red',cex = 2,lwd = 2) 
abline(h = 3)
title('Volcano Plot with moderated t statistics')  
text(-2, 3.2, 'p < 0.001') 
text(1, 4, 'Nine intersects')

##SPLIT: heatmaps
# Construct the appropriate matrices/data frames for  
# construction of the heat map.  
ii.mat <- exprs.eset[ii,]
ii.df <- data.frame(ii.mat)

# Dislplay ii.df 
ii.df

# Construct heatmap using reds and blues
library('RColorBrewer') 
library('genefilter') 
hmcol <- colorRampPalette(brewer.pal(10, 'RdBu'))(256)
tv.Hawaii.Resistant <- dimnames(ii.mat)[[2]][1:3] 
spcol <- ifelse(dimnames(ii.mat)[[2]] == tv.Hawaii.Resistant, 'skyblue', 'goldenrod')
heatmap(ii.mat, col = hmcol, ColSideColors = spcol, margins = c(10,15)) 

# Construct heatmap using a grey (gray) scale
library('RColorBrewer') 
library('genefilter') 
hmcol <- colorRampPalette(brewer.pal(9, 'Greys'))(256)
tv.Hawaii.Resistant <- dimnames(ii.mat)[[2]][1:3] 
spcol <- ifelse(dimnames(ii.mat)[[2]] == tv.Hawaii.Resistant, 'grey10', 'grey80')
heatmap(ii.mat, col = hmcol, ColSideColors = spcol, margins = c(10,15)) 

##SPLIT: end
(Complete File)(Rout)
You can get all files and all images from here.