# Grey ramp from black to white in 11 shades; each successive shade is repeated
# four more times than the one before it (1, 5, 9, ..., 41 copies).
palette.gray <- rep(gray((0:10) / 10), times = 4 * (0:10) + 1)

# Six colours from the ColorBrewer 'Set1' palette; used as the fill colours of
# the boxplots drawn later in this script.
library("RColorBrewer")
brewer.cols <- brewer.pal(n = 6, name = "Set1")


### Attach the package for reading Affymetrix data and read the six CEL files
### into one AffyBatch object.
library("affy")
soy.ab <- ReadAffy(
  filenames = c(
    "geo_data/GSM209576.CEL.gz",
    "geo_data/GSM209585.CEL.gz",
    "geo_data/GSM209594.CEL.gz",
    "geo_data/GSM209577.CEL.gz",
    "geo_data/GSM209586.CEL.gz",
    "geo_data/GSM209595.CEL.gz"
  ),
  ## the CEL files are gzipped, so ask ReadAffy to read compressed files
  compress = TRUE
)


## Print a summary of the loaded data. The first time this runs it makes a
## network connection and can be slow.
soy.ab

## Show the sample names as read in; at this point they come from the files.
sampleNames(soy.ab)


## Replace the file-derived sample names with short labels that are easier to
## remember. The order matches the order of the files above.
new.sampleNames <- c(
  "hr.a3.12", "hr.b3.12", "hr.c3.12",
  "ts.a4.12", "ts.b4.12", "ts.c4.12"
)
sampleNames(soy.ab) <- new.sampleNames

## Confirm that the renaming has worked.
sampleNames(soy.ab)


##
## Not all of the probe sets on the chip are from soy, so build the list of
## probe sets that have to be removed before the analysis.


## Read in a data frame called Species.Affy.ID, which links species names to
## affy ids. The code below uses its 'species' and 'affyID' columns.
## TRUE is spelled out because T is an ordinary variable that can be reassigned.
Species.Affy.ID <- read.table('SpeciesAffyID.txt', header = TRUE, sep = "")
dim(Species.Affy.ID)


## NOTE(review): presumably this file supplies RemoveProbes(), cdfpackagename
## and probepackagename, which are used further down but are not defined in
## this script -- confirm. load() puts every stored object into the workspace.
load( 'SoybeanCutObjects.RData' )

## Truth vector: TRUE for the rows whose species is 'Glycine max'.
tv.for.glycine.max <- Species.Affy.ID$species == 'Glycine max'
table( tv.for.glycine.max )

## Affy ids of every probe set that is NOT Glycine max; these are the ones to
## take out.
listOutProbeSets <- Species.Affy.ID$affyID[ !tv.for.glycine.max ]

length( listOutProbeSets )
is.factor( listOutProbeSets )

## RemoveProbes() is given a character vector below, so convert
## listOutProbeSets in place (it may have been read in as a factor).
listOutProbeSets <- as.character(listOutProbeSets)

## Confirm that listOutProbeSets is now a character vector
is.character(listOutProbeSets)

## Print the AffyBatch summary before filtering, for comparison with the
## summary printed again after the RemoveProbes() call below.
soy.ab


## This is the bit which actually removes the probe sets we are not interested
## in. RemoveProbes(), cdfpackagename and probepackagename are not defined in
## this script; presumably they come from the load() of
## 'SoybeanCutObjects.RData' -- confirm.
## NOTE(review): the return value is not assigned, so any effect on soy.ab
## must happen through side effects of RemoveProbes().
RemoveProbes(listOutProbes=NULL, listOutProbeSets, cdfpackagename, probepackagename)

## Check that the object has fewer IDs now. The later checks in this script
## expect 37744 probe sets (this comment originally said 37444, presumably a
## typo -- confirm).
soy.ab

# Build the sample annotation for the phenoData slot of the AffyBatch: one row
# per array, giving its population (1 or 2) and its replicate number (1 to 3).
pd <- data.frame(
  population = rep(c(1, 2), each = 3),
  replicate  = rep(c(1, 2, 3), times = 2)
)

# Show pd while it still has the default row names
pd

# Use the array names of soy.ab as the row names of pd
row.names(pd) <- sampleNames(soy.ab)

# Show pd again; the row names are now the sample names
pd

## One description per column of pd, in the same order as the columns
metaData <- data.frame(labelDescription = c("population", "replicate"))

## Combine pd and its column descriptions and store them as the phenoData
phenoData(soy.ab) <- new(
  "AnnotatedDataFrame",
  data = pd,
  varMetadata = metaData
)

## Show the sample table now held in soy.ab
pData(soy.ab)

## Show the phenoData object itself
phenoData(soy.ab)

# Run the RMA procedure on soy.ab and keep the result as eset
eset <- rma(soy.ab)

# Print a summary of eset
eset

# Pull the expression matrix out of eset (one column per array)
exprs.eset <- exprs(eset)

# Column positions of the two groups: Index1 (columns 1-3) for
# Hawaii/Resistant, Index2 (columns 4-6) for Taiwan/Susceptible
Index1 <- seq_len(3)
Index2 <- Index1 + 3L

# Per-row difference of the group means, H/R minus T/S
Difference <- rowMeans(exprs.eset[, Index1]) - rowMeans(exprs.eset[, Index2])

# Per-row mean over all six arrays
Average <- rowMeans(exprs.eset)

# Data-frame copy of the expression matrix, used for the boxplots
exprs.eset.df <- data.frame(exprs.eset)


# Boxplots of the expression values after RMA, one box per array.
# oma reserves three lines of outer margin at the top for the mtext() titles.
# The setting is not reset, so it also applies to every plot drawn afterwards.
par(oma = c(1, 1, 3, 1))
boxplot(exprs.eset.df, col = brewer.cols)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mtext('Boxplots Soybean(Glycine max subset) RMA Gene Expression Data', side = 3, outer = TRUE)

# MA-plot: Difference (H/R minus T/S) against the Average of each row, with a
# lowess smooth in red and horizontal reference lines at -2 and +2.
plot(Average, Difference)
lines(lowess(Average, Difference), col = 'red', lwd = 4)
abline(h = c(-2, 2))
# The sub-title shows the command that drew the red curve.
title(sub = "> lines(lowess(Average, Difference), col = 'red', lwd = 4)")
mtext('MA-Plot, Difference vs. Average, Soybean (H/R & T/S)', outer = TRUE, side = 3)


# Ordinary row-wise t tests between the two populations recorded in eset
library("genefilter")
tt <- rowttests(exprs.eset, fac = factor(eset$population))
names(tt)

# The length of tt$statistic should be 37744
length(tt$statistic)


## First ingredients of the Volcano Plot
## lod score: negative log10 of the ordinary t-test p-value
lod <- -log10(tt$p.value)

## Row positions of the 50 largest absolute differences (o1) and of the
## 50 largest absolute t statistics (o2)
o1 <- order(abs(Difference), decreasing = TRUE)[1:50]
o2 <- order(abs(tt$statistic), decreasing = TRUE)[1:50]

# Rows that appear in either top-50 list (o) and in both lists (i)
o <- union(o1, o2)
i <- intersect(o1, o2)

# Print i; note, we have 4 intersects
i

library(limma)

# Construct population.groups for the model design.
# The labels must follow the column order of eset: arrays 1-3 are the 'hr.*'
# samples (Hawaii/Resistant) and arrays 4-6 are the 'ts.*' samples
# (Taiwan/Susceptible). The labels used to be the other way round, which
# attached the wrong group name to the design column.
# Taiwan/Susceptible is made the reference level so that the second design
# column is 1 for arrays 1-3 and 0 for arrays 4-6, exactly as before; its
# coefficient is therefore H/R minus T/S, the same sign as Difference.
population.groups <- factor(
  rep(c('Hawaii/Resistant', 'Taiwan/Susceptible'), each = 3),
  levels = c('Taiwan/Susceptible', 'Hawaii/Resistant')
)
design <- model.matrix( ~ population.groups )

# Display design
design

# Fit a linear model to every row of eset using the design matrix
fit <- lmFit(eset, design)

# Check the dimension of the coefficient matrix; should be 37744 x 2.
# The component is named in full rather than relying on partial matching
# of `$` (fit$coeff).
dim(fit$coefficients)

# Execute eBayes to obtain moderated t statistics
fit.eBayes <- eBayes(fit)

# Check dimension of fit.eBayes$t; should be 37744 x 2
dim(fit.eBayes$t)

# Compute the statistics necessary for the Volcano Plot using the moderated
# t statistics. Column 2 is the non-intercept (group) column of design.
# lodd: negative log10 of the moderated p-value
lodd <- -log10(fit.eBayes$p.value[, 2])
# oo2: row positions of the 50 largest absolute moderated t statistics
oo2 <- order(abs(fit.eBayes$t[, 2]), decreasing = TRUE)[1:50]
# Rows in either top-50 list (oo) and in both lists (ii); o1 is the top-50
# list by absolute Difference computed earlier
oo <- union(o1, oo2)
ii <- intersect(o1, oo2)

# Display ii
ii

# Construct the Volcano plot using the moderated t statistics.
# Rows outside both top-50 lists are drawn first as small points.
plot(Difference[-oo], lodd[-oo],
     cex = .25, xlim = c(-3, 3), ylim = range(lodd),
     xlab = 'Average (log) Fold-change',
     ylab = 'LOD score - Negative log10 of P-value')
# Blue diamonds: top 50 by absolute Difference
points(Difference[o1], lodd[o1], pch = 18, col = 'blue', cex = 1.5)
# Red circles: top 50 by absolute moderated t statistic
points(Difference[oo2], lodd[oo2], pch = 1, col = 'red', cex = 2, lwd = 2)
# -log10(0.001) = 3, so points above this line have p < 0.001
abline(h = 3)
title('Volcano Plot with moderated t statistics')
text(-2, 3.2, 'p < 0.001')
# The number of rows in both top-50 lists is taken from ii instead of being
# hard-coded, so the label cannot disagree with the data.
text(1, 4, paste(length(ii), 'intersects'))