# Eklund_script01.txt # 2007-12-04 # Aron Charles Eklund # http://www.cbs.dtu.dk/suppl/biascorr/ # # generates figures 1 - 4 # (those related to Signoretti breast cancer data set) # tested in R 2.5.1 #### Fig. 1 (dendrograms of Signoretti data) library(affy) library(bias) library(squash) library(RColorBrewer) load('~/aron/chb/data/batch2/signoretti.batch2.RData') signoretti.rma <- rma(signoretti.batch2) ## calculate bias metrics signoretti.bm <- getBiasMetrics(signoretti.batch2, signoretti.rma) ### set up matrix annotating replicates and normals, for graphical display myPal <- brewer.pal(9, 'Set1') replicate <- c(NA, myPal)[signoretti.rma$replicateGroup + 1] normal <- ifelse(signoretti.rma$normal, 'black', 'white') myMat <- cbind(replicate, normal) ### generate various dendrograms ## clustering using (1 - Pearson correlation) as distance metric pclust <- function(x) { if( inherits(x, 'ExpressionSet')) x <- exprs(x) hclust(as.dist(1 - cor(x))) } #### we need a way to count how many replicate pairs are correctly clustered together ## first define the negative indices of replicate pairs repIndices <- -sapply(1:9, function(x) which(signoretti.rma$replicateGroup == x)) ## this function counts paired samples (at first merge) in a dendrogram countPairedSamples <- function(x) { good <- apply(x$merge, 1, function(y) { isMatch <- apply(repIndices, 2, function(z) all(z == y)) any(isMatch) }) sum(good) } ###### ## 1a. untreated data signoretti.rma.pclust <- pclust(signoretti.rma) countPairedSamples(signoretti.rma.pclust) ## 4 ## 1b. variance-filtered data (top 100 highest variance probe sets) myVar <- apply(exprs(signoretti.rma), 1, var) myVar100 <- order(myVar, decreasing = TRUE)[1:100] signoretti.rma.vfilt <- signoretti.rma[myVar100,] signoretti.rma.vfilt.pclust <- pclust(signoretti.rma.vfilt) countPairedSamples(signoretti.rma.vfilt.pclust) ## 9 ## 1c. bias-corrected data signoretti.rma.bc <- biasCorrection(signoretti.rma, signoretti.bm) signoretti.rma.bc.pclust <- pclust(signoretti.rma.bc) countPairedSamples(signoretti.rma.bc.pclust) ## 9 ## (no figure) quantile-normalized data signoretti.rma.qn <- quantileNormalize(exprs(signoretti.rma)) signoretti.rma.qn.pclust <- pclust(signoretti.rma.qn) countPairedSamples(signoretti.rma.qn.pclust) ## 5 ## (no figure) quantile-normalized data v. 2 ## (affy package implementation of QN) signoretti.rma.qn2 <- normalize.quantiles(exprs(signoretti.rma)) signoretti.rma.qn2.pclust <- pclust(signoretti.rma.qn2) countPairedSamples(signoretti.rma.qn2.pclust) ## 5 ## (no figure) batch-corrected data signoretti.rma.bat <- batchCorrection(signoretti.rma, b = signoretti.rma$scan.date) signoretti.rma.bat.pclust <- pclust(signoretti.rma.bat) countPairedSamples(signoretti.rma.bat.pclust) ## 5 ## Fig. 1d -- effects of filtering thresholds on number of clustered pairs. thresh <- c(10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 12625) clusterTopN <- function(n) { wh <- order(myVar, decreasing = TRUE)[1:n] pclust(signoretti.rma[wh,]) } nPaired <- sapply(thresh, function(x) countPairedSamples(clusterTopN(x))) ## [1] 7 9 9 9 8 8 7 7 6 5 4 ### create Figure 1 pl01 <- function(x, figlabel, ...) { plotDendroCol(as.dendrogram(x), colMat = myMat, leaflab = 'none', colSize = 0.5, colGap = 0.1, xlim = c(3, 97), x.lab.adj = 1, ...) mtext(figlabel, adj = -0.06, font = 2, cex = 1.2) } pdf('Fig1.pdf', width = 12, height = 10) par(las = 1, mar = c(2, 6, 4, 2), mfrow = c(4,1), cex = 0.75) pl01(signoretti.rma.pclust, 'a') pl01(signoretti.rma.vfilt.pclust, 'b') pl01(signoretti.rma.bc.pclust, 'c') par(mar = c(5,6,4,60)) plot(thresh, nPaired, log = 'x', bty = 'l', xlab = 'number of genes retained', ylab = 'correctly paired replicates') mtext('d', adj = -0.35, font = 2, cex = 1.2, line = 2) dev.off() ### how much variance is removed by bias correction? ## sum square deviation from row-means ssd <- function(x) { x <- sweep(x, 1, rowMeans(x)) sum(x ^ 2) } ssd(exprs(signoretti.rma.bc)) / ssd(exprs(signoretti.rma)) ## 0.6707058 (33% explained) ############# Fig. 3 (correlation of bias metrics with genes in Signoretti data) mycor <- cor(t(exprs(signoretti.rma)), signoretti.bm) ## use a single set of parameters for the density function ## (so each vector of correlations is treated the same) ## note: n = 128 is chosen for a smaller figure size ## note: bw = 0.03 is near what would be chosen automatically for mycor dens <- function(x) density(x, n = 128, from = -1, to = 1, bw = 0.03) mycor.dens <- apply(mycor, 2, dens) ## compute correlations for 100 random permutations of each bias metric set.seed(2007) permutation.dens <- lapply(1:4, function(i) { permutedBM <- sapply(1:100, function(x) sample(signoretti.bm[,i])) permutedBMcor <- cor(t(exprs(signoretti.rma)), permutedBM) dens(permutedBMcor) }) colnames(signoretti.bm) ## "pm.median" "pm.IQR" "rma.IQR" "degradation" leg <- c('PM median', 'PM IQR', 'RMA IQR', "degradation", 'random') pdf('Fig3.pdf', width = 5, height = 5) plot(c(-1, 1), c(0, 3.7), type = 'n', axes = FALSE, xlab = 'Pearson correlation', ylab = 'density') axis(1) axis(2) abline(h = 0, lwd = 0.1, col = "gray") for (i in 1:4) lines(permutation.dens[[i]], col = 1) for (i in 1:4) lines(mycor.dens[[i]], col = i + 1) legend('topright', legend = leg, col = c(2:5, 1), lty = 1, bty = 'n', inset = 0.02) dev.off() ################ Figure (not in paper) - bias drives clustering ## "dist" function using (1 - Pearson correlation) metric distP <- function(x) { a <- 1 - cor(x) a[lower.tri(a)] } ## pairwise distances of entire data set: signo.rma.dP <- distP(exprs(signoretti.rma)) signo.rma.bc.dP <- distP(exprs(signoretti.rma.bc)) signo.rma.vfilt.dP <- distP(exprs(signoretti.rma.vfilt)) ## pairwise distances of two main bias metrics ## (We don't want to worry about fitting weights, ## so we pick only two metrics, which have similar ## magnitude of effect) signo.bm.dist <- dist(scale(signoretti.bm[,c('rma.IQR', 'degradation')])) ## want locations and colors for replicate pairs (in "dist" space) repPairs <- signoretti.rma$replicateGroup repPairs[repPairs == 0] <- NA repPairMat <- outer(repPairs, repPairs, '==') repPairMat[is.na(repPairMat)] <- FALSE isPair.dist <- as.dist(repPairMat) colorMat <- outer(repPairs, repPairs, function(x, y) ifelse(x == y, x, 0)) colorMat[is.na(colorMat)] <- 0 pairColor.dist <- colorMat[lower.tri(colorMat)] pairColor.dist <- c('black', myPal)[1 + pairColor.dist] ## (non-pairs are plotted first, so they don't ## obscure the pairs (in color) on top) pl02 <- function(x, y, figlabel = '', ...) { ylim <- c(0, max(y)) cc <- cor(x, y) plot(x[isPair.dist == 0], y[isPair.dist == 0], ylim = ylim, xlab = 'dissimilarity in bias metrics', ylab = 'dissimilarity in gene expression', ...) points(x[isPair.dist == 1], y[isPair.dist == 1], col = pairColor.dist[isPair.dist == 1], pch = 16) mtext(figlabel, adj = -0.1, font = 2) text(5, 0, sprintf('cor = %.2f', cc)) } pdf('Fig_bias_drives_clustering.pdf', width = 9, height = 3) par(mfrow = c(1,3)) pl02(signo.bm.dist, signo.rma.dP, figlabel = 'a', main = 'RMA only') pl02(signo.bm.dist, signo.rma.vfilt.dP, figlabel = 'b', main = 'RMA + variance filtering') pl02(signo.bm.dist, signo.rma.bc.dP, figlabel = 'c', main = 'RMA + bias correction') dev.off() ## how much of gene-expression distance is explained by bias metrics? summary(lm(signo.rma.dP ~ signo.bm.dist)) ## Multiple R-Squared: 0.4352, Adjusted R-squared: 0.435 ########## Fig. 2 ## calculate correlation matrices (on sorted expression matrices) calcCM.meansort <- function(x) { ord <- order(rowMeans(x)) calcCM( x[ord,] ) } ## (these take ~ 2 minutes each) signoretti.rma.cm <- calcCM.meansort(exprs(signoretti.rma )) signoretti.rma.bc.cm <- calcCM.meansort(exprs(signoretti.rma.bc)) r <- range(signoretti.rma.cm$z, signoretti.rma.bc.cm$z) # -0.1800237 0.1974303 map <- colormap(r, center = TRUE, colFn = orangeblue) pdf('Fig2.pdf', width = 9, height = 4) layout(matrix(1:3, nrow = 1), widths = c(1, 1, 0.3)) par(mar = c(4.1, 4.1, 2.1, 2.1), xpd = NA, cex = 1) colorgram(signoretti.rma.cm, map = map, key = FALSE, xlab = 'expression rank', ylab = 'expression rank') mtext('a', side = 3, adj = -0.2, line = 1, cex = par('cex'), font = 2) colorgram(signoretti.rma.bc.cm, map = map, key = FALSE, xlab = 'expression rank', ylab = 'expression rank') mtext('b', side = 3, adj = -0.2, line = 1, cex = par('cex'), font = 2) par(mar = c(5, 0, 3, 5), yaxs='r', las = 1) drawKey(map, lab = 'median correlation') dev.off() ######### Fig. 4 - intensity-dependent correlation between genes and bias metrics ## mean signal for each probe set m <- rowMeans(exprs(signoretti.rma)) ## calculate correlation vs. intensity for each sq <- lapply(colnames(mycor), function(i) { squash(m, mycor[,i], seq(along = m), length, ylim = c(-1,1)) } ) r4 <- range(sapply(sq, function(x) range(x$z, na.rm = T))) # 1 74 map4 <- colormap(r4) colnames(mycor) # [1] "pm.median" "pm.IQR" "rma.IQR" "degradation" leg4 <- c('PM median', 'PM IQR', 'RMA IQR', 'degradation') pdf('Fig4.pdf', width = 9, height = 2.5) layout(matrix(1:5, nrow = 1), widths = c(1, 1, 1, 1, 0.4)) par(mar = c(5,4,3,2)) for(i in 1:4) { colorgram(sq[[i]], map = map4, key = FALSE, xlab = 'mean log2 expression value', ylab = paste('correlation with', leg4[i])) mtext(letters[i], side = 3, adj = -0.4, line = 1, cex = par('cex'), font = 2) } par(mar = c(5, 0, 3, 5), yaxs='r', las = 1) drawKey(map4, lab='count') dev.off()