Pri

Data Scientist

Personal and professional website of Priyanka Oberoi

Microarray analysis

Here is the R code for a squamous cell carcinoma microarray dataset project that included data visualization, outlier identification, missing data imputation, normalization, statistical testing for differential expression, functional annotation, dimensionality reduction, cluster analysis and classification:

#DataSet Record GDS2520, Platform GPL8300, Series GSE6631
source("http://bioconductor.org/biocLite.R")
biocLite("Biobase")
biocLite("GEOquery")
library(Biobase)
library(GEOquery)
gds2520 <- getGEO(GEO='GDS2520')
Meta(gds2520)$feature_count #[1] "12625"
Meta(gds2520)$sample_count #[1] "44"

#turn GDS object into an expression set object (using base 2 logarithms)
eset <- GDS2eSet(gds2520, do.log2=TRUE) 
eset
#ExpressionSet (storageMode: lockedEnvironment)
#assayData: 12625 features, 44 samples 
#element names: exprs 
#protocolData: none
#phenoData
#sampleNames: GSM153813 GSM153814 ... GSM153856 (44 total)
#varLabels: sample individual disease.state description
#varMetadata: labelDescription
#featureData
#featureNames: 1000_at 1001_at ... AFFX-YEL024w/RIP1_at (12625 total)
#fvarLabels: ID Gene title ... GO:Component ID (21 total)
#fvarMetadata: Column labelDescription
#experimentData: use 'experimentData(object)'
#pubMedIds: 15170515 
#Annotation:
sampleNames(eset) 
#[1] "GSM153813" "GSM153814" "GSM153815" "GSM153816" "GSM153817" "GSM153818"
#[7] "GSM153819" "GSM153820" "GSM153821" "GSM153822" "GSM153823" "GSM153824"
#[13] "GSM153825" "GSM153826" "GSM153827" "GSM153828" "GSM153829" "GSM153830"
#[19] "GSM153831" "GSM153832" "GSM153833" "GSM153834" "GSM153835" "GSM153836"
#[25] "GSM153837" "GSM153838" "GSM153839" "GSM153840" "GSM153841" "GSM153842"
#[31] "GSM153843" "GSM153844" "GSM153845" "GSM153846" "GSM153847" "GSM153848"
#[37] "GSM153849" "GSM153850" "GSM153851" "GSM153852" "GSM153853" "GSM153854"
#[43] "GSM153855" "GSM153856"
Table(gds2520)[1:10,] #raw

dat <- exprs(eset)
ann <- pData(eset)

##testing for outliers##

#correlation matrix
dat.cor <- cor(dat, use="pairwise.complete.obs")
layout(matrix(c(1,1,1,1,1,1,1,1,2,2),5,2,byrow=TRUE))
par(oma=c(5,7,1,1))
par(mgp=c(0, 1, 0))
cx <- rev(colorpanel(25, 'yellow', 'black', 'blue'))
leg <- seq(min(dat.cor, na.rm=T), max(dat.cor, na.rm=T), length=10)
image(dat.cor, main='Correlation plot for Head \nand neck squamous cell carcinoma dataset', axes=F, col=cx, xlab='timepoints', ylab='timepoints')
axis(1, at=seq(0,1,length=ncol(dat.cor)), label=dimnames(dat.cor)[[2]], cex.axis=0.9, las=2)
axis(2, at=seq(0,1,length=ncol(dat.cor)), label=dimnames(dat.cor)[[2]], cex.axis=0.9, las=2)
image(as.matrix(leg), col=cx, axes=F)
tmp <- round(leg, 2)
axis(1, at=seq(0, 1, length=length(leg)), labels=tmp, cex.axis=1)

#CV vs Mean plot
dat.mean <- apply(dat,2,mean)
dat.sd <- sqrt(apply(dat,2,var))
dat.cv <- dat.sd/dat.mean
plot(dat.mean, dat.cv, main="Head and Neck Squamous Cell Carcinoma dataset \nCV vs. Mean", xlab="Mean", ylab="CV", col='blue', cex=1.5, type="n")
points(dat.mean, dat.cv, bg="lightblue", col=1, pch=21)
text(dat.mean, dat.cv, label=dimnames(dat)[[2]], pos=1, cex=0.5)

#Avg correlation plot
dat.avg <- apply(dat.cor,1,mean)
par(oma=c(3,0.1,0.1,0.1))
plot(c(1,length(dat.avg)),range(dat.avg),type="n",xlab="",ylab="Avg r",main="Avg correlation of Tumor/Normal tissue samples",axes=F)
points(dat.avg,bg="red",col=1,pch=21,cex=1.25)
axis(1,at=c(1:length(dat.avg)),labels=dimnames(dat)[[2]],las=2,cex.lab=0.4,cex.axis=0.6)
axis(2)
abline(v=seq(0.5,62.5,1),col="grey")

#cluster dendrogram
dat.t <- t(dat)#transpose dat
dat.dist <- dist(dat.t,method="euclidean")# calculate distance
dat.clust <- hclust(dat.dist,method="single")# calculate clusters
plot(dat.clust,labels=names(dat),cex=0.75)# plot cluster tree

#PCA plot
dat.pca <- prcomp(t(dat))
dat.loads <- dat.pca$x[,1:2]
plot(dat.loads[,1], dat.loads[,2], main='PCA plot of \nHead and Neck Squamous Cell Carcinoma dataset', xlab="p1", ylab="p2", col="red", cex=1, pch=15)
text(dat.loads, label=dimnames(dat)[[2]], pos=1, cex=0.5)

##Filtering out low expression genes##

#mean of genes
dat.mean.genes <- apply(dat, 1, mean)
mean.gt.4 <- dat.mean.genes>4
length(dat.mean.genes[mean.gt.4]) #[1] 11804
keep <- names(dat.mean.genes[mean.gt.4])
#filter out genes with a mean <= 4
dat2 <- dat[keep,]

##t-test##

#disease state classes#
ann <- ann[,-2]
ann <- ann[,-3]
dimnames(ann)[[2]] #[1] "sample""disease.state"
ann <- as.data.frame(ann)
ann2 <- as.data.frame(ann[,2])
dimnames(ann2)[[1]] <- dimnames(ann)[[1]]
normal <- ann[,2]=='normal'
normal <- dimnames(ann[normal,])[[1]]
carcinoma <- ann[,2]=='head and neck squamous carcinoma'
carcinoma <- dimnames(ann[carcinoma,])[[1]]
#run test
t.test.all.genes <- function(x,s1,s2) {
x1 <- x[s1]
x2 <- x[s2]
x1 <- as.numeric(x1)
x2 <- as.numeric(x2)
t.out <- t.test(x1,x2, alternative="two.sided",var.equal=T)
out <- as.numeric(t.out$p.value)
return(out)
}
pv <- apply(dat2, 1, t.test.all.genes, s1=normal, s2=carcinoma)
#plot
hist(pv, col="lightblue", xlab="p-values", main="P-value distribution", cex.main=0.9)
abline(v=0.05, col=2, lwd=2)

##adjust p-values##

pv.holm <- p.adjust(pv, method="holm")
pv.bonf <- p.adjust(pv, method="bonferroni")
pv.fdr <- p.adjust(pv, method="fdr")
#graphs of p-value distrbutions after adjustment
hist(pv.holm, col="lightblue", xlab="p-values", main="P-value distribution - holm adjusted", cex.main=0.9)
abline(v=0.05, col=2, lwd=2)
hist(pv.bonf, col="lightblue", xlab="p-values", main="P-value distribution - bonferroni adjusted", cex.main=0.9)
abline(v=0.05, col=2, lwd=2)
hist(pv.fdr, col="lightblue", xlab="p-values", main="P-value distribution - fdr adjusted", cex.main=0.9)
abline(v=0.05, col=2, lwd=2)
#number of genes with p-value less than 0.05
pv.lt.05 <- pv.fdr[(pv.fdr<0.05)]
length(pv.lt.05) #[1] 1365

##Plot of p-values for genes retained##
hist(pv.lt.05, col="lightblue", xlab="p-values", main="P-value distribution of Retained Genes", cex.main=0.9)

##Dimensionality Reduction or Clustering##

#subset data by genes retained
dat3 <- dat2[names(pv.lt.05),]

#PCA
dat.pca <- prcomp(t(dat3), cor=F)
#scree plot
dat.pca.var <- round(dat.pca$sdev^2 / sum(dat.pca$sdev^2)*100,2)
plot(c(1:length(dat.pca.var)), dat.pca.var, type="b", xlab="# components", ylab="% variance", pch=21, col=1, bg=3, cex=1.5)
title("Scree plot showing % variability explained by each eigenvalue")
#plot of three components
dat.loadings <- dat.pca$x[,1:3]
par(mfrow=c(2,2))
plot(range(dat.loadings[,1]),range(dat.loadings[,2]),type="n",xlab='p1',ylab='p2',main='PCA plot of Normal/Carcinoma Data\np2 vs. p1')
points(dat.loadings[,1][normal], dat.loadings[,2][normal],col=1,bg='red',pch=21,cex=1.5)
points(dat.loadings[,1][carcinoma], dat.loadings[,2][carcinoma],col=1,bg='blue',pch=21,cex=1.5)
legend('topleft',c("Normal","Carcinoma"),col=c("red","blue"),pch=15,cex=.9,horiz=F)
plot(range(dat.loadings[,1]),range(dat.loadings[,3]),type="n",xlab='p1',ylab='p3',main='PCA plot of Normal/Carcinoma Data\np3 vs. p1')
points(dat.loadings[,1][normal],dat.loadings[,3][normal],col=1,bg='red',pch=21,cex=1.5)
points(dat.loadings[,1][carcinoma], dat.loadings[,3][carcinoma],col=1,bg='blue',pch=21,cex=1.5)
legend('topleft',c("Normal","Carcinoma"),col=c("red","blue"),pch=15,cex=.9,horiz=F)
plot(range(dat.loadings[,2]),range(dat.loadings[,3]),xlab='p2',type="n",ylab='p3',main='PCA plot of Normal/Carcinoma Data\np3 vs. p2')
points(dat.loadings[,2][normal],dat.loadings[,3][normal],col=1,bg='red',pch=21,cex=1.5)
points(dat.loadings[,2][carcinoma], dat.loadings[,3][carcinoma],col=1,bg='blue',pch=21,cex=1.5)
legend('topleft',c("Normal","Carcinoma"),col=c("red","blue"),pch=15,cex=.9,horiz=F)

##discriminant analysis##

library(MASS)
dat.loads <- dat.pca$x[,1:3]
dat4 <- as.data.frame(dat.loads)
dat4 <- data.frame(ann2, dat4)
#train model
dat.lda <- lda(dat4[,1]~., dat4[,2:4])
#predict
dat.pred <- predict(dat.lda, dat4[,2:4])
#confusion table
table(dat.pred$class, dat4[,1])
# head and neck squamous carcinoma normal
#head and neck squamous carcinoma 210
#normal1 22
#plot of 3 components and with predicted versus actual memberships
par(mfrow=c(2,2))
plot(range(dat.loadings[,1]),range(dat.loadings[,2]),type="n",xlab='p1',ylab='p2',main='PCA plot of Normal/Carcinoma Data\np1 vs. p2')
points(dat.loadings[,1][dat.pred$class=='normal'], dat.loadings[,2][dat.pred$class=='normal'],col='red',bg='red',pch="o",cex=1.5)
points(dat.loadings[,1][dat.pred$class=='head and neck squamous carcinoma'], dat.loadings[,2][dat.pred$class=='head and neck squamous carcinoma'],col='blue',bg='blue',pch="o",cex=1.5)
points(dat.loadings[,1][normal], dat.loadings[,2][normal],pch="*",cex=1)
legend('topleft',c("predicted Normal","predicted Carcinoma"), pch="o", col=c('red','blue'), cex=0.9, horiz=F)
legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "o"), col='black', cex=0.9, horiz=F)
legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "*"), col='black', cex=0.9, horiz=F)

plot(range(dat.loadings[,1]),range(dat.loadings[,3]),type="n",xlab='p1',ylab='p3',main='PCA plot of Normal/Carcinoma Data\np1 vs. p3')
points(dat.loadings[,1][dat.pred$class=='normal'], dat.loadings[,3][dat.pred$class=='normal'],col='red',bg='red',pch="o",cex=1.5)
points(dat.loadings[,1][dat.pred$class=='head and neck squamous carcinoma'], dat.loadings[,3][dat.pred$class=='head and neck squamous carcinoma'],col='blue',bg='blue',pch="o",cex=1.5)
points(dat.loadings[,1][normal], dat.loadings[,3][normal],pch="*",cex=1)
legend('topleft',c("predicted Normal","predicted Carcinoma"), pch="o", col=c('red','blue'), cex=0.9, horiz=F)
legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "o"), col='black', cex=0.9, horiz=F)
legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "*"), col='black', cex=0.9, horiz=F)

plot(range(dat.loadings[,2]),range(dat.loadings[,3]),type="n",xlab='p2',ylab='p3',main='PCA plot of Normal/Carcinoma Data\np2 vs. p3')
points(dat.loadings[,2][dat.pred$class=='normal'], dat.loadings[,3][dat.pred$class=='normal'],col='red',bg='red',pch="o",cex=1.5)
points(dat.loadings[,2][dat.pred$class=='head and neck squamous carcinoma'], dat.loadings[,3][dat.pred$class=='head and neck squamous carcinoma'],col='blue',bg='blue',pch="o",cex=1.5)
points(dat.loadings[,2][normal], dat.loadings[,3][normal],pch="*",cex=1)
legend('topleft',c("predicted Normal","predicted Carcinoma"), pch="o", col=c('red','blue'), cex=0.9, horiz=F)
legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "o"), col='black', cex=0.9, horiz=F)
legend('bottomright',c("actual Normal","actual Carcinoma"), pch=c("o", "*"), col='black', cex=0.9, horiz=F)

##top 10 discriminant genes##

#p-value instead of fold-change
pv.sorted <- sort(pv, decreasing=FALSE)
names(pv.sorted[1:10])
# [1] "39657_at" "38051_at" "41644_at" "37093_at" "39801_at" "40315_at"
# [7] "32868_at" "988_at" "38394_at" "33839_at"