RNA-seq analysis in R

library(edgeR)
library(limma)
library(Glimma) 
library(gplots)
library(org.Mm.eg.db)
library(RColorBrewer)

# Preprocessing of expression data
# dataset: https://figshare.com/s/1d788fd384d33e913a2a
setwd("~/data/")

# load data
sampleinfo <- read.delim("SampleInfo.txt")
seqdata <- read.delim("GSE60450_Lactation-GenewiseCounts.txt", stringsAsFactors = FALSE)
head(seqdata)
# format data
# Remove first two columns from seqdata
exp <- seqdata[,-(1:2)]

# Store EntrezGeneID as rownames
rownames(exp) <- seqdata[,1]
head(exp)

# make shorter sample names
colnames(exp) <- substr(colnames(exp), start=1, stop=7)
head(exp)

# Counts per Million (CPM)
myCPM <- edgeR::cpm(exp)
# Have a look at the output
head(myCPM)

# TRUE if CPM > 0.5
thresh <- myCPM > 0.5
# logical matrix
head(thresh)

# How many genes with CPM > 0.5
table(rowSums(thresh))

# keep genes that have at least 2 TRUES in each row of thresh
fltr <- rowSums(thresh) >= 2
# Subset the rows of countdata to keep the more highly expressed genes
exp.fltr <- exp[fltr,]
summary(exp.fltr)
dim(exp.fltr)

# We will look at the first sample
plot(myCPM[,1],countdata[,1])
# Let us limit the x and y-axis so we can actually look to see what is happening at the smaller counts
plot(myCPM[,1],countdata[,1],ylim=c(0,50),xlim=c(0,3))
# Add a vertical line at 0.5 CPM
abline(v=0.5)

### Digital Gene Expression List from edgeR
# Convert counts to DGEList object
dgeObj <- edge::DGEList(exp.fltr)
# have a look at dgeObj
dgeObj
# See what slots are stored in dgeObj
names(dgeObj)
# Library size information is stored in the samples slot
dgeObj$samples

# Library sizes and distribution plot
dgeObj$samples$lib.size


# The names argument tells the barplot to use the sample names on the x-axis
# The las argument rotates the axis names
barplot(dgeObj$samples$lib.size, names=colnames(dgeObj), las=2, main="Barplot of library sizes")

# Get log2 counts per million
logcounts <- cpm(dgeObj,log=TRUE)
# Check distributions of samples using boxplots
boxplot(logcounts, xlab="", ylab="Log2 counts per million", las=2,
        main = "Boxplots of logCPMs (unnormalised)")
# Let's add a blue horizontal line that corresponds to the median logCPM
abline(h=median(logcounts),col="blue")

plotMDS(dgeObj)
# We specify the option to let us plot two plots side-by-sde
par(mfrow=c(1,2))
# Let's set up colour schemes for CellType
# How many cell types and in what order are they stored?
levels(sampleinfo$CellType)
## Let's choose purple for basal and orange for luminal
col.cell <- c("purple","orange")[sampleinfo$CellType]
data.frame(sampleinfo$CellType, col.cell)

# Redo the MDS with cell type colouring
plotMDS(dgeObj,col=col.cell)
# Let's add a legend to the plot so we know which colours correspond to which cell type
legend("topleft", fill=c("purple","orange"), legend=levels(sampleinfo$CellType))
# Add a title
title("Cell type")

# Similarly for status
levels(sampleinfo$Status)
col.status <- c("blue","red","dark green")[sampleinfo$Status]
col.status
plotMDS(dgeObj, col=col.status)
legend("topleft", fill=c("blue","red","dark green"),
       legend=levels(sampleinfo$Status), cex=0.8)
title("Status")

# There is a sample info corrected file in your data directory
# Old sampleinfo
sampleinfo
# I'm going to write over the sampleinfo object with the corrected sample info
sampleinfo <- read.delim("data/SampleInfo_Corrected.txt")
sampleinfo

# Redo the MDSplot with corrected information
par(mfrow=c(1,2))
col.cell <- c("purple","orange")[sampleinfo$CellType]
col.status <- c("blue","red","dark green")[sampleinfo$Status]


plotMDS(dgeObj,col=col.cell)
legend("topleft",fill=c("purple","orange"),
       legend=levels(sampleinfo$CellType))
title("Cell type")


plotMDS(dgeObj,col=col.status)
legend("topleft",fill=c("blue","red","dark green"), 
       legend=levels(sampleinfo$Status), cex=0.8)
title("Status")

# Dimension 3 appears to separate pregnant samples from the rest. Dim4?
plotMDS(dgeObj,dim=c(3,4))

labels <- paste(sampleinfo$SampleName, sampleinfo$CellType, sampleinfo$Status)
group <- paste(sampleinfo$CellType,sampleinfo$Status,sep=".")
group <- factor(group)
glMDSPlot(dgeObj, labels=labels, groups=group, folder="mds")

# Hierarchical clustering with heatmaps

# We estimate the variance for each row in the logcounts matrix
var_genes <- apply(logcounts, 1, var)
head(var_genes)
# Get the gene names for the top 500 most variable genes
select_var <- names(sort(var_genes, decreasing=TRUE))[1:500]
head(select_var)
# Subset logcounts matrix
highly_variable_lcpm <- logcounts[select_var,]
dim(highly_variable_lcpm)
head(highly_variable_lcpm)

## Get some nicer colours
mypalette <- brewer.pal(11,"RdYlBu")
morecols <- colorRampPalette(mypalette)
# Set up colour vector for celltype variable
col.cell <- c("purple","orange")[sampleinfo$CellType]

# Plot the heatmap
heatmap.2(highly_variable_lcpm, 
          col=rev(morecols(50)),
          trace="column", 
          main="Top 500 most variable genes across samples",
          ColSideColors=col.cell,scale="row")

# Save the heatmap
png(file="High_var_genes.heatmap.png")
heatmap.2(highly_variable_lcpm,col=rev(morecols(50)), trace="none",
          main="Top 500 most variable genes\nacross samples", 
          ColSideColors=col.cell,scale="row")
dev.off()

# Normalization for composition bias
# Apply normalisation to DGEList object
dgeObj <- calcNormFactors(dgeObj)

dgeObj$samples

par(mfrow=c(1,2))
plotMD(logcounts,column = 7)
abline(h=0,col="grey")
plotMD(logcounts,column = 11)
abline(h=0,col="grey")

par(mfrow=c(1,2))
plotMD(dgeObj,column = 7)
abline(h=0,col="grey")

plotMD(dgeObj,column = 11)
abline(h=0,col="grey")

save(group,dgeObj,sampleinfo,file="Robjects/preprocessing.Rdata")
Avatar
Mark Goldberg
Researcher

My research interests include epigenetics and computational biology.

Related

comments powered by Disqus