Club Bio-Info’ 14/05/2018
A) Package Installation
source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2")
B) Loading
library("DESeq2")
C) Formatting the descriptive data.frame …
… from the count files (HTSeq)
The data.frame needs to have the columns “sampleName”, “fileName” and then one or more columns of characteristics.
directory <- "HTSeq/"
sampleFiles <- grep("*txt",list.files(directory),value=TRUE)
sampleName <- sub("*_HTSeq_counts.txt","",sampleFiles)
Here, you need one caharcteristics’ column: “condition”. We can do it with regular expressions.
sampleCondition <- as.character(sampleName)
for (i in 1:length(sampleCondition)){
sampleCondition[i] <- gsub("*_B*[[:digit:]]+", "", sampleCondition[i])
}
Or, by writing the vector and transforming it into a ‘factor’ type:
sampleCondition <- c("Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "DM1",
"DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1")
sampleCondition <- as.factor(sampleCondition)
We write the data.frame “sampleTable” and we load it into the variable ‘ddsInput’
sampleTable <- data.frame(sampleName = sampleName,
fileName = sampleFiles,
condition = sampleCondition)
sampleTable
str(sampleTable)
ddsInput <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = directory,
design = ~ condition)
… from a matrix of counts (rawcounts)
cts <- as.matrix(read.csv("readscounts_EWang_Tibialis_GRCH37-87_raw.csv",sep=",",row.names="gene_id"))
coldata <- as.data.frame(matrix(nc=1, nr=20))
colnames(coldata) <- "condition"
rownames(coldata) <- colnames(cts)
sampleCondition <- c("Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "DM1",
"DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1")
coldata$condition <- as.factor(sampleCondition)
we load it into the variable ‘ddsInput’
ddsInput <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design = ~ condition)
D) Launch of DESeq2
dds <- DESeq(ddsInput)
Tests of log2 fold change above or below a threshold > https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#tests-of-log2-fold-change-above-or-below-a-threshold
resGA <- results(dds, contrast=c("condition","DM1","Ctrl"), lfcThreshold=.4, altHypothesis="greaterAbs")
table(resGA$padj < 0.05)
E) Verification of the model: DEseq2 images
library(RColorBrewer)
library(gplots)
library(pheatmap)
Data transformation for visualization
rld <- rlogTransformation(dds, blind=TRUE)
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
PDF generating
pdf("plots.pdf")
ylim <- c(-10,10)
drawLines <- function() abline(h=c(-.4,.4),col="dodgerblue",lwd=2)
plotMA(resGA, ylim=ylim); drawLines()
plotPCA(rld, intgroup="condition")
plotDispEsts(dds)
sampleDists <- dist( t( assay(rld) ) )
sampleDistMatrix <- as.matrix( sampleDists )
colours = colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
heatmap.2( sampleDistMatrix, trace="none", col=colours)
plot(resGA$baseMean+1, -log10(resGA$pvalue),
log="x", xlab="mean of normalized counts",
ylab=expression(-log[10](pvalue)),
ylim=c(0,30),
cex=.4, col=rgb(0,0,0,.3))
use <- resGA$baseMean > 10
table(use)
h1 <- hist(resGA$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(resGA$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")
barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
col = colori, space = 0, main = "", ylab="frequency")
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)),
adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))
dev.off()
F) Refinement of the model
The 2nd main component is not explained.
Patients data
- Searching in the Chr Y Specific Genes Count Table
Y gene 2654896 2655740 0 - 0 gene_id “ENSG00000184895” gene_name “SRY”
Y gene 2803112 2850547 0 + 0 gene_id “ENSG00000067646” gene_name “ZFY”
Y gene 7142013 7249589 0 + 0 gene_id “ENSG00000099725” gene_name “PRKY”
1° Raw & normalized counts tables
counts_raw <- counts(dds, norm=F)
counts_norm <- counts(dds, norm=T)
write.csv(counts_raw,file="readscounts_raw.csv")
write.csv(counts_norm,file="readscounts_norm.csv")
2° Search for SRY, ZFY & PRKY genes
data.Y <- data.frame(SRY=counts_norm[which(rownames(counts_norm) %in% "ENSG00000184895"),],
ZFY=counts_norm[which(rownames(counts_norm) %in% "ENSG00000067646"),], PRKY=counts_norm[which(rownames(counts_norm)
%in% "ENSG00000099725"),])
data.Y
sexe <- as.factor(c("F", "M", "F", "M", "F", "M", "F", "M", "M", "F", "M", "F", "F", "F", "F", "M", "M", "F",
"F", "F"))
3° Modification of the descriptive data.frame
coldata2 <- as.data.frame(matrix(nc=2, nr=20))
colnames(coldata2) <- c("condition","sexe")
rownames(coldata2) <- rownames(coldata)
coldata2$condition <- coldata$condition
coldata2$sexe <- sexe
coldata2
str(coldata2)
ddsInput <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata2,
design = ~ sexe + condition)
dds <- DESeq(ddsInput)
resGA <- results(dds, contrast=c("condition","DM1","Ctrl"), lfcThreshold=.4, altHypothesis="greaterAbs")
table(resGA$padj < 0.05)
4° Check with PCA & dispersion estimation plots
rld <- rlogTransformation(dds, blind=TRUE)
pdf("plots_v2.pdf")
plotPCA(rld, intgroup=c("condition", "sexe"))
plotDispEsts(dds)
dev.off()
G) Writing outputs files
head(resGA)
write.csv2(resGA, "DM1_vs_Ctrl_ClubBioInfo.csv")
resGA_padj05 <- resGA[which(resGA$padj < 0.05 ),]
write.csv2(resGA_padj05, "DM1_vs_Ctrl_ClubBioInfo_sig.csv")