Club Bio-Info’ 14/05/2018

A) Package Installation

  source("https://bioconductor.org/biocLite.R")
    biocLite("DESeq2")

B) Loading

  library("DESeq2")

Vignette

C) Formatting the descriptive data.frame …

… from the count files (HTSeq)

The data.frame needs to have the columns “sampleName”, “fileName” and then one or more columns of characteristics.

  directory <- "HTSeq/"

    sampleFiles <- grep("*txt",list.files(directory),value=TRUE)
    sampleName <- sub("*_HTSeq_counts.txt","",sampleFiles)

Here, you need one caharcteristics’ column: “condition”. We can do it with regular expressions.

  sampleCondition <- as.character(sampleName)
    for (i in 1:length(sampleCondition)){
    sampleCondition[i] <- gsub("*_B*[[:digit:]]+", "", sampleCondition[i])
    }

Or, by writing the vector and transforming it into a ‘factor’ type:

  sampleCondition <- c("Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "DM1",
    "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1")
    sampleCondition <- as.factor(sampleCondition)

We write the data.frame “sampleTable” and we load it into the variable ‘ddsInput’

  sampleTable <- data.frame(sampleName = sampleName,
    fileName = sampleFiles,
    condition = sampleCondition)
    sampleTable
    str(sampleTable)

    ddsInput <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
    directory = directory,
    design = ~ condition)

… from a matrix of counts (rawcounts)

  cts <- as.matrix(read.csv("readscounts_EWang_Tibialis_GRCH37-87_raw.csv",sep=",",row.names="gene_id"))

    coldata <- as.data.frame(matrix(nc=1, nr=20))
    colnames(coldata) <- "condition"
    rownames(coldata) <- colnames(cts)

    sampleCondition <- c("Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "Ctrl", "DM1",
    "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1", "DM1")
    coldata$condition <- as.factor(sampleCondition)

we load it into the variable ‘ddsInput’

  ddsInput <- DESeqDataSetFromMatrix(countData = cts,
    colData = coldata,
    design = ~ condition)

D) Launch of DESeq2

  dds <- DESeq(ddsInput)

Tests of log2 fold change above or below a threshold > https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#tests-of-log2-fold-change-above-or-below-a-threshold

  resGA <- results(dds, contrast=c("condition","DM1","Ctrl"), lfcThreshold=.4, altHypothesis="greaterAbs")
    table(resGA$padj < 0.05)

E) Verification of the model: DEseq2 images

  library(RColorBrewer)
    library(gplots)
    library(pheatmap)

Data transformation for visualization

  rld <- rlogTransformation(dds, blind=TRUE)
    vsd <- varianceStabilizingTransformation(dds, blind=TRUE)

PDF generating

  pdf("plots.pdf")

    ylim <- c(-10,10)
    drawLines <- function() abline(h=c(-.4,.4),col="dodgerblue",lwd=2)
    plotMA(resGA, ylim=ylim); drawLines()

    plotPCA(rld, intgroup="condition")

    plotDispEsts(dds)

    sampleDists <- dist( t( assay(rld) ) )
    sampleDistMatrix <- as.matrix( sampleDists )
    colours = colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
    heatmap.2( sampleDistMatrix, trace="none", col=colours)

    plot(resGA$baseMean+1, -log10(resGA$pvalue),
    log="x", xlab="mean of normalized counts",
    ylab=expression(-log[10](pvalue)),
    ylim=c(0,30),
    cex=.4, col=rgb(0,0,0,.3))

    use <- resGA$baseMean > 10
    table(use)
    h1 <- hist(resGA$pvalue[!use], breaks=0:50/50, plot=FALSE)
    h2 <- hist(resGA$pvalue[use], breaks=0:50/50, plot=FALSE)
    colori <- c(`do not pass`="khaki", `pass`="powderblue")
    barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
    col = colori, space = 0, main = "", ylab="frequency")
    text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)),
    adj = c(0.5,1.7), xpd=NA)
    legend("topright", fill=rev(colori), legend=rev(names(colori)))


    dev.off()

F) Refinement of the model

The 2nd main component is not explained.

Patients data

Searching in the Chr Y Specific Genes Count Table

Y gene 2654896 2655740 0 - 0 gene_id “ENSG00000184895” gene_name “SRY”

Y gene 2803112 2850547 0 + 0 gene_id “ENSG00000067646” gene_name “ZFY”

Y gene 7142013 7249589 0 + 0 gene_id “ENSG00000099725” gene_name “PRKY”

1° Raw & normalized counts tables

  counts_raw <- counts(dds, norm=F)
    counts_norm <- counts(dds, norm=T)

    write.csv(counts_raw,file="readscounts_raw.csv")
    write.csv(counts_norm,file="readscounts_norm.csv")

2° Search for SRY, ZFY & PRKY genes

  data.Y <- data.frame(SRY=counts_norm[which(rownames(counts_norm) %in% "ENSG00000184895"),],
    ZFY=counts_norm[which(rownames(counts_norm) %in% "ENSG00000067646"),], PRKY=counts_norm[which(rownames(counts_norm)
    %in% "ENSG00000099725"),])

    data.Y

    sexe <- as.factor(c("F", "M", "F", "M", "F", "M", "F", "M", "M", "F", "M", "F", "F", "F", "F", "M", "M", "F",
    "F", "F"))

3° Modification of the descriptive data.frame

  coldata2 <- as.data.frame(matrix(nc=2, nr=20))
    colnames(coldata2) <- c("condition","sexe")
    rownames(coldata2) <- rownames(coldata)
    coldata2$condition <- coldata$condition
    coldata2$sexe <- sexe

    coldata2
    str(coldata2)

    ddsInput <- DESeqDataSetFromMatrix(countData = cts,
    colData = coldata2,
    design = ~ sexe + condition)

    dds <- DESeq(ddsInput)

    resGA <- results(dds, contrast=c("condition","DM1","Ctrl"), lfcThreshold=.4, altHypothesis="greaterAbs")
    table(resGA$padj < 0.05)

4° Check with PCA & dispersion estimation plots

    rld <- rlogTransformation(dds, blind=TRUE)

    pdf("plots_v2.pdf")
    plotPCA(rld, intgroup=c("condition", "sexe"))

    plotDispEsts(dds)

    dev.off()

G) Writing outputs files

  head(resGA)

    write.csv2(resGA, "DM1_vs_Ctrl_ClubBioInfo.csv")

    resGA_padj05 <- resGA[which(resGA$padj < 0.05 ),]
    write.csv2(resGA_padj05, "DM1_vs_Ctrl_ClubBioInfo_sig.csv")