(Optional) preprocessing of FASTQ files

Before applying DEUS we strongly recommend to trim your input FASTQ files. Trim Galore can be used to do adapter and quality trimming. The --max_length parameter should be set to (read length - 1) to remove non-small RNA sequences.

# remove adapters (automatically recognized by Trim Galore)
/software/trim_galore_v0.x.x/trim_galore -q 0 --length 16 --max_length 49 $in_file -o $out_dir
# quality trimming (use dummy adapter to suppress adapter trimming)
/software/trim_galore_v0.x.x/trim_galore -q 20 --length 1 -a CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC --stringency 50 $in_file -o $out_dir

Installing DEUS

You can install DEUS from GitHub with:

if (!require("devtools")) install.packages("devtools", repos='http://cran.us.r-project.org')
devtools::install_github("timjeske/DEUS")
library(DEUS)

Alternatively, you can install DEUS from a local copy.

First, you need to unzip it:

unzip DEUS-master.zip

Then you can install it in R:

if (!require("devtools")) install.packages("devtools", repos='http://cran.us.r-project.org')
devtools::install_local("./DEUS-master")
library(DEUS)

Setting parameters

Before running the individual steps of the DEUS pipeline it is helpful to set some required parameters. The in_dir is the folder where the (trimmed) FASTQ files are stored. The out_dir is the folder where plots of differential expression analysis (DEA), clustering and summary output files will be stored. The phenofile is the condition file for DEA (examplary condition file).

in_dir <- system.file("extdata", package = "DEUS")
out_dir <- tempdir()
phenofile <- system.file("extdata", "condition.tsv", package = "DEUS")
pheno_info <- read.table(phenofile, header=T, row.names=1, check.names=FALSE)
pheno_info
##                      sample condition
## cond1_s1 cond1_s1.fwd.fq.gz     cond1
## cond1_s2 cond1_s2.fwd.fq.gz     cond1
## cond1_s3 cond1_s3.fwd.fq.gz     cond1
## cond1_s4 cond1_s4.fwd.fq.gz     cond1
## cond1_s5 cond1_s5.fwd.fq.gz     cond1
## cond2_s1 cond2_s1.fwd.fq.gz     cond2
## cond2_s2 cond2_s2.fwd.fq.gz     cond2
## cond2_s3 cond2_s3.fwd.fq.gz     cond2
## cond2_s4 cond2_s4.fwd.fq.gz     cond2
## cond2_s5 cond2_s5.fwd.fq.gz     cond2

Counting unique sequences

To create the count table for differential expression analysis, createCountTableFromFastQs counts all unique sequences in each FASTQ file given in your pheno_info. As an optional step, we only keep sequences with an average sequence count larger than 1 to remove low expressed sequences and reduce the size of the count table.

count_table <- createCountTableFromFastQs(in_dir, pheno_info=pheno_info)
count_table_filtered <- count_table[rowMeans(count_table)>1,]
head(count_table_filtered, n=2)
##                           cond1_s1 cond1_s2 cond1_s3 cond1_s4 cond1_s5
## TCTCAGCCTTTTGGCTAAGATCAAG       19       19       21       10       18
## CTCAGCCTTTTGGCTAAGATCAAGT       16       19       13       21       17
##                           cond2_s1 cond2_s2 cond2_s3 cond2_s4 cond2_s5
## TCTCAGCCTTTTGGCTAAGATCAAG       13       10       15       22       10
## CTCAGCCTTTTGGCTAAGATCAAGT       12        6       16       19       10

Create identifiers for each sequence

A map is created by createMap which assigns a unique identifier to each sequence.

map <- createMap(count_table_filtered)
head(map, n=2)
##                           SequenceID
## TCTCAGCCTTTTGGCTAAGATCAAG      seq_1
## CTCAGCCTTTTGGCTAAGATCAAGT      seq_2

Running differential expression analysis

Differential expression analysis is done by calling the function runDESeq2. Results are filtered using a 0.05 Pvalue cutoff.

design <- ~ condition
de_results <- runDESeq2(count_table_filtered, pheno_info, design, out_dir=out_dir)
sig_results <- de_results$de_result
sig_results <- sig_results[!is.na(sig_results$IHWPvalue) & sig_results$IHWPvalue < 0.05,]
head(sig_results, n=2)
##                                 Pvalue       padj baseMean Log2FoldChange
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.0007176335 0.03462413 7.263225      -1.521815
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.0026808638 0.05501307 6.594336      -1.431923
##                               lfcSE      stat  IHWPvalue
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.4498748 -3.382752 0.04498802
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.4769671 -3.002143 0.04498802

Running BLAST

The significant sequences are subsequently stored in a FASTA file for BLAST annotation using sequencesAsFasta. BLAST is executed via runBlast. It is necessary to set the BLAST binary, your BLAST database and the number of threads to be used.

blast_exec <- "/your/path/to/ncbi-blast-x.x.x/bin/blastn"
blast_db <-system.file("extdata", "blastdb/DASHR_subset.fa", package = "DEUS")
blast_ncores <- 2
sig_seq_fasta <- sequencesAsFasta(sig_results,map)
blast_result <- runBlast(blast_exec, blast_db, blast_ncores, sig_seq_fasta) 

In our example, we load pre-compiled data.

blast_int <- system.file("extdata", "results/blast_result_sig_sequences.tsv", package="DEUS")
blast_result <- read.table(blast_int, header=T, sep="\t")
head(blast_result)
##   SequenceID Annotation Length BlastEvalue
## 1      seq_6   U2-L1036     25    8.23e-09
## 2      seq_6    U2-L996     25    8.23e-09
## 3      seq_6    U2-L968     25    8.23e-09
## 4      seq_6    U2-L946     25    8.23e-09
## 5      seq_6    U2-L930     25    8.23e-09
## 6      seq_6    U2-L929     25    8.23e-09

Compute counts per condition

The results of runDESeq2 are used as input for getConditionCountStats to compute the mean and the standard deviation of the normalized counts for each analyzed condition.

count_stats <- getConditionCountStats(de_results$norm_counts, pheno_info)
head(count_stats, n=2)
##                           NormCounts_cond1_Mean NormCounts_cond1_Sd
## TCTCAGCCTTTTGGCTAAGATCAAG              16.86876            4.147471
## CTCAGCCTTTTGGCTAAGATCAAGT              16.67663            2.914544
##                           NormCounts_cond2_Mean NormCounts_cond2_Sd
## TCTCAGCCTTTTGGCTAAGATCAAG              13.54612            4.701311
## CTCAGCCTTTTGGCTAAGATCAAGT              12.20912            4.944475

Run clustering

The CD-Hit algorithm is executed via runClustering. The function requires the cd-hit-est binary.

cd_hit <- "/your/path/to/cdhit/cd-hit-est"
sig_seq_fasta <- sequencesAsFasta(sig_results,map)
clust_result <- runClustering(cd_hit, sig_seq_fasta, out_dir, identity_cutoff=0.9, length_cutoff=0.9, wordlength=9, map)

For this manual, we will load available example data again.

clust_int <- system.file("extdata", "results/clust_result_sig_sequences.tsv", package="DEUS")
clust_result <- read.table(clust_int, header=T, sep="\t")
rownames(clust_result) <- clust_result$SequenceID
head(clust_result)
##           SequenceID ClusterID
## seq_1000    seq_1000       170
## seq_10011  seq_10011       632
## seq_1002    seq_1002       171
## seq_10023  seq_10023       633
## seq_10034  seq_10034       564
## seq_10036  seq_10036       634

Merging the results

Finally, the group-wise mean and standard deviation of the normalized counts, the results of DEA, BLASTing and clustering are merged together by mergeResults. Additionally, addCountsOfFeatureClasses is used to count the number of BLAST hits grouped by user defined feature classes. Summary files are then written to the out_dir by writeSummaryFiles.

classes <- c("tRNA","[Hh]sa","^U")
summary <- mergeResults(sig_results, count_stats, blast_result, clust_result, map)
summary <- addCountsOfFeatureClasses(summary, classes)
writeSummaryFiles(summary, out_dir)
head(summary, n=2)
##                           SequenceID Log2FoldChange       Pvalue
## TGCTCACTTTGGCAGCACATATACT  seq_12936       4.575022 3.738245e-07
## CTTCTCAGTCTTTTGGCTAAGATCA  seq_13006       4.446684 9.264864e-07
##                             IHWPvalue NormCounts_cond1_Mean
## TGCTCACTTTGGCAGCACATATACT 0.001663810                     0
## CTTCTCAGTCTTTTGGCTAAGATCA 0.002061792                     0
##                           NormCounts_cond1_Sd NormCounts_cond2_Mean
## TGCTCACTTTGGCAGCACATATACT                   0              6.599719
## CTTCTCAGTCTTTTGGCTAAGATCA                   0              6.000207
##                           NormCounts_cond2_Sd ClusterID Length BlastEvalue
## TGCTCACTTTGGCAGCACATATACT            1.821142       318     25    8.23e-09
## CTTCTCAGTCTTTTGGCTAAGATCA            1.707972       721     25    8.23e-09
##                           tRNA [Hh]sa ^U Other
## TGCTCACTTTGGCAGCACATATACT    0      0 26     0
## CTTCTCAGTCTTTTGGCTAAGATCA    0      0 17     0
##                                                                                                                                                                                                                                           FeatureList
## TGCTCACTTTGGCAGCACATATACT U6-L1687,U6-L1676,U6-L1561,U6-L1528,U6-L1493,U6-L1482,U6-L1306,U6-L1258,U6-L1248,U6-L1121,U6-L1110,U6-L1106,U6-L1093,U6-L1025,U6-L882,U6-L843,U6-L814,U6-L746,U6-L690,U6-L583,U6-L542,U6-L484,U6-L403,U6-L306,U6-L76,U6-L49
## CTTCTCAGTCTTTTGGCTAAGATCA                                                                                     U2-L1014,U2-L999,U2-L990,U2-L988,U2-L987,U2-L986,U2-L985,U2-L983,U2-L883,U2-L826,U2-L750,U2-L594,U2-L584,U2-L332,U2-L164,U2-L117,U2-L90

Computing the NA fraction

The function getNoBlastHitFraction can be used to compute the fraction of reads without annotation for each sample. This fraction corresponds to the fraction of reads that are likely to be ignored in mapping approaches because they map to regions that are not annotated.

getNoBlastHitFraction(summary, de_results$norm_counts)
##          NA_fraction
## cond1_s1 0.016237479
## cond1_s2 0.017400664
## cond1_s3 0.017287479
## cond1_s4 0.015625000
## cond1_s5 0.017318969
## cond2_s1 0.005076142
## cond2_s2 0.006242367
## cond2_s3 0.006804610
## cond2_s4 0.005953174
## cond2_s5 0.006856780

Extended expression analysis using sequence clusters

In addition to the default DEUS pipeline, we developed an adjusted approach that aggregates sequence counts for each sequence cluster and uses this information to find differentially expressed sequence clusters. By some small adjustments, the cluster expression analysis can be included into the default DEUS pipeline.

Instead of creating the sequence map for differentially expressed sequences only, it will be created on the raw input sequences since all sequences will be used for clustering.

map <- createMap(count_table)

Afterwards, all sequences can be clustered. Based on the clustering results, the sum of sequence counts will be created for each cluster via mergeAndAggregate.

cd_hit <- "/your/path/to/cdhit/cd-hit-est"
all_seq_fasta <- sequencesAsFasta(count_table,map)
clust_result <- runClustering(cd_hit, all_seq_fasta, out_dir, identity_cutoff=0.9, length_cutoff=0.9, wordlength=9, map)
cl_counts <- mergeAndAggregate(map, count_table, clust_result)

For this manual, we will load the provided clustering results.

clust_int <- system.file("extdata", "results/clust_result_clust_sequences.tsv", package="DEUS")
clust_result <- read.table(clust_int, header=T, sep="\t")
rownames(clust_result) <- clust_result$SequenceID
cl_counts <- mergeAndAggregate(map, count_table, clust_result)
head(cl_counts, n=2)
##   cond1_s1 cond1_s2 cond1_s3 cond1_s4 cond1_s5 cond2_s1 cond2_s2 cond2_s3
## 0      142      119      152      146      139      148      112      150
## 1       91       96       75       84       85      120      129      108
##   cond2_s4 cond2_s5
## 0      167      136
## 1      128       96

The resulting count matrix is then used as input for the DE analysis.

design <- ~ condition
cl_de_results <- runDESeq2(cl_counts, pheno_info, design, out_dir = out_dir, prefix = "Cluster")
cl_sig_results <- cl_de_results$de_result
head(cl_sig_results, n=2)
##        Pvalue         padj  baseMean Log2FoldChange     lfcSE      stat
## 0 0.563340354 0.8110233879 133.23181     0.06532971 0.1130493 0.5778871
## 1 0.000151244 0.0006688245  95.71573     0.46504625 0.1227353 3.7890179
##     IHWPvalue
## 0 1.000000000
## 1 0.001355991

Subsequently, the existing DEA results based on individual sequence counts and the generated DEA results based on sequence clusters can be merged into a single data frame via mergeSingleAndClusterResults. Based on the individual and cluster Pvalues, the resulting data frame is filtered to detect sequences that are differentially expressed based on their sequence count or part of a differentially expressed sequence cluster.

sig_results <- mergeSingleAndClusterResults(cl_sig_results,clust_result,sig_results,map)
sig_results <- sig_results[((!is.na(sig_results$IHWPval) & sig_results$IHWPval < 0.05) | (!is.na(sig_results$Cl_IHWPval) & sig_results$Cl_IHWPval < 0.05)),]
head(sig_results, n=2)
##                                 Pvalue       padj baseMean Log2FoldChange
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.0007176335 0.03462413 7.263225      -1.521815
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.0026808638 0.05501307 6.594336      -1.431923
##                               lfcSE      stat  IHWPvalue SequenceID
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.4498748 -3.382752 0.04498802      seq_6
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.4769671 -3.002143 0.04498802     seq_15
##                           Cl_Pvalue   Cl_padj Cl_baseMean
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.2181573 0.4118952    56.53792
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.2181573 0.4118952    56.53792
##                           Cl_Log2FoldChange Cl_lfcSE  Cl_stat Cl_IHWPvalue
## TCGCTTCTCAGCCTTTTGGCTAAGA          0.201767 0.163846 1.231443    0.6115901
## CGCTTCTCAGCCTTTTGGCTAAGAT          0.201767 0.163846 1.231443    0.6115901
##                           ClusterID
## TCGCTTCTCAGCCTTTTGGCTAAGA         2
## CGCTTCTCAGCCTTTTGGCTAAGAT         2

Next, we run BLAST on all selected sequences.

blast_exec <- "/your/path/to/ncbi-blast-x.x.x/bin/blastn"
blast_db <-system.file("extdata", "blastdb/DASHR_subset.fa", package = "DEUS")
blast_ncores <- 2
sig_seq_fasta <- sequencesAsFasta(sig_results,map)
blast_result <- runBlast(blast_exec, blast_db, blast_ncores, sig_seq_fasta) 

In our example, we load pre-compiled data.

blast_int <- system.file("extdata", "results/blast_result_clust_sequences.tsv", package="DEUS")
blast_result <- read.table(blast_int, header=T, sep="\t")
head(blast_result)
##   SequenceID Annotation Length BlastEvalue
## 1      seq_5   U2-L1076     25    8.23e-09
## 2      seq_5   U2-L1072     25    8.23e-09
## 3      seq_5   U2-L1020     25    8.23e-09
## 4      seq_5   U2-L1019     25    8.23e-09
## 5      seq_5    U2-L974     25    8.23e-09
## 6      seq_5    U2-L973     25    8.23e-09

Finally, the summary table is generated again.

# merge results
classes <- c("tRNA","[Hh]sa","^U")
summary <- mergeResults(sig_results, count_stats, blast_result, clust_result, map)
summary <- addCountsOfFeatureClasses(summary, classes)
writeSummaryFiles(summary, out_dir)
head(summary, n=2)
##                           SequenceID Log2FoldChange       Pvalue
## TGCTCACTTTGGCAGCACATATACT  seq_37001       4.575022 3.738245e-07
## CTTCTCAGTCTTTTGGCTAAGATCA  seq_37090       4.446684 9.264864e-07
##                             IHWPvalue Cl_Log2FoldChange    Cl_Pvalue
## TGCTCACTTTGGCAGCACATATACT 0.001663810          2.079899 3.094328e-13
## CTTCTCAGTCTTTTGGCTAAGATCA 0.002061792          3.820641 8.413101e-05
##                           Cl_IHWPvalue NormCounts_cond1_Mean
## TGCTCACTTTGGCAGCACATATACT 4.206725e-09                     0
## CTTCTCAGTCTTTTGGCTAAGATCA 2.063389e-03                     0
##                           NormCounts_cond1_Sd NormCounts_cond2_Mean
## TGCTCACTTTGGCAGCACATATACT                   0              6.599719
## CTTCTCAGTCTTTTGGCTAAGATCA                   0              6.000207
##                           NormCounts_cond2_Sd ClusterID Length BlastEvalue
## TGCTCACTTTGGCAGCACATATACT            1.821142      2493     25    8.23e-09
## CTTCTCAGTCTTTTGGCTAAGATCA            1.707972      7520     25    8.23e-09
##                           tRNA [Hh]sa ^U Other
## TGCTCACTTTGGCAGCACATATACT    0      0 26     0
## CTTCTCAGTCTTTTGGCTAAGATCA    0      0 17     0
##                                                                                                                                                                                                                                           FeatureList
## TGCTCACTTTGGCAGCACATATACT U6-L1687,U6-L1676,U6-L1561,U6-L1528,U6-L1493,U6-L1482,U6-L1306,U6-L1258,U6-L1248,U6-L1121,U6-L1110,U6-L1106,U6-L1093,U6-L1025,U6-L882,U6-L843,U6-L814,U6-L746,U6-L690,U6-L583,U6-L542,U6-L484,U6-L403,U6-L306,U6-L76,U6-L49
## CTTCTCAGTCTTTTGGCTAAGATCA                                                                                     U2-L1014,U2-L999,U2-L990,U2-L988,U2-L987,U2-L986,U2-L985,U2-L983,U2-L883,U2-L826,U2-L750,U2-L594,U2-L584,U2-L332,U2-L164,U2-L117,U2-L90

In order to compare both aproaches, the function printClusterSummary can be used to get detailed information about the number of significant sequences and clusters.

## ## Sequence summary ##
## Significant sequences: 1906
## Significant sequences not clustered: 0
## Sequences in significant clusters: 22110
## Non-significant sequences in significant clusters: 20369
## - too low expressed: 20369
## - too weak differentially expressed: 0
## Significant sequences in non-significant clusters: 165
## ## Cluster summary ##
## Significant clusters: 4628
## Clusters including significant sequences: 1323
## Significant clusters including no significant sequences: 3429