Before applying DEUS we strongly recommend to trim your input FASTQ files. Trim Galore can be used to do adapter and quality trimming. The --max_length
parameter should be set to (read length - 1) to remove non-small RNA sequences.
# remove adapters (automatically recognized by Trim Galore)
/software/trim_galore_v0.x.x/trim_galore -q 0 --length 16 --max_length 49 $in_file -o $out_dir
# quality trimming (use dummy adapter to suppress adapter trimming)
/software/trim_galore_v0.x.x/trim_galore -q 20 --length 1 -a CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC --stringency 50 $in_file -o $out_dir
You can install DEUS from GitHub with:
if (!require("devtools")) install.packages("devtools", repos='http://cran.us.r-project.org')
devtools::install_github("timjeske/DEUS")
library(DEUS)
Alternatively, you can install DEUS from a local copy.
First, you need to unzip it:
unzip DEUS-master.zip
Then you can install it in R:
if (!require("devtools")) install.packages("devtools", repos='http://cran.us.r-project.org')
devtools::install_local("./DEUS-master")
library(DEUS)
Before running the individual steps of the DEUS pipeline it is helpful to set some required parameters. The in_dir
is the folder where the (trimmed) FASTQ files are stored. The out_dir
is the folder where plots of differential expression analysis (DEA), clustering and summary output files will be stored. The phenofile
is the condition file for DEA (examplary condition file).
in_dir <- system.file("extdata", package = "DEUS")
out_dir <- tempdir()
phenofile <- system.file("extdata", "condition.tsv", package = "DEUS")
pheno_info <- read.table(phenofile, header=T, row.names=1, check.names=FALSE)
pheno_info
## sample condition
## cond1_s1 cond1_s1.fwd.fq.gz cond1
## cond1_s2 cond1_s2.fwd.fq.gz cond1
## cond1_s3 cond1_s3.fwd.fq.gz cond1
## cond1_s4 cond1_s4.fwd.fq.gz cond1
## cond1_s5 cond1_s5.fwd.fq.gz cond1
## cond2_s1 cond2_s1.fwd.fq.gz cond2
## cond2_s2 cond2_s2.fwd.fq.gz cond2
## cond2_s3 cond2_s3.fwd.fq.gz cond2
## cond2_s4 cond2_s4.fwd.fq.gz cond2
## cond2_s5 cond2_s5.fwd.fq.gz cond2
To create the count table for differential expression analysis, createCountTableFromFastQs counts all unique sequences in each FASTQ file given in your pheno_info
. As an optional step, we only keep sequences with an average sequence count larger than 1 to remove low expressed sequences and reduce the size of the count table.
count_table <- createCountTableFromFastQs(in_dir, pheno_info=pheno_info)
count_table_filtered <- count_table[rowMeans(count_table)>1,]
head(count_table_filtered, n=2)
## cond1_s1 cond1_s2 cond1_s3 cond1_s4 cond1_s5
## TCTCAGCCTTTTGGCTAAGATCAAG 19 19 21 10 18
## CTCAGCCTTTTGGCTAAGATCAAGT 16 19 13 21 17
## cond2_s1 cond2_s2 cond2_s3 cond2_s4 cond2_s5
## TCTCAGCCTTTTGGCTAAGATCAAG 13 10 15 22 10
## CTCAGCCTTTTGGCTAAGATCAAGT 12 6 16 19 10
A map is created by createMap which assigns a unique identifier to each sequence.
## SequenceID
## TCTCAGCCTTTTGGCTAAGATCAAG seq_1
## CTCAGCCTTTTGGCTAAGATCAAGT seq_2
Differential expression analysis is done by calling the function runDESeq2. Results are filtered using a 0.05 Pvalue cutoff.
design <- ~ condition
de_results <- runDESeq2(count_table_filtered, pheno_info, design, out_dir=out_dir)
sig_results <- de_results$de_result
sig_results <- sig_results[!is.na(sig_results$IHWPvalue) & sig_results$IHWPvalue < 0.05,]
head(sig_results, n=2)
## Pvalue padj baseMean Log2FoldChange
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.0007176335 0.03462413 7.263225 -1.521815
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.0026808638 0.05501307 6.594336 -1.431923
## lfcSE stat IHWPvalue
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.4498748 -3.382752 0.04498802
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.4769671 -3.002143 0.04498802
The significant sequences are subsequently stored in a FASTA file for BLAST annotation using sequencesAsFasta. BLAST is executed via runBlast. It is necessary to set the BLAST binary, your BLAST database and the number of threads to be used.
blast_exec <- "/your/path/to/ncbi-blast-x.x.x/bin/blastn"
blast_db <-system.file("extdata", "blastdb/DASHR_subset.fa", package = "DEUS")
blast_ncores <- 2
sig_seq_fasta <- sequencesAsFasta(sig_results,map)
blast_result <- runBlast(blast_exec, blast_db, blast_ncores, sig_seq_fasta)
In our example, we load pre-compiled data.
blast_int <- system.file("extdata", "results/blast_result_sig_sequences.tsv", package="DEUS")
blast_result <- read.table(blast_int, header=T, sep="\t")
head(blast_result)
## SequenceID Annotation Length BlastEvalue
## 1 seq_6 U2-L1036 25 8.23e-09
## 2 seq_6 U2-L996 25 8.23e-09
## 3 seq_6 U2-L968 25 8.23e-09
## 4 seq_6 U2-L946 25 8.23e-09
## 5 seq_6 U2-L930 25 8.23e-09
## 6 seq_6 U2-L929 25 8.23e-09
The results of runDESeq2 are used as input for getConditionCountStats to compute the mean and the standard deviation of the normalized counts for each analyzed condition.
count_stats <- getConditionCountStats(de_results$norm_counts, pheno_info)
head(count_stats, n=2)
## NormCounts_cond1_Mean NormCounts_cond1_Sd
## TCTCAGCCTTTTGGCTAAGATCAAG 16.86876 4.147471
## CTCAGCCTTTTGGCTAAGATCAAGT 16.67663 2.914544
## NormCounts_cond2_Mean NormCounts_cond2_Sd
## TCTCAGCCTTTTGGCTAAGATCAAG 13.54612 4.701311
## CTCAGCCTTTTGGCTAAGATCAAGT 12.20912 4.944475
The CD-Hit algorithm is executed via runClustering. The function requires the cd-hit-est binary.
cd_hit <- "/your/path/to/cdhit/cd-hit-est"
sig_seq_fasta <- sequencesAsFasta(sig_results,map)
clust_result <- runClustering(cd_hit, sig_seq_fasta, out_dir, identity_cutoff=0.9, length_cutoff=0.9, wordlength=9, map)
For this manual, we will load available example data again.
clust_int <- system.file("extdata", "results/clust_result_sig_sequences.tsv", package="DEUS")
clust_result <- read.table(clust_int, header=T, sep="\t")
rownames(clust_result) <- clust_result$SequenceID
head(clust_result)
## SequenceID ClusterID
## seq_1000 seq_1000 170
## seq_10011 seq_10011 632
## seq_1002 seq_1002 171
## seq_10023 seq_10023 633
## seq_10034 seq_10034 564
## seq_10036 seq_10036 634
Finally, the group-wise mean and standard deviation of the normalized counts, the results of DEA, BLASTing and clustering are merged together by mergeResults. Additionally, addCountsOfFeatureClasses is used to count the number of BLAST hits grouped by user defined feature classes. Summary files are then written to the out_dir
by writeSummaryFiles.
classes <- c("tRNA","[Hh]sa","^U")
summary <- mergeResults(sig_results, count_stats, blast_result, clust_result, map)
summary <- addCountsOfFeatureClasses(summary, classes)
writeSummaryFiles(summary, out_dir)
head(summary, n=2)
## SequenceID Log2FoldChange Pvalue
## TGCTCACTTTGGCAGCACATATACT seq_12936 4.575022 3.738245e-07
## CTTCTCAGTCTTTTGGCTAAGATCA seq_13006 4.446684 9.264864e-07
## IHWPvalue NormCounts_cond1_Mean
## TGCTCACTTTGGCAGCACATATACT 0.001663810 0
## CTTCTCAGTCTTTTGGCTAAGATCA 0.002061792 0
## NormCounts_cond1_Sd NormCounts_cond2_Mean
## TGCTCACTTTGGCAGCACATATACT 0 6.599719
## CTTCTCAGTCTTTTGGCTAAGATCA 0 6.000207
## NormCounts_cond2_Sd ClusterID Length BlastEvalue
## TGCTCACTTTGGCAGCACATATACT 1.821142 318 25 8.23e-09
## CTTCTCAGTCTTTTGGCTAAGATCA 1.707972 721 25 8.23e-09
## tRNA [Hh]sa ^U Other
## TGCTCACTTTGGCAGCACATATACT 0 0 26 0
## CTTCTCAGTCTTTTGGCTAAGATCA 0 0 17 0
## FeatureList
## TGCTCACTTTGGCAGCACATATACT U6-L1687,U6-L1676,U6-L1561,U6-L1528,U6-L1493,U6-L1482,U6-L1306,U6-L1258,U6-L1248,U6-L1121,U6-L1110,U6-L1106,U6-L1093,U6-L1025,U6-L882,U6-L843,U6-L814,U6-L746,U6-L690,U6-L583,U6-L542,U6-L484,U6-L403,U6-L306,U6-L76,U6-L49
## CTTCTCAGTCTTTTGGCTAAGATCA U2-L1014,U2-L999,U2-L990,U2-L988,U2-L987,U2-L986,U2-L985,U2-L983,U2-L883,U2-L826,U2-L750,U2-L594,U2-L584,U2-L332,U2-L164,U2-L117,U2-L90
The function getNoBlastHitFraction can be used to compute the fraction of reads without annotation for each sample. This fraction corresponds to the fraction of reads that are likely to be ignored in mapping approaches because they map to regions that are not annotated.
getNoBlastHitFraction(summary, de_results$norm_counts)
## NA_fraction
## cond1_s1 0.016237479
## cond1_s2 0.017400664
## cond1_s3 0.017287479
## cond1_s4 0.015625000
## cond1_s5 0.017318969
## cond2_s1 0.005076142
## cond2_s2 0.006242367
## cond2_s3 0.006804610
## cond2_s4 0.005953174
## cond2_s5 0.006856780
In addition to the default DEUS pipeline, we developed an adjusted approach that aggregates sequence counts for each sequence cluster and uses this information to find differentially expressed sequence clusters. By some small adjustments, the cluster expression analysis can be included into the default DEUS pipeline.
Instead of creating the sequence map for differentially expressed sequences only, it will be created on the raw input sequences since all sequences will be used for clustering.
map <- createMap(count_table)
Afterwards, all sequences can be clustered. Based on the clustering results, the sum of sequence counts will be created for each cluster via mergeAndAggregate.
cd_hit <- "/your/path/to/cdhit/cd-hit-est"
all_seq_fasta <- sequencesAsFasta(count_table,map)
clust_result <- runClustering(cd_hit, all_seq_fasta, out_dir, identity_cutoff=0.9, length_cutoff=0.9, wordlength=9, map)
cl_counts <- mergeAndAggregate(map, count_table, clust_result)
For this manual, we will load the provided clustering results.
clust_int <- system.file("extdata", "results/clust_result_clust_sequences.tsv", package="DEUS")
clust_result <- read.table(clust_int, header=T, sep="\t")
rownames(clust_result) <- clust_result$SequenceID
cl_counts <- mergeAndAggregate(map, count_table, clust_result)
head(cl_counts, n=2)
## cond1_s1 cond1_s2 cond1_s3 cond1_s4 cond1_s5 cond2_s1 cond2_s2 cond2_s3
## 0 142 119 152 146 139 148 112 150
## 1 91 96 75 84 85 120 129 108
## cond2_s4 cond2_s5
## 0 167 136
## 1 128 96
The resulting count matrix is then used as input for the DE analysis.
design <- ~ condition
cl_de_results <- runDESeq2(cl_counts, pheno_info, design, out_dir = out_dir, prefix = "Cluster")
cl_sig_results <- cl_de_results$de_result
head(cl_sig_results, n=2)
## Pvalue padj baseMean Log2FoldChange lfcSE stat
## 0 0.563340354 0.8110233879 133.23181 0.06532971 0.1130493 0.5778871
## 1 0.000151244 0.0006688245 95.71573 0.46504625 0.1227353 3.7890179
## IHWPvalue
## 0 1.000000000
## 1 0.001355991
Subsequently, the existing DEA results based on individual sequence counts and the generated DEA results based on sequence clusters can be merged into a single data frame via mergeSingleAndClusterResults. Based on the individual and cluster Pvalues, the resulting data frame is filtered to detect sequences that are differentially expressed based on their sequence count or part of a differentially expressed sequence cluster.
sig_results <- mergeSingleAndClusterResults(cl_sig_results,clust_result,sig_results,map)
sig_results <- sig_results[((!is.na(sig_results$IHWPval) & sig_results$IHWPval < 0.05) | (!is.na(sig_results$Cl_IHWPval) & sig_results$Cl_IHWPval < 0.05)),]
head(sig_results, n=2)
## Pvalue padj baseMean Log2FoldChange
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.0007176335 0.03462413 7.263225 -1.521815
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.0026808638 0.05501307 6.594336 -1.431923
## lfcSE stat IHWPvalue SequenceID
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.4498748 -3.382752 0.04498802 seq_6
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.4769671 -3.002143 0.04498802 seq_15
## Cl_Pvalue Cl_padj Cl_baseMean
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.2181573 0.4118952 56.53792
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.2181573 0.4118952 56.53792
## Cl_Log2FoldChange Cl_lfcSE Cl_stat Cl_IHWPvalue
## TCGCTTCTCAGCCTTTTGGCTAAGA 0.201767 0.163846 1.231443 0.6115901
## CGCTTCTCAGCCTTTTGGCTAAGAT 0.201767 0.163846 1.231443 0.6115901
## ClusterID
## TCGCTTCTCAGCCTTTTGGCTAAGA 2
## CGCTTCTCAGCCTTTTGGCTAAGAT 2
Next, we run BLAST on all selected sequences.
blast_exec <- "/your/path/to/ncbi-blast-x.x.x/bin/blastn"
blast_db <-system.file("extdata", "blastdb/DASHR_subset.fa", package = "DEUS")
blast_ncores <- 2
sig_seq_fasta <- sequencesAsFasta(sig_results,map)
blast_result <- runBlast(blast_exec, blast_db, blast_ncores, sig_seq_fasta)
In our example, we load pre-compiled data.
blast_int <- system.file("extdata", "results/blast_result_clust_sequences.tsv", package="DEUS")
blast_result <- read.table(blast_int, header=T, sep="\t")
head(blast_result)
## SequenceID Annotation Length BlastEvalue
## 1 seq_5 U2-L1076 25 8.23e-09
## 2 seq_5 U2-L1072 25 8.23e-09
## 3 seq_5 U2-L1020 25 8.23e-09
## 4 seq_5 U2-L1019 25 8.23e-09
## 5 seq_5 U2-L974 25 8.23e-09
## 6 seq_5 U2-L973 25 8.23e-09
Finally, the summary table is generated again.
# merge results
classes <- c("tRNA","[Hh]sa","^U")
summary <- mergeResults(sig_results, count_stats, blast_result, clust_result, map)
summary <- addCountsOfFeatureClasses(summary, classes)
writeSummaryFiles(summary, out_dir)
head(summary, n=2)
## SequenceID Log2FoldChange Pvalue
## TGCTCACTTTGGCAGCACATATACT seq_37001 4.575022 3.738245e-07
## CTTCTCAGTCTTTTGGCTAAGATCA seq_37090 4.446684 9.264864e-07
## IHWPvalue Cl_Log2FoldChange Cl_Pvalue
## TGCTCACTTTGGCAGCACATATACT 0.001663810 2.079899 3.094328e-13
## CTTCTCAGTCTTTTGGCTAAGATCA 0.002061792 3.820641 8.413101e-05
## Cl_IHWPvalue NormCounts_cond1_Mean
## TGCTCACTTTGGCAGCACATATACT 4.206725e-09 0
## CTTCTCAGTCTTTTGGCTAAGATCA 2.063389e-03 0
## NormCounts_cond1_Sd NormCounts_cond2_Mean
## TGCTCACTTTGGCAGCACATATACT 0 6.599719
## CTTCTCAGTCTTTTGGCTAAGATCA 0 6.000207
## NormCounts_cond2_Sd ClusterID Length BlastEvalue
## TGCTCACTTTGGCAGCACATATACT 1.821142 2493 25 8.23e-09
## CTTCTCAGTCTTTTGGCTAAGATCA 1.707972 7520 25 8.23e-09
## tRNA [Hh]sa ^U Other
## TGCTCACTTTGGCAGCACATATACT 0 0 26 0
## CTTCTCAGTCTTTTGGCTAAGATCA 0 0 17 0
## FeatureList
## TGCTCACTTTGGCAGCACATATACT U6-L1687,U6-L1676,U6-L1561,U6-L1528,U6-L1493,U6-L1482,U6-L1306,U6-L1258,U6-L1248,U6-L1121,U6-L1110,U6-L1106,U6-L1093,U6-L1025,U6-L882,U6-L843,U6-L814,U6-L746,U6-L690,U6-L583,U6-L542,U6-L484,U6-L403,U6-L306,U6-L76,U6-L49
## CTTCTCAGTCTTTTGGCTAAGATCA U2-L1014,U2-L999,U2-L990,U2-L988,U2-L987,U2-L986,U2-L985,U2-L983,U2-L883,U2-L826,U2-L750,U2-L594,U2-L584,U2-L332,U2-L164,U2-L117,U2-L90
In order to compare both aproaches, the function printClusterSummary can be used to get detailed information about the number of significant sequences and clusters.
printClusterSummary(summary)
## ## Sequence summary ##
## Significant sequences: 1906
## Significant sequences not clustered: 0
## Sequences in significant clusters: 22110
## Non-significant sequences in significant clusters: 20369
## - too low expressed: 20369
## - too weak differentially expressed: 0
## Significant sequences in non-significant clusters: 165
## ## Cluster summary ##
## Significant clusters: 4628
## Clusters including significant sequences: 1323
## Significant clusters including no significant sequences: 3429