NOTE: This analysis requires at least 10GB of RAM to run. It uses large files not included in the repository and many steps can take a few minutes to run.
input_folder <- "raw_input" # Where all the large input files are. Ignored by git.
output_folder <- "results" # Where plots will be saved
output_format <- "pdf" # The file format of saved plots
pub_fig_folder <- "publication"
revision_n <- 1
result_path <- function(name) {
file.path(output_folder, paste0(name, ".", output_format))
}
save_publication_fig <- function(name, figure_number) {
file.path(result_path(name), paste0("revision_", revision_n), paste0("figure_", figure_number, "--", name, ".", output_format))
}
These settings are shared between the RDP, SILVA, and Greengenes analyses, since the results of those three analyeses are combined in one plot later. This ensures that all of the graphs use the same color and size scales, instead of optimizing them for each data set, as would be done automatically otherwise.
size_range <- c(0.0004, 0.015) # The size range of nodes
label_size_range <- c(0.002, 0.05) # The size range of labels
all_size_interval <- c(1, 3000000) # The range of read counts to display in the whole database plots
pcr_size_interval <- c(1, 25000) # The range of read counts to display in the PCR plots
label_max <- 50 # The maximum number of labels to show on each graph
max_taxonomy_depth <- 4 # The maximum number of taxonomic ranks to show
min_seq_count <- NULL # The minimum number of sequeces need to show a taxon.
just_bacteria <- TRUE # If TRUE, only show bacterial taxa
max_mismatch <- 10 # Percentage mismatch tolerated in pcr
pcr_success_cutoff <- 0.90 # Any taxon with a greater proportion of PCR sucess will be excluded from the PCR plots
min_seq_length <- 1200 # Use to encourage full length sequences
forward_primer = c("515F" = "GTGYCAGCMGCCGCGGTAA")
reverse_primer = c("806R" = "GGACTACNVGGGTWTCTAAT")
pcr_success_color_scale = c(viridis::plasma(10)[4:9], "lightgrey")
The code below parses and subsets the entire SILVA non-redundant reference database (Quast et al. 2012). The input file is not included in this repository because it is 905MB, but you can download it from the SILVA website here:
https://www.arb-silva.de/no_cache/download/archive/release_123_1/Exports/
library(metacoder)
silva_path <- file.path(input_folder, "SILVA_123.1_SSURef_Nr99_tax_silva.fasta")
silva_seq <- read_fasta(silva_path)
silva <- extract_tax_data(names(silva_seq),
regex = "^(.*?) (.*)$",
key = c(id = "info", "class"),
class_sep = ";")
silva$data$tax_data$sequence <- silva_seq
print(silva)
## <Taxmap>
## 147708 taxa: aaab. Eukaryota, aaac. Bacteria ... iknc. Albula vulpes (bonefish)
## 147708 edges: NA->aaab, NA->aaac, NA->aaad ... ijpd->ikna, ijph->iknb, ijpn->iknc
## 1 data sets:
## tax_data:
## # A tibble: 598,470 x 5
## taxon_id id class input sequence
## <chr> <chr> <chr> <chr> <chr>
## 1 ghyh AB0014… Eukaryota;SAR;Alveo… AB001438.1.1662 Euk… GAAACUGCGAAUGGCUCAUUA…
## 2 ghyi AB0060… Eukaryota;Archaepla… AB006051.1.1796 Euk… UACCUGGUUGAUCCUGCCAGU…
## 3 hcfq FJ9118… Eukaryota;Opisthoko… FJ911802.1.1606 Euk… GGGAUGUGAUUUGUUAAUUGG…
## # ... with 5.985e+05 more rows
## 0 functions:
Next I will subset the taxa in the data set (depending on parameter settings). This can help make the graphs less cluttered and make it easier to compare databases.
if (! is.null(min_seq_count)) {
silva <- filter_taxa(silva, n_obs >= min_seq_count)
}
if (just_bacteria) {
silva <- filter_taxa(silva, taxon_names == "Bacteria", subtaxa = TRUE)
}
if (! is.null(max_taxonomy_depth)) {
silva <- filter_taxa(silva, n_supertaxa <= max_taxonomy_depth)
}
print(silva)
## <Taxmap>
## 5006 taxa: aaac. Bacteria, aaah. Proteobacteria ... akgw. uncultured soil bacterium
## 5006 edges: NA->aaac, aaac->aaah, aaac->aaai ... abww->akgt, acdd->akgu, acnw->akgw
## 1 data sets:
## tax_data:
## # A tibble: 513,121 x 5
## taxon_id id class input sequence
## <chr> <chr> <chr> <chr> <chr>
## 1 afrh AY2301… Bacteria;Proteobact… AY230195.1.1496 Bac… UGUUUGAUCCUGGCUCAGAUU…
## 2 afri AY2307… Bacteria;Firmicutes… AY230764.1.1512 Bac… UGAUCCUGGCUCAGGACGAAC…
## 3 afrj AY2308… Bacteria;Proteobact… AY230816.1.1447 Bac… AGCGAACGCUGGCGGCAUGCU…
## # ... with 5.131e+05 more rows
## 0 functions:
These are not bacterial and will bias the digital PCR results. Note that the invert
option makes it so taxa are included that did not pass the filter. This is different than simply using name != "Chloroplast", subtaxa = TRUE
since the effects of invert
are applied after those of subtaxa
.
silva <- filter_taxa(silva, taxon_names == "Chloroplast", subtaxa = TRUE, invert = TRUE)
print(silva)
## <Taxmap>
## 3843 taxa: aaac. Bacteria ... akgw. uncultured soil bacterium
## 3843 edges: NA->aaac, aaac->aaah, aaac->aaai ... abww->akgt, acdd->akgu, acnw->akgw
## 1 data sets:
## tax_data:
## # A tibble: 513,121 x 5
## taxon_id id class input sequence
## <chr> <chr> <chr> <chr> <chr>
## 1 afrh AY2301… Bacteria;Proteobact… AY230195.1.1496 Bac… UGUUUGAUCCUGGCUCAGAUU…
## 2 afri AY2307… Bacteria;Firmicutes… AY230764.1.1512 Bac… UGAUCCUGGCUCAGGACGAAC…
## 3 afrj AY2308… Bacteria;Proteobact… AY230816.1.1447 Bac… AGCGAACGCUGGCGGCAUGCU…
## # ... with 5.131e+05 more rows
## 0 functions:
Although graphing everything can be a bit overwhelming (3843 taxa), it gives an intuitive feel for the complexity of the database:
silva_plot_all <- heat_tree(silva,
node_size = n_obs,
node_color = n_obs,
node_size_range = size_range * 2,
edge_size_range = size_range,
node_size_interval = all_size_interval,
edge_size_interval = all_size_interval,
node_color_interval = all_size_interval,
edge_color_interval = all_size_interval,
node_label = taxon_names,
node_label_size_range = label_size_range,
node_label_max = label_max,
node_color_axis_label = "Sequence count",
margin_size = c(.01, 0, 0, 0),
output_file = result_path("silva--all"))
print(silva_plot_all)
Before doing the digital PCR, I will filter for only full length sequences, since shorter sequences might not have a primer binding site and the digital PCR would fail, not because of an inability of a real primer to bind to real DNA, but because of missing sequence information in the database. Ideally, this kind of filtering for full length sequences would involve something like a multiple sequences alignment so we don’t remove sequences that are actually full length, but just happen to be shorter than the cutoff of 1200 naturally. However, this method is easy and should work OK.
if (! is.null(min_seq_length)) {
silva <- filter_obs(silva, data = "tax_data", drop_taxa = TRUE,
vapply(sequence, nchar, numeric(1)) >= min_seq_length)
}
print(silva)
## <Taxmap>
## 3842 taxa: aaac. Bacteria, aaah. Proteobacteria ... akgw. uncultured soil bacterium
## 3842 edges: NA->aaac, aaac->aaah, aaac->aaai ... abww->akgt, acdd->akgu, acnw->akgw
## 1 data sets:
## tax_data:
## # A tibble: 513,120 x 5
## taxon_id id class input sequence
## <chr> <chr> <chr> <chr> <chr>
## 1 afrh AY2301… Bacteria;Proteobact… AY230195.1.1496 Bac… UGUUUGAUCCUGGCUCAGAUU…
## 2 afri AY2307… Bacteria;Firmicutes… AY230764.1.1512 Bac… UGAUCCUGGCUCAGGACGAAC…
## 3 afrj AY2308… Bacteria;Proteobact… AY230816.1.1447 Bac… AGCGAACGCUGGCGGCAUGCU…
## # ... with 5.131e+05 more rows
## 0 functions:
Next I will conduct digital PCR with a new set of universal 16S primers (Walters et al. 2016), allowing for a maximum mismatch of 10%.
silva_pcr <- primersearch(silva, sequence,
forward = forward_primer,
reverse = reverse_primer,
mismatch = max_mismatch)
print(silva_pcr)
## <Taxmap>
## 3842 taxa: aaac. Bacteria, aaah. Proteobacteria ... akgw. uncultured soil bacterium
## 3842 edges: NA->aaac, aaac->aaah, aaac->aaai ... abww->akgt, acdd->akgu, acnw->akgw
## 3 data sets:
## tax_data:
## # A tibble: 513,120 x 5
## taxon_id id class input sequence
## <chr> <chr> <chr> <chr> <chr>
## 1 afrh AY2301… Bacteria;Proteobact… AY230195.1.1496 Bac… UGUUUGAUCCUGGCUCAGAUU…
## 2 afri AY2307… Bacteria;Firmicutes… AY230764.1.1512 Bac… UGAUCCUGGCUCAGGACGAAC…
## 3 afrj AY2308… Bacteria;Proteobact… AY230816.1.1447 Bac… AGCGAACGCUGGCGGCAUGCU…
## # ... with 5.131e+05 more rows
## amplicons:
## # A tibble: 491,469 x 14
## taxon_id seq_index f_primer r_primer f_mismatch r_mismatch f_start f_end r_start
## <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 afrh 1 GTGYCAG… GGACTAC… 0 0 499 517 771
## 2 afri 2 GTGYCAG… GGACTAC… 0 0 512 530 783
## 3 afrj 3 GTGYCAG… GGACTAC… 0 0 433 451 705
## # ... with 4.915e+05 more rows, and 5 more variables: r_end <dbl>, f_match <chr>,
## # r_match <chr>, amplicon <chr>, product <chr>
## tax_amp_stats:
## # A tibble: 3,842 x 13
## taxon_id query_count seq_count amp_count amplified multiple prop_amplified
## <chr> <dbl> <dbl> <dbl> <lgl> <lgl> <dbl>
## 1 aaac 513120 491346 491469 TRUE TRUE 0.958
## 2 aaah 200083 193537 193601 TRUE TRUE 0.967
## 3 aaai 138517 132344 132369 TRUE TRUE 0.955
## # ... with 3,839 more rows, and 6 more variables: med_amp_len <dbl>,
## # min_amp_len <dbl>, max_amp_len <dbl>, med_prod_len <dbl>, min_prod_len <dbl>,
## # max_prod_len <dbl>
## 0 functions:
Now the object silva_pcr
has all the information that silva
has plus the results of the digital PCR. Lets plot the whole database again, but coloring based on digital PCR success.
silva_plot_pcr_all <- heat_tree(silva_pcr,
node_size = seq_count,
node_color = prop_amplified,
node_label = taxon_names,
node_size_range = size_range * 2,
edge_size_range = size_range,
node_size_interval = all_size_interval,
edge_size_interval = all_size_interval,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
edge_color_interval = c(0, 1),
node_color_interval = c(0, 1),
node_label_size_range = label_size_range,
node_label_max = label_max,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequence count",
output_file = result_path("silva--pcr_all"))
print(silva_plot_pcr_all)
Since these are universal bacterial primers, we would expect most of the database to amplify. The plot shows this for the most part, but there are many taxa where around 20% of their sequences are not amplified and some that have no sequences amplified. If we wanted to look at only those sequences that did not amplify, we could filter taxa by PCR success and plot what remains. I am including the supertaxa of those taxa that did not amplify since excluding them would split the tree into a cloud of fragments lacking taxonomic context.
silva_plot_pcr_fail <- silva_pcr %>%
filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
heat_tree(node_size = query_count - seq_count,
node_color = prop_amplified,
node_label = taxon_names,
node_size_range = size_range * 2,
edge_size_range = size_range,
node_size_interval = pcr_size_interval,
edge_size_interval = pcr_size_interval,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
node_color_interval = c(0, 1),
edge_color_interval = c(0, 1),
node_label_size_range = label_size_range,
node_label_max = label_max,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequences not amplified",
output_file = result_path("silva--pcr_fail"))
print(silva_plot_pcr_fail)
This is a figure I will use for the publication, so I have limited the number of labels printed to 50 to increase readability. However, we can do the same graph with many more smaller labels for exploring with a PDF viewer with good zooming abilities:
silva_plot_pcr_fail <- silva_pcr %>%
filter_taxa(prop_amplified < pcr_success_cutoff, supertaxa = TRUE) %>%
heat_tree(node_size = query_count - seq_count,
node_color = prop_amplified,
node_label = taxon_names,
node_size_range = size_range * 2,
edge_size_range = size_range,
node_size_interval = pcr_size_interval,
edge_size_interval = pcr_size_interval,
node_color_range = pcr_success_color_scale,
node_color_trans = "linear",
node_color_interval = c(0, 1),
edge_color_interval = c(0, 1),
node_label_size_range = label_size_range * 0.6,
node_label_max = label_max * 10,
node_color_axis_label = "Proportion PCR success",
node_size_axis_label = "Sequences not amplified",
output_file = result_path("silva--pcr_fail_detailed"))
print(silva_plot_pcr_fail)
Some results from this file will be combined with others from similar analyses to make a composite figure. Below, the needed objects are saved so that they can be loaded by another Rmd file.
save(file = file.path(output_folder, "silva_data.RData"),
silva_plot_all, silva_plot_pcr_fail)
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
## [4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 metacoder_0.2.1.9009 taxa_0.3.1 stringr_1.3.1
## [5] glossary_0.1.0 knitcitations_1.0.8 knitr_1.20
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.4 purrr_0.2.5 reshape2_1.4.3 lattice_0.20-35 colorspace_1.3-2
## [6] htmltools_0.3.6 viridisLite_0.3.0 yaml_2.2.0 utf8_1.1.4 rlang_0.2.1
## [11] pillar_1.3.0 glue_1.3.0 bindr_0.1.1 plyr_1.8.4 ggfittext_0.6.0
## [16] munsell_0.5.0 gtable_0.2.0 codetools_0.2-15 evaluate_0.11 labeling_0.3
## [21] parallel_3.4.4 fansi_0.2.3 Rcpp_0.12.18 readr_1.1.1 backports_1.1.2
## [26] scales_0.5.0 jsonlite_1.5 gridExtra_2.3 ggplot2_3.0.0 hms_0.4.2
## [31] digest_0.6.15 stringi_1.2.4 dplyr_0.7.6 rprojroot_1.3-2 grid_3.4.4
## [36] bibtex_0.4.2 cli_1.0.0 tools_3.4.4 magrittr_1.5 lazyeval_0.2.1
## [41] tibble_1.4.2 RefManageR_1.2.0 crayon_1.3.4 ape_5.1 pkgconfig_2.0.1
## [46] xml2_1.2.0 lubridate_1.7.4 assertthat_0.2.0 rmarkdown_1.10 httr_1.3.1
## [51] viridis_0.5.1 R6_2.2.2 igraph_1.2.1 nlme_3.1-137 compiler_3.4.4
Quast, Christian, Elmar Pruesse, Pelin Yilmaz, Jan Gerken, Timmy Schweer, Pablo Yarza, Jörg Peplies, and Frank Oliver Glöckner. 2012. “The Silva Ribosomal Rna Gene Database Project: Improved Data Processing and Web-Based Tools.” Nucleic Acids Research. Oxford Univ Press, gks1219.
Walters, William, Embriette R Hyde, Donna Berg-Lyons, Gail Ackermann, Greg Humphrey, Alma Parada, Jack A Gilbert, et al. 2016. “Improved Bacterial 16s rRNA Gene (V4 and V4-5) and Fungal Internal Transcribed Spacer Marker Gene Primers for Microbial Community Surveys.” mSystems 1 (1). American Society for Microbiology Journals: e00009–15.