library(dplyr)
library(purrr)
library(furrr)
library(tidyr)
library(readr)
library(ggplot2)
library(sessioninfo)
library(metacoder)
library(vegan)
library(viridis)
library(DT)
library(stringr)
library(qsubmitter)
library(taxize)
cgrb <- remote_server$new(server = "shell.cgrb.oregonstate.edu", user = "fosterz", port = 732)
files <- remote_server$new(server = "files.cgrb.oregonstate.edu", user = "fosterz", port = 732)
remote_repository_path <- "/dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode"
seed <- 1
min_evalue <- '1e-3'
set.seed(seed)
A blast-based classification will be useful for the the non-target sequences detection. It will also be useful for verifying the dada2
-based classification.
I will make FASTA files of ASV and OTU sequences with header containing their row indexes in the abundance matrix.
# ASVs
abundance_asv <- read_csv(file.path('intermediate_data', 'abundance_asv.csv'))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## sequence = col_character(),
## taxonomy = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
query_seq_path_asv <- file.path('intermediate_data', 'blast_query_asv.fa')
paste0('>asv_', 1:nrow(abundance_asv), '\n', abundance_asv$sequence) %>%
write_lines(file = query_seq_path_asv)
# OTUs
abundance_otu <- read_csv(file.path('intermediate_data', 'abundance_otu.csv'))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## sequence = col_character(),
## taxonomy = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
query_seq_path_otu <- file.path('intermediate_data', 'blast_query_otu.fa')
paste0('>otu_', 1:nrow(abundance_otu), '\n', abundance_otu$sequence) %>%
write_lines(file = query_seq_path_otu)
First I will need to transfer the file from this computer to the CGRB cluster.
# ASVs
remote_query_path_asv <- file.path(remote_repository_path, 'blast_query_asv.fa')
rsync_push(local_path = query_seq_path_asv, remote_path = remote_query_path_asv, remote = files)
# OTUs
remote_query_path_otu <- file.path(remote_repository_path, 'blast_query_otu.fa')
rsync_push(local_path = query_seq_path_otu, remote_path = remote_query_path_otu, remote = files)
Then I can run blast remotely.
out_cols <- 'qseqid sallseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sskingdoms score sscinames'
run_blast_remotely <- function(query_path, out_path, ...) {
blast_command <- paste('blastn',
"-query", query_path,
"-db nt",
"-dust no",
paste("-evalue", min_evalue),
# "-perc_identity 70",
paste0("-outfmt '10 ", out_cols, "'"),
"-max_hsps 1",
"-max_target_seqs 100",
'-num_threads 8',
'-out', out_path)
qsub(command = blast_command,
remote = cgrb,
remote_cwd = remote_repository_path,
cores = 8,
queue = 'bpp@!(uncia)', # For some reason, I was getting errors on uncia, so this avoids that node
...)
}
# ASVs
remote_blast_out_asv <- file.path(remote_repository_path, 'blast_result_asv.csv')
run_blast_remotely(query_path = remote_query_path_asv, out_path = remote_blast_out_asv)
## Command:
## blastn -query /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/blast_query_asv.fa -db nt -dust no -evalue 1e-3 -outfmt '10 qseqid sallseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sskingdoms score sscinames' -max_hsps 1 -max_target_seqs 100 -num_threads 8 -out /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/blast_result_asv.csv
## Job 9386752 sumbitted.
## Submission script: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.i9386752.sh
## Current working directory: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode
## Started: 2021-05-04 12:14:44
## Finished: 2021-05-04 12:53:17
## Duration: 39 mins
## Standard output: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.o9386752
## Standard error: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.e9386752
# OTUs
remote_blast_out_otu <- file.path(remote_repository_path, 'blast_result_otu.csv')
run_blast_remotely(query_path = remote_query_path_otu, out_path = remote_blast_out_otu)
## Command:
## blastn -query /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/blast_query_otu.fa -db nt -dust no -evalue 1e-3 -outfmt '10 qseqid sallseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sskingdoms score sscinames' -max_hsps 1 -max_target_seqs 100 -num_threads 8 -out /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/blast_result_otu.csv
## Job 9386827 sumbitted.
## Submission script: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.i9386827.sh
## Current working directory: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode
## Started: 2021-05-04 12:53:24
## Finished: 2021-05-04 13:19:04
## Duration: 26 mins
## Standard output: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.o9386827
## Standard error: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.e9386827
and download the results:
# ASVs
blast_result_path_asv <- file.path('intermediate_data', 'blast_results_asv.csv')
rsync_pull(local_path = blast_result_path_asv, remote_path = remote_blast_out_asv, remote = files)
# OTUs
blast_result_path_otu <- file.path('intermediate_data', 'blast_results_otu.csv')
rsync_pull(local_path = blast_result_path_otu, remote_path = remote_blast_out_otu, remote = files)
and read them into R:
blast_results_asv <- read_csv(blast_result_path_asv, col_names = strsplit(out_cols, split = ' ')[[1]], col_types = 'ccddddddddddcccc')
## Warning: 14 parsing failures.
## row col expected actual file
## 157446 -- 16 columns 17 columns 'intermediate_data/blast_results_asv.csv'
## 165606 -- 16 columns 17 columns 'intermediate_data/blast_results_asv.csv'
## 236317 -- 16 columns 17 columns 'intermediate_data/blast_results_asv.csv'
## 245796 -- 16 columns 17 columns 'intermediate_data/blast_results_asv.csv'
## 258836 -- 16 columns 19 columns 'intermediate_data/blast_results_asv.csv'
## ...... ... .......... .......... .........................................
## See problems(...) for more details.
blast_results_otu <- read_csv(blast_result_path_otu, col_names = strsplit(out_cols, split = ' ')[[1]], col_types = 'ccddddddddddcccc')
## Warning: 13 parsing failures.
## row col expected actual file
## 64328 -- 16 columns 17 columns 'intermediate_data/blast_results_otu.csv'
## 69450 -- 16 columns 17 columns 'intermediate_data/blast_results_otu.csv'
## 129574 -- 16 columns 17 columns 'intermediate_data/blast_results_otu.csv'
## 140223 -- 16 columns 19 columns 'intermediate_data/blast_results_otu.csv'
## 140224 -- 16 columns 19 columns 'intermediate_data/blast_results_otu.csv'
## ...... ... .......... .......... .........................................
## See problems(...) for more details.
There might be some warnings about parsing errors caused by ,
in the sscinames
column, but these should not affect the analysis since that information is not used and there are no columns after it to mess up. I will select the best hit for each ASV based on e-value and percent identity:
select_best_blast_hit <- function(blast_results) {
blast_results %>%
group_by(qseqid) %>%
filter(evalue == min(evalue)) %>%
filter(pident == max(pident)) %>%
filter(row_number() == 1) # break ties by picking first value
}
blast_results_asv <- select_best_blast_hit(blast_results_asv)
blast_results_otu <- select_best_blast_hit(blast_results_otu)
Then I can look up the taxonomic info from the NCBI taxonomy database using the taxon ID that blast returns:
lookup_tax <- function(blast_results) {
blast_results$staxids <- sub(blast_results$staxids, pattern = ';.+$', replacement = '')
classification(as.uid(unique(blast_results$staxids), check = FALSE), db = 'ncbi')
}
blast_class_asv <- lookup_tax(blast_results_asv)
blast_class_otu <- lookup_tax(blast_results_otu)
and add that to the results table
get_classification <- function(x) {
if (is.logical(x)) {
return(NA)
} else {
return(paste(x$name, collapse = ';'))
}
}
blast_results_asv$blast_tax <- map_chr(blast_class_asv, get_classification)[blast_results_asv$staxids]
blast_results_otu$blast_tax <- map_chr(blast_class_otu, get_classification)[blast_results_otu$staxids]
and combine that with the abundance matrix
add_to_abund <- function(abundance, blast_results) {
blast_results %>%
ungroup() %>%
transmute(sequence = abundance$sequence[as.numeric(sub(qseqid, pattern = '^asv_|otu_', replacement = ''))],
blast_pid = pident,
blast_cov = c(qend - qstart) / nchar(sequence) * 100,
blast_tax = blast_tax) %>%
right_join(abundance, by = 'sequence')
}
abundance_asv <- add_to_abund(abundance_asv, blast_results_asv)
abundance_otu <- add_to_abund(abundance_otu, blast_results_otu)
Finally, lets save that modified matrix for further analyses. Note that this overwrites the abundance matrix.
write_csv(abundance_asv, file.path('intermediate_data', 'abundance_asv.csv'))
write_csv(abundance_otu, file.path('intermediate_data', 'abundance_otu.csv'))
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.0.3 (2020-10-10)
## os Pop!_OS 20.04 LTS
## system x86_64, linux-gnu
## ui X11
## language en_US:en
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz America/Vancouver
## date 2021-05-04
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## ape 5.4-1 2020-08-13 [1] CRAN (R 4.0.2)
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.2)
## bold 1.1.0 2020-06-17 [1] CRAN (R 4.0.2)
## cli 2.1.0 2020-10-12 [1] CRAN (R 4.0.3)
## cluster 2.1.0 2019-06-19 [4] CRAN (R 4.0.0)
## codetools 0.2-16 2018-12-24 [4] CRAN (R 4.0.0)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 4.0.2)
## conditionz 0.1.0 2019-04-24 [1] CRAN (R 4.0.2)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 4.0.2)
## crul 1.0.0 2020-07-30 [1] CRAN (R 4.0.2)
## curl 4.3 2019-12-02 [1] CRAN (R 4.0.2)
## data.table 1.13.2 2020-10-19 [1] CRAN (R 4.0.3)
## digest 0.6.27 2020-10-24 [1] CRAN (R 4.0.3)
## dplyr * 1.0.2 2020-08-18 [1] CRAN (R 4.0.2)
## DT * 0.16 2020-10-13 [1] CRAN (R 4.0.3)
## ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.2)
## evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.2)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 4.0.2)
## foreach 1.5.1 2020-10-15 [1] CRAN (R 4.0.3)
## furrr * 0.2.1 2020-10-21 [1] CRAN (R 4.0.3)
## future * 1.19.1 2020-09-22 [1] CRAN (R 4.0.3)
## generics 0.1.0 2020-10-31 [1] CRAN (R 4.0.3)
## ggplot2 * 3.3.2 2020-06-19 [1] CRAN (R 4.0.2)
## globals 0.13.1 2020-10-11 [1] CRAN (R 4.0.3)
## glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.2)
## gridExtra 2.3 2017-09-09 [1] CRAN (R 4.0.3)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.2)
## hms 0.5.3 2020-01-08 [1] CRAN (R 4.0.2)
## htmltools 0.5.1.1 2021-01-22 [1] CRAN (R 4.0.3)
## htmlwidgets 1.5.2 2020-10-03 [1] CRAN (R 4.0.3)
## httpcode 0.3.0 2020-04-10 [1] CRAN (R 4.0.2)
## iterators 1.0.13 2020-10-15 [1] CRAN (R 4.0.3)
## jsonlite 1.7.1 2020-09-07 [1] CRAN (R 4.0.2)
## knitr 1.30 2020-09-22 [1] CRAN (R 4.0.2)
## lattice * 0.20-41 2020-04-02 [4] CRAN (R 4.0.0)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 4.0.2)
## listenv 0.8.0 2019-12-05 [1] CRAN (R 4.0.3)
## magrittr 1.5 2014-11-22 [1] CRAN (R 4.0.2)
## MASS 7.3-53 2020-09-09 [4] CRAN (R 4.0.2)
## Matrix 1.2-18 2019-11-27 [4] CRAN (R 4.0.0)
## metacoder * 0.3.4 2020-04-29 [1] CRAN (R 4.0.3)
## mgcv 1.8-33 2020-08-27 [4] CRAN (R 4.0.2)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.2)
## nlme 3.1-149 2020-08-23 [4] CRAN (R 4.0.2)
## permute * 0.9-5 2019-03-12 [1] CRAN (R 4.0.2)
## pillar 1.4.6 2020-07-10 [1] CRAN (R 4.0.2)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.2)
## plyr 1.8.6 2020-03-03 [1] CRAN (R 4.0.2)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.2)
## qsubmitter * 0.1 2020-11-11 [1] local
## R6 2.5.0 2020-10-28 [1] CRAN (R 4.0.3)
## Rcpp 1.0.5 2020-07-06 [1] CRAN (R 4.0.2)
## readr * 1.4.0 2020-10-05 [1] CRAN (R 4.0.3)
## reshape 0.8.8 2018-10-23 [1] CRAN (R 4.0.2)
## rlang 0.4.10 2020-12-30 [1] CRAN (R 4.0.3)
## rmarkdown 2.5 2020-10-21 [1] CRAN (R 4.0.3)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 4.0.2)
## scales 1.1.1 2020-05-11 [1] CRAN (R 4.0.2)
## sessioninfo * 1.1.1 2018-11-05 [1] CRAN (R 4.0.2)
## sharedbib 0.1.0.9003 2020-10-16 [1] local
## stringi 1.5.3 2020-09-09 [1] CRAN (R 4.0.2)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.2)
## taxa * 0.3.4 2020-04-29 [1] CRAN (R 4.0.3)
## taxize * 0.9.99 2020-10-30 [1] CRAN (R 4.0.3)
## tibble 3.0.4 2020-10-12 [1] CRAN (R 4.0.3)
## tidyr * 1.1.2 2020-08-27 [1] CRAN (R 4.0.2)
## tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.2)
## triebeard 0.3.0 2016-08-04 [1] CRAN (R 4.0.2)
## urltools 1.7.3 2019-04-14 [1] CRAN (R 4.0.2)
## uuid 0.1-4 2020-02-26 [1] CRAN (R 4.0.2)
## vctrs 0.3.4 2020-08-29 [1] CRAN (R 4.0.2)
## vegan * 2.5-6 2019-09-01 [1] CRAN (R 4.0.2)
## viridis * 0.5.1 2018-03-29 [1] CRAN (R 4.0.3)
## viridisLite * 0.3.0 2018-02-01 [1] CRAN (R 4.0.2)
## withr 2.3.0 2020-09-22 [1] CRAN (R 4.0.3)
## xfun 0.19 2020-10-30 [1] CRAN (R 4.0.3)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.2)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.2)
## zoo 1.8-8 2020-05-02 [1] CRAN (R 4.0.2)
##
## [1] /home/fosterz/R/x86_64-pc-linux-gnu-library/4.0
## [2] /usr/local/lib/R/site-library
## [3] /usr/lib/R/site-library
## [4] /usr/lib/R/library