Alternative taxonomic assignment using BLAST

Prepare

Packages used

library(dplyr)
library(purrr)
library(furrr)
library(tidyr)
library(readr)
library(ggplot2)
library(sessioninfo)
library(metacoder)
library(vegan)
library(viridis)
library(DT)
library(stringr)
library(qsubmitter)
library(taxize)

Parameters

cgrb <- remote_server$new(server = "shell.cgrb.oregonstate.edu", user = "fosterz", port  = 732)
files <- remote_server$new(server = "files.cgrb.oregonstate.edu", user = "fosterz", port  = 732)
remote_repository_path <- "/dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode"

Parameters

seed <- 1
min_evalue <- '1e-3'
set.seed(seed)

BLAST

A blast-based classification will be useful for the the non-target sequences detection. It will also be useful for verifying the dada2-based classification.

Make query file

I will make FASTA files of ASV and OTU sequences with header containing their row indexes in the abundance matrix.

# ASVs
abundance_asv <- read_csv(file.path('intermediate_data', 'abundance_asv.csv'))

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   sequence = col_character(),
##   taxonomy = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

query_seq_path_asv <- file.path('intermediate_data', 'blast_query_asv.fa')
paste0('>asv_', 1:nrow(abundance_asv), '\n', abundance_asv$sequence) %>%
  write_lines(file = query_seq_path_asv)

# OTUs
abundance_otu <- read_csv(file.path('intermediate_data', 'abundance_otu.csv'))

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   sequence = col_character(),
##   taxonomy = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

query_seq_path_otu <- file.path('intermediate_data', 'blast_query_otu.fa')
paste0('>otu_', 1:nrow(abundance_otu), '\n', abundance_otu$sequence) %>%
  write_lines(file = query_seq_path_otu)

Run BLAST

First I will need to transfer the file from this computer to the CGRB cluster.

# ASVs
remote_query_path_asv <- file.path(remote_repository_path, 'blast_query_asv.fa')
rsync_push(local_path = query_seq_path_asv, remote_path = remote_query_path_asv, remote = files)

# OTUs
remote_query_path_otu <- file.path(remote_repository_path, 'blast_query_otu.fa')
rsync_push(local_path = query_seq_path_otu, remote_path = remote_query_path_otu, remote = files)

Then I can run blast remotely.

out_cols <- 'qseqid sallseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sskingdoms score sscinames'
run_blast_remotely <- function(query_path, out_path, ...) {
  blast_command <- paste('blastn',
                         "-query", query_path,
                         "-db nt",
                         "-dust no",
                         paste("-evalue", min_evalue),
                         # "-perc_identity 70",
                         paste0("-outfmt '10 ", out_cols, "'"),
                         "-max_hsps 1",
                         "-max_target_seqs 100",
                         '-num_threads 8',
                         '-out', out_path)
  qsub(command = blast_command,
       remote = cgrb,
       remote_cwd = remote_repository_path,
       cores = 8,
       queue = 'bpp@!(uncia)', # For some reason, I was getting errors on uncia, so this avoids that node
       ...)
}

# ASVs
remote_blast_out_asv <- file.path(remote_repository_path, 'blast_result_asv.csv')
run_blast_remotely(query_path = remote_query_path_asv, out_path = remote_blast_out_asv)

## Command:
##  blastn -query /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/blast_query_asv.fa -db nt -dust no -evalue 1e-3 -outfmt '10 qseqid sallseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sskingdoms score sscinames' -max_hsps 1 -max_target_seqs 100 -num_threads 8 -out /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/blast_result_asv.csv

## Job 9386752 sumbitted.

## Submission script: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.i9386752.sh

## Current working directory: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode

## Started: 2021-05-04 12:14:44

## Finished: 2021-05-04 12:53:17

## Duration: 39 mins

## Standard output: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.o9386752

## Standard error: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.e9386752

# OTUs
remote_blast_out_otu <- file.path(remote_repository_path, 'blast_result_otu.csv')
run_blast_remotely(query_path = remote_query_path_otu, out_path = remote_blast_out_otu)

## Command:
##  blastn -query /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/blast_query_otu.fa -db nt -dust no -evalue 1e-3 -outfmt '10 qseqid sallseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids sskingdoms score sscinames' -max_hsps 1 -max_target_seqs 100 -num_threads 8 -out /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/blast_result_otu.csv

## Job 9386827 sumbitted.

## Submission script: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.i9386827.sh

## Current working directory: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode

## Started: 2021-05-04 12:53:24

## Finished: 2021-05-04 13:19:04

## Duration: 26 mins

## Standard output: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.o9386827

## Standard error: /dfs/Grunwald_Lab/home/fosterz/repositories/rps10_barcode/qsub_records/blastn.e9386827

and download the results:

# ASVs
blast_result_path_asv <- file.path('intermediate_data', 'blast_results_asv.csv')
rsync_pull(local_path = blast_result_path_asv, remote_path = remote_blast_out_asv, remote = files)

# OTUs
blast_result_path_otu <- file.path('intermediate_data', 'blast_results_otu.csv')
rsync_pull(local_path = blast_result_path_otu, remote_path = remote_blast_out_otu, remote = files)

and read them into R:

blast_results_asv <- read_csv(blast_result_path_asv, col_names = strsplit(out_cols, split = ' ')[[1]], col_types = 'ccddddddddddcccc')

## Warning: 14 parsing failures.
##    row col   expected     actual                                      file
## 157446  -- 16 columns 17 columns 'intermediate_data/blast_results_asv.csv'
## 165606  -- 16 columns 17 columns 'intermediate_data/blast_results_asv.csv'
## 236317  -- 16 columns 17 columns 'intermediate_data/blast_results_asv.csv'
## 245796  -- 16 columns 17 columns 'intermediate_data/blast_results_asv.csv'
## 258836  -- 16 columns 19 columns 'intermediate_data/blast_results_asv.csv'
## ...... ... .......... .......... .........................................
## See problems(...) for more details.

blast_results_otu <- read_csv(blast_result_path_otu, col_names = strsplit(out_cols, split = ' ')[[1]], col_types = 'ccddddddddddcccc')

## Warning: 13 parsing failures.
##    row col   expected     actual                                      file
##  64328  -- 16 columns 17 columns 'intermediate_data/blast_results_otu.csv'
##  69450  -- 16 columns 17 columns 'intermediate_data/blast_results_otu.csv'
## 129574  -- 16 columns 17 columns 'intermediate_data/blast_results_otu.csv'
## 140223  -- 16 columns 19 columns 'intermediate_data/blast_results_otu.csv'
## 140224  -- 16 columns 19 columns 'intermediate_data/blast_results_otu.csv'
## ...... ... .......... .......... .........................................
## See problems(...) for more details.

There might be some warnings about parsing errors caused by , in the sscinames column, but these should not affect the analysis since that information is not used and there are no columns after it to mess up. I will select the best hit for each ASV based on e-value and percent identity:

select_best_blast_hit <- function(blast_results) {
  blast_results %>%
    group_by(qseqid) %>%
    filter(evalue == min(evalue)) %>%
    filter(pident == max(pident)) %>%
    filter(row_number() == 1) # break ties by picking first value
}

blast_results_asv <- select_best_blast_hit(blast_results_asv)
blast_results_otu <- select_best_blast_hit(blast_results_otu)

Then I can look up the taxonomic info from the NCBI taxonomy database using the taxon ID that blast returns:

lookup_tax <- function(blast_results) {
  blast_results$staxids <- sub(blast_results$staxids, pattern = ';.+$', replacement = '')
  classification(as.uid(unique(blast_results$staxids), check = FALSE), db = 'ncbi')
}

blast_class_asv <- lookup_tax(blast_results_asv)
blast_class_otu <- lookup_tax(blast_results_otu)

and add that to the results table

get_classification <- function(x) {
  if (is.logical(x)) {
    return(NA)
  } else {
    return(paste(x$name, collapse = ';'))
  }
}

blast_results_asv$blast_tax <- map_chr(blast_class_asv, get_classification)[blast_results_asv$staxids]
blast_results_otu$blast_tax <- map_chr(blast_class_otu, get_classification)[blast_results_otu$staxids]

and combine that with the abundance matrix

add_to_abund <- function(abundance, blast_results) {
  blast_results %>%
    ungroup() %>%
    transmute(sequence = abundance$sequence[as.numeric(sub(qseqid, pattern = '^asv_|otu_', replacement = ''))],
              blast_pid = pident,
              blast_cov = c(qend - qstart) / nchar(sequence) * 100,
              blast_tax = blast_tax) %>%
    right_join(abundance, by = 'sequence')
}

abundance_asv <- add_to_abund(abundance_asv, blast_results_asv)
abundance_otu <- add_to_abund(abundance_otu, blast_results_otu)

Finally, lets save that modified matrix for further analyses. Note that this overwrites the abundance matrix.

write_csv(abundance_asv, file.path('intermediate_data', 'abundance_asv.csv'))
write_csv(abundance_otu, file.path('intermediate_data', 'abundance_otu.csv'))

Software used

sessioninfo::session_info()

## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 4.0.3 (2020-10-10)
##  os       Pop!_OS 20.04 LTS           
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language en_US:en                    
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Vancouver           
##  date     2021-05-04                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package     * version    date       lib source        
##  ape           5.4-1      2020-08-13 [1] CRAN (R 4.0.2)
##  assertthat    0.2.1      2019-03-21 [1] CRAN (R 4.0.2)
##  bold          1.1.0      2020-06-17 [1] CRAN (R 4.0.2)
##  cli           2.1.0      2020-10-12 [1] CRAN (R 4.0.3)
##  cluster       2.1.0      2019-06-19 [4] CRAN (R 4.0.0)
##  codetools     0.2-16     2018-12-24 [4] CRAN (R 4.0.0)
##  colorspace    1.4-1      2019-03-18 [1] CRAN (R 4.0.2)
##  conditionz    0.1.0      2019-04-24 [1] CRAN (R 4.0.2)
##  crayon        1.3.4      2017-09-16 [1] CRAN (R 4.0.2)
##  crul          1.0.0      2020-07-30 [1] CRAN (R 4.0.2)
##  curl          4.3        2019-12-02 [1] CRAN (R 4.0.2)
##  data.table    1.13.2     2020-10-19 [1] CRAN (R 4.0.3)
##  digest        0.6.27     2020-10-24 [1] CRAN (R 4.0.3)
##  dplyr       * 1.0.2      2020-08-18 [1] CRAN (R 4.0.2)
##  DT          * 0.16       2020-10-13 [1] CRAN (R 4.0.3)
##  ellipsis      0.3.1      2020-05-15 [1] CRAN (R 4.0.2)
##  evaluate      0.14       2019-05-28 [1] CRAN (R 4.0.2)
##  fansi         0.4.1      2020-01-08 [1] CRAN (R 4.0.2)
##  foreach       1.5.1      2020-10-15 [1] CRAN (R 4.0.3)
##  furrr       * 0.2.1      2020-10-21 [1] CRAN (R 4.0.3)
##  future      * 1.19.1     2020-09-22 [1] CRAN (R 4.0.3)
##  generics      0.1.0      2020-10-31 [1] CRAN (R 4.0.3)
##  ggplot2     * 3.3.2      2020-06-19 [1] CRAN (R 4.0.2)
##  globals       0.13.1     2020-10-11 [1] CRAN (R 4.0.3)
##  glue          1.4.2      2020-08-27 [1] CRAN (R 4.0.2)
##  gridExtra     2.3        2017-09-09 [1] CRAN (R 4.0.3)
##  gtable        0.3.0      2019-03-25 [1] CRAN (R 4.0.2)
##  hms           0.5.3      2020-01-08 [1] CRAN (R 4.0.2)
##  htmltools     0.5.1.1    2021-01-22 [1] CRAN (R 4.0.3)
##  htmlwidgets   1.5.2      2020-10-03 [1] CRAN (R 4.0.3)
##  httpcode      0.3.0      2020-04-10 [1] CRAN (R 4.0.2)
##  iterators     1.0.13     2020-10-15 [1] CRAN (R 4.0.3)
##  jsonlite      1.7.1      2020-09-07 [1] CRAN (R 4.0.2)
##  knitr         1.30       2020-09-22 [1] CRAN (R 4.0.2)
##  lattice     * 0.20-41    2020-04-02 [4] CRAN (R 4.0.0)
##  lifecycle     0.2.0      2020-03-06 [1] CRAN (R 4.0.2)
##  listenv       0.8.0      2019-12-05 [1] CRAN (R 4.0.3)
##  magrittr      1.5        2014-11-22 [1] CRAN (R 4.0.2)
##  MASS          7.3-53     2020-09-09 [4] CRAN (R 4.0.2)
##  Matrix        1.2-18     2019-11-27 [4] CRAN (R 4.0.0)
##  metacoder   * 0.3.4      2020-04-29 [1] CRAN (R 4.0.3)
##  mgcv          1.8-33     2020-08-27 [4] CRAN (R 4.0.2)
##  munsell       0.5.0      2018-06-12 [1] CRAN (R 4.0.2)
##  nlme          3.1-149    2020-08-23 [4] CRAN (R 4.0.2)
##  permute     * 0.9-5      2019-03-12 [1] CRAN (R 4.0.2)
##  pillar        1.4.6      2020-07-10 [1] CRAN (R 4.0.2)
##  pkgconfig     2.0.3      2019-09-22 [1] CRAN (R 4.0.2)
##  plyr          1.8.6      2020-03-03 [1] CRAN (R 4.0.2)
##  purrr       * 0.3.4      2020-04-17 [1] CRAN (R 4.0.2)
##  qsubmitter  * 0.1        2020-11-11 [1] local         
##  R6            2.5.0      2020-10-28 [1] CRAN (R 4.0.3)
##  Rcpp          1.0.5      2020-07-06 [1] CRAN (R 4.0.2)
##  readr       * 1.4.0      2020-10-05 [1] CRAN (R 4.0.3)
##  reshape       0.8.8      2018-10-23 [1] CRAN (R 4.0.2)
##  rlang         0.4.10     2020-12-30 [1] CRAN (R 4.0.3)
##  rmarkdown     2.5        2020-10-21 [1] CRAN (R 4.0.3)
##  rstudioapi    0.11       2020-02-07 [1] CRAN (R 4.0.2)
##  scales        1.1.1      2020-05-11 [1] CRAN (R 4.0.2)
##  sessioninfo * 1.1.1      2018-11-05 [1] CRAN (R 4.0.2)
##  sharedbib     0.1.0.9003 2020-10-16 [1] local         
##  stringi       1.5.3      2020-09-09 [1] CRAN (R 4.0.2)
##  stringr     * 1.4.0      2019-02-10 [1] CRAN (R 4.0.2)
##  taxa        * 0.3.4      2020-04-29 [1] CRAN (R 4.0.3)
##  taxize      * 0.9.99     2020-10-30 [1] CRAN (R 4.0.3)
##  tibble        3.0.4      2020-10-12 [1] CRAN (R 4.0.3)
##  tidyr       * 1.1.2      2020-08-27 [1] CRAN (R 4.0.2)
##  tidyselect    1.1.0      2020-05-11 [1] CRAN (R 4.0.2)
##  triebeard     0.3.0      2016-08-04 [1] CRAN (R 4.0.2)
##  urltools      1.7.3      2019-04-14 [1] CRAN (R 4.0.2)
##  uuid          0.1-4      2020-02-26 [1] CRAN (R 4.0.2)
##  vctrs         0.3.4      2020-08-29 [1] CRAN (R 4.0.2)
##  vegan       * 2.5-6      2019-09-01 [1] CRAN (R 4.0.2)
##  viridis     * 0.5.1      2018-03-29 [1] CRAN (R 4.0.3)
##  viridisLite * 0.3.0      2018-02-01 [1] CRAN (R 4.0.2)
##  withr         2.3.0      2020-09-22 [1] CRAN (R 4.0.3)
##  xfun          0.19       2020-10-30 [1] CRAN (R 4.0.3)
##  xml2          1.3.2      2020-04-23 [1] CRAN (R 4.0.2)
##  yaml          2.2.1      2020-02-01 [1] CRAN (R 4.0.2)
##  zoo           1.8-8      2020-05-02 [1] CRAN (R 4.0.2)
## 
## [1] /home/fosterz/R/x86_64-pc-linux-gnu-library/4.0
## [2] /usr/local/lib/R/site-library
## [3] /usr/lib/R/site-library
## [4] /usr/lib/R/library