Non-target amplification

Prepare

Packages used

library(dplyr)
library(purrr)
library(furrr)
library(tidyr)
library(readr)
library(ggplot2)
library(sessioninfo)
library(metacoder)
library(vegan)
library(viridis)
library(DT)
library(stringr)
library(qsubmitter)
library(taxize)
library(forcats)
library(ips)
library(insect)
library(ape)
library(phangorn)

Parameters

minimum_read_count <- 10
seed <- 1
set.seed(seed)

Detecting non-target amplification

Since the reference databases will not have all the non-targets, it will be hard to detect non-target amplification. I will use blast against Genbank nt. Genbank is not curated, but it should be good enough to reliably assign a phylum.

abundance_asv <- read_csv(file.path('intermediate_data', 'abundance_asv.csv'))

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   sequence = col_character(),
##   blast_tax = col_character(),
##   taxonomy = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

abundance_asv

## # A tibble: 9,528 x 52
##    sequence blast_pid blast_cov blast_tax taxonomy    A1    A2    B1    B2    C1
##    <chr>        <dbl>     <dbl> <chr>     <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 AGTCCGT…      81.5      99.3 cellular… Eukaryo…     0     0     0     0     0
##  2 TTTCCGT…      89.0      99.6 cellular… Eukaryo…     0     0     0     0     0
##  3 GAAAATC…      82.0      99.3 cellular… Eukaryo…     0     0    18 10732    17
##  4 TTTCCGT…      90.9      99.6 cellular… Eukaryo…     0     0     0     0     0
##  5 TTTCCGT…      99.7      99.0 cellular… Eukaryo…     0     0     0     0     0
##  6 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0     0     0
##  7 AGTCCGT…      81.7      99.3 cellular… Eukaryo…     0     0     0     0     0
##  8 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0     0     0
##  9 AGTCCAC…      86.8      93.9 cellular… Eukaryo…     0     0     0     0     0
## 10 GAAAATC…      85.5      99.8 cellular… Eukaryo…     0     0     0     0     0
## # … with 9,518 more rows, and 42 more variables: C2 <dbl>, D1 <dbl>, D2 <dbl>,
## #   E1 <dbl>, E2 <dbl>, F1 <dbl>, F2 <dbl>, G1 <dbl>, G2 <dbl>, H1 <dbl>,
## #   A3 <dbl>, A4 <dbl>, B3 <dbl>, B4 <dbl>, C3 <dbl>, C4 <dbl>, D3 <dbl>,
## #   D4 <dbl>, E3 <dbl>, E4 <dbl>, F3 <dbl>, F4 <dbl>, G3 <dbl>, G4 <dbl>,
## #   H3 <dbl>, H4 <dbl>, A5 <dbl>, A6 <dbl>, B5 <dbl>, B6 <dbl>, C5 <dbl>,
## #   C6 <dbl>, D5 <dbl>, D6 <dbl>, E5 <dbl>, E6 <dbl>, F5 <dbl>, F6 <dbl>,
## #   G5 <dbl>, G6 <dbl>, H5 <dbl>, H6 <dbl>

abundance_otu <- read_csv(file.path('intermediate_data', 'abundance_otu.csv'))

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   sequence = col_character(),
##   blast_tax = col_character(),
##   taxonomy = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

abundance_otu

## # A tibble: 7,250 x 52
##    sequence blast_pid blast_cov blast_tax taxonomy    A1    A2    B1    B2    C1
##    <chr>        <dbl>     <dbl> <chr>     <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 AGTCCGT…      81.5      99.3 cellular… Eukaryo…     0     0     0     0     0
##  2 TTTCCGT…      89.0      99.6 cellular… Eukaryo…     0     0     0     0     0
##  3 TTTCCGT…      90.9      99.6 cellular… Eukaryo…     0     0     0     0     0
##  4 TTTCCGT…      99.7      99.0 cellular… Eukaryo…     0     0     0     0     0
##  5 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0     0     0
##  6 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0     0     0
##  7 AGTCCAC…      86.8      93.9 cellular… Eukaryo…     0     0     0     0     0
##  8 TTTCCGT…      99.2      99.6 cellular… Eukaryo…     0     0     0     0     0
##  9 AGTCCAC…      93.7      99.3 cellular… Eukaryo…     0     0     0     0     0
## 10 TTTCCGT…     100        98.7 cellular… Eukaryo…     0     0     0     0     0
## # … with 7,240 more rows, and 42 more variables: C2 <dbl>, D1 <dbl>, D2 <dbl>,
## #   E1 <dbl>, E2 <dbl>, F1 <dbl>, F2 <dbl>, G1 <dbl>, G2 <dbl>, H1 <dbl>,
## #   A3 <dbl>, A4 <dbl>, B3 <dbl>, B4 <dbl>, C3 <dbl>, C4 <dbl>, D3 <dbl>,
## #   D4 <dbl>, E3 <dbl>, E4 <dbl>, F3 <dbl>, F4 <dbl>, G3 <dbl>, G4 <dbl>,
## #   H3 <dbl>, H4 <dbl>, A5 <dbl>, A6 <dbl>, B5 <dbl>, B6 <dbl>, C5 <dbl>,
## #   C6 <dbl>, D5 <dbl>, D6 <dbl>, E5 <dbl>, E6 <dbl>, F5 <dbl>, F6 <dbl>,
## #   G5 <dbl>, G6 <dbl>, H5 <dbl>, H6 <dbl>

metadata <- read_csv(file.path('intermediate_data', 'metadata.csv'))

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   sample_id = col_character(),
##   primer_pair_id = col_character(),
##   dna_type = col_character(),
##   dna_sample_id = col_character(),
##   locus = col_character(),
##   forward = col_character(),
##   reverse = col_character(),
##   sample_type = col_character()
## )

metadata

## # A tibble: 48 x 8
##    sample_id primer_pair_id dna_type dna_sample_id locus forward reverse
##    <chr>     <chr>          <chr>    <chr>         <chr> <chr>   <chr>  
##  1 A1        rps10_Final    mock1    mock1         rps10 rps10-F rps10-R
##  2 A2        rps10_Final    leaf     L95           rps10 rps10-F rps10-R
##  3 A3        rps10_Felipe   mock1    mock1         rps10 rps10_… rps10-R
##  4 A4        rps10_Felipe   leaf     L95           rps10 rps10_… rps10-R
##  5 A5        ITS6/7         mock1    mock1         ITS   ITS6    ITS7   
##  6 A6        ITS6/7         leaf     L95           ITS   ITS6    ITS7   
##  7 B1        rps10_Final    mock2    mock2         rps10 rps10-F rps10-R
##  8 B2        rps10_Final    WR.soil  WS3           rps10 rps10-F rps10-R
##  9 B3        rps10_Felipe   mock2    mock2         rps10 rps10_… rps10-R
## 10 B4        rps10_Felipe   WR.soil  WS3           rps10 rps10_… rps10-R
## # … with 38 more rows, and 1 more variable: sample_type <chr>

Select only environmental samples

Non-target DNA is likely to be found the in the environmental samples, so I will restrict this analysis to those.

metadata <- filter(metadata,
                   dna_type %in% c("leaf", "WR.soil", "ag.soil", "drip", "Pan.soil"),
                   primer_pair_id %in% c('rps10_Final', 'ITS6/7'))
metadata

## # A tibble: 26 x 8
##    sample_id primer_pair_id dna_type dna_sample_id locus forward reverse
##    <chr>     <chr>          <chr>    <chr>         <chr> <chr>   <chr>  
##  1 A2        rps10_Final    leaf     L95           rps10 rps10-F rps10-R
##  2 A6        ITS6/7         leaf     L95           ITS   ITS6    ITS7   
##  3 B2        rps10_Final    WR.soil  WS3           rps10 rps10-F rps10-R
##  4 B6        ITS6/7         WR.soil  WS3           ITS   ITS6    ITS7   
##  5 C1        rps10_Final    ag.soil  AG1           rps10 rps10-F rps10-R
##  6 C2        rps10_Final    WR.soil  WS7           rps10 rps10-F rps10-R
##  7 C5        ITS6/7         ag.soil  AG1           ITS   ITS6    ITS7   
##  8 C6        ITS6/7         WR.soil  WS7           ITS   ITS6    ITS7   
##  9 D1        rps10_Final    drip     D3            rps10 rps10-F rps10-R
## 10 D2        rps10_Final    WR.soil  WS23          rps10 rps10-F rps10-R
## # … with 16 more rows, and 1 more variable: sample_type <chr>

abundance_asv <- select(abundance_asv, sequence:taxonomy, !!! metadata$sample_id)
abundance_asv

## # A tibble: 9,528 x 31
##    sequence blast_pid blast_cov blast_tax taxonomy    A2    A6    B2    B6    C1
##    <chr>        <dbl>     <dbl> <chr>     <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 AGTCCGT…      81.5      99.3 cellular… Eukaryo…     0     0     0     0     0
##  2 TTTCCGT…      89.0      99.6 cellular… Eukaryo…     0    44     0  2988     0
##  3 GAAAATC…      82.0      99.3 cellular… Eukaryo…     0     0 10732     0    17
##  4 TTTCCGT…      90.9      99.6 cellular… Eukaryo…     0     0     0   218     0
##  5 TTTCCGT…      99.7      99.0 cellular… Eukaryo…     0     0     0     0     0
##  6 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0  1122     0
##  7 AGTCCGT…      81.7      99.3 cellular… Eukaryo…     0     0     0     0     0
##  8 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0    57     0
##  9 AGTCCAC…      86.8      93.9 cellular… Eukaryo…     0     0     0     0     0
## 10 GAAAATC…      85.5      99.8 cellular… Eukaryo…     0     0     0     0     0
## # … with 9,518 more rows, and 21 more variables: C2 <dbl>, C5 <dbl>, C6 <dbl>,
## #   D1 <dbl>, D2 <dbl>, D5 <dbl>, D6 <dbl>, E1 <dbl>, E2 <dbl>, E5 <dbl>,
## #   E6 <dbl>, F1 <dbl>, F2 <dbl>, F5 <dbl>, F6 <dbl>, G1 <dbl>, G2 <dbl>,
## #   G5 <dbl>, G6 <dbl>, H1 <dbl>, H5 <dbl>

abundance_otu <- select(abundance_otu, sequence:taxonomy, !!! metadata$sample_id)
abundance_otu

## # A tibble: 7,250 x 31
##    sequence blast_pid blast_cov blast_tax taxonomy    A2    A6    B2    B6    C1
##    <chr>        <dbl>     <dbl> <chr>     <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 AGTCCGT…      81.5      99.3 cellular… Eukaryo…     0     0     0     0     0
##  2 TTTCCGT…      89.0      99.6 cellular… Eukaryo…     0    44     0  2988     0
##  3 TTTCCGT…      90.9      99.6 cellular… Eukaryo…     0     0     0   218     0
##  4 TTTCCGT…      99.7      99.0 cellular… Eukaryo…     0     0     0     0     0
##  5 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0  1122     0
##  6 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0    57     0
##  7 AGTCCAC…      86.8      93.9 cellular… Eukaryo…     0     0     0     0     0
##  8 TTTCCGT…      99.2      99.6 cellular… Eukaryo…     0     0     0     0     0
##  9 AGTCCAC…      93.7      99.3 cellular… Eukaryo…     0     0     0     0     0
## 10 TTTCCGT…     100        98.7 cellular… Eukaryo…     0     0     0     0     0
## # … with 7,240 more rows, and 21 more variables: C2 <dbl>, C5 <dbl>, C6 <dbl>,
## #   D1 <dbl>, D2 <dbl>, D5 <dbl>, D6 <dbl>, E1 <dbl>, E2 <dbl>, E5 <dbl>,
## #   E6 <dbl>, F1 <dbl>, F2 <dbl>, F5 <dbl>, F6 <dbl>, G1 <dbl>, G2 <dbl>,
## #   G5 <dbl>, G6 <dbl>, H1 <dbl>, H5 <dbl>

Filtering out low abundance_asv ASVs

Most low-abundance_asv ASVs would normally be filtered out during most analyses, so I will do that to.

abundance_asv <- abundance_asv[rowSums(abundance_asv[, metadata$sample_id]) >= minimum_read_count, ]
abundance_asv

## # A tibble: 2,571 x 31
##    sequence blast_pid blast_cov blast_tax taxonomy    A2    A6    B2    B6    C1
##    <chr>        <dbl>     <dbl> <chr>     <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 TTTCCGT…      89.0      99.6 cellular… Eukaryo…     0    44     0  2988     0
##  2 GAAAATC…      82.0      99.3 cellular… Eukaryo…     0     0 10732     0    17
##  3 TTTCCGT…      90.9      99.6 cellular… Eukaryo…     0     0     0   218     0
##  4 TTTCCGT…      99.7      99.0 cellular… Eukaryo…     0     0     0     0     0
##  5 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0  1122     0
##  6 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0    57     0
##  7 GAAAATC…      85.5      99.8 cellular… Eukaryo…     0     0     0     0     0
##  8 GAAAATC…      82.3      99.3 cellular… Eukaryo…     0     0     0     0     0
##  9 TTTCCGT…     100        98.7 cellular… Eukaryo…     0     0     0     0     0
## 10 GAAAATC…      74.3      59.2 cellular… Eukaryo…     0     0     0     0     0
## # … with 2,561 more rows, and 21 more variables: C2 <dbl>, C5 <dbl>, C6 <dbl>,
## #   D1 <dbl>, D2 <dbl>, D5 <dbl>, D6 <dbl>, E1 <dbl>, E2 <dbl>, E5 <dbl>,
## #   E6 <dbl>, F1 <dbl>, F2 <dbl>, F5 <dbl>, F6 <dbl>, G1 <dbl>, G2 <dbl>,
## #   G5 <dbl>, G6 <dbl>, H1 <dbl>, H5 <dbl>

abundance_otu <- abundance_otu[rowSums(abundance_otu[, metadata$sample_id]) >= minimum_read_count, ]
abundance_otu

## # A tibble: 1,773 x 31
##    sequence blast_pid blast_cov blast_tax taxonomy    A2    A6    B2    B6
##    <chr>        <dbl>     <dbl> <chr>     <chr>    <dbl> <dbl> <dbl> <dbl>
##  1 TTTCCGT…      89.0      99.6 cellular… Eukaryo…     0    44     0  2988
##  2 TTTCCGT…      90.9      99.6 cellular… Eukaryo…     0     0     0   218
##  3 TTTCCGT…      99.7      99.0 cellular… Eukaryo…     0     0     0     0
##  4 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0  1122
##  5 TTTCCGT…     100        99.6 cellular… Eukaryo…     0     0     0    57
##  6 TTTCCGT…     100        98.7 cellular… Eukaryo…     0     0     0     0
##  7 GAAAATC…      74.3      59.2 cellular… Eukaryo…     0     0     0     0
##  8 TTTCCGT…     100        99.4 cellular… Eukaryo…     0     0     0 86209
##  9 GAAAATC…      96.1      99.8 cellular… Eukaryo…     0     0     0     0
## 10 GAAAATC…      99.3      99.8 cellular… Eukaryo…  3542     0     0     0
## # … with 1,763 more rows, and 22 more variables: C1 <dbl>, C2 <dbl>, C5 <dbl>,
## #   C6 <dbl>, D1 <dbl>, D2 <dbl>, D5 <dbl>, D6 <dbl>, E1 <dbl>, E2 <dbl>,
## #   E5 <dbl>, E6 <dbl>, F1 <dbl>, F2 <dbl>, F5 <dbl>, F6 <dbl>, G1 <dbl>,
## #   G2 <dbl>, G5 <dbl>, G6 <dbl>, H1 <dbl>, H5 <dbl>

Group ASVs into target and non-target

group_key <- c(Oomycetes = 'Oomycetes|Oomycota', Fungi = 'Fungi', Bacteria = 'Bacteria', Plant = 'Viridiplantae', 
               Archaea = 'Archaea', Animals = 'Metazoa', Virus = 'Viruses', Protist = 'Jakobida|Alveolata|Euglenozoa|Amoebozoa')

groups_target_nontarget <- function(abundance) {
  map_chr(abundance$blast_tax, function(x) {
    if (is.na(x)) {
      return('Unknown')
    }
    is_org <- map_lgl(group_key, grepl, x = x)
    if (sum(is_org) > 1) {
      stop('multiple matches')
    } else if (sum(is_org) == 1) {
      return(names(group_key)[is_org])
    } else {
      return('Other')
    }
  })
}
abundance_asv$group <- groups_target_nontarget(abundance_asv)
abundance_otu$group <- groups_target_nontarget(abundance_otu)

Tally number of reads/ASVs per sample in each group

I will make a table with the numbers of ASVs, reads, and the proportion of reads for each group. First the ASV counts:

groups <- unique(abundance_asv$group)

count_seqs <- function(abundance, measure) {
  counts <- map(metadata$sample_id, function(id) {
    map_dbl(groups, function(group) {
      is_group <- abundance$group == group & abundance[[id]] > 0
      sum(is_group)
    })
  })
  names(counts) <- metadata$sample_id
  counts <- as_tibble(c(group = list(groups), 
                        measure = list(rep(measure, length(group_key))),
                        counts))
  counts
}
asv_counts <- count_seqs(abundance_asv, 'ASVs')
otu_counts <- count_seqs(abundance_otu, 'OTUs')

Then read counts

count_reads <- function(abundance) {
  read_counts <- map(metadata$sample_id, function(id) {
    map_dbl(groups, function(group) {
      is_group <- abundance$group == group & abundance[[id]] > 0
      sum(abundance[[id]][is_group])
    })
  })
  names(read_counts) <- metadata$sample_id
  read_counts <- as_tibble(c(group = list(groups), 
                             measure = list(rep('read count', length(group_key))),
                             read_counts))
  read_counts
}
read_counts_asv <- count_reads(abundance_asv)
read_counts_otu <- count_reads(abundance_otu)

And then I can convert those count to proportions:

convert_to_prop <- function(read_counts, abundance) {
  read_props <- read_counts
  read_props[metadata$sample_id] <- map(metadata$sample_id, function(id) {
    read_props[[id]] / sum(abundance[[id]])
  })
  read_props$measure <- 'Reads'
  read_props
}
read_props_asv <- convert_to_prop(read_counts_asv, abundance_asv)
read_props_otu <- convert_to_prop(read_counts_otu, abundance_otu)

Plot barchart of counts

Finally I can combine these into a single table and add some metadata

group_abund <- bind_rows(otu_counts, asv_counts, read_counts_asv, read_props_asv) %>%
  gather(key = 'sample_id', value = 'abund', !!! metadata$sample_id) %>%
  left_join(metadata, by = 'sample_id')
group_abund

## # A tibble: 832 x 11
##    group measure sample_id abund primer_pair_id dna_type dna_sample_id locus
##    <chr> <chr>   <chr>     <dbl> <chr>          <chr>    <chr>         <chr>
##  1 Oomy… OTUs    A2            2 rps10_Final    leaf     L95           rps10
##  2 Fungi OTUs    A2            0 rps10_Final    leaf     L95           rps10
##  3 Other OTUs    A2            0 rps10_Final    leaf     L95           rps10
##  4 Plant OTUs    A2            0 rps10_Final    leaf     L95           rps10
##  5 Prot… OTUs    A2            0 rps10_Final    leaf     L95           rps10
##  6 Anim… OTUs    A2            0 rps10_Final    leaf     L95           rps10
##  7 Bact… OTUs    A2            0 rps10_Final    leaf     L95           rps10
##  8 Unkn… OTUs    A2            8 rps10_Final    leaf     L95           rps10
##  9 Oomy… ASVs    A2            6 rps10_Final    leaf     L95           rps10
## 10 Fungi ASVs    A2            0 rps10_Final    leaf     L95           rps10
## # … with 822 more rows, and 3 more variables: forward <chr>, reverse <chr>,
## #   sample_type <chr>

I will calculate the proportion of ASVs and OTUs assigned to each group for each locus

group_props <- group_abund %>%
  group_by(measure, locus) %>%
  mutate(sum_abund = sum(abund)) %>%
  group_by(group, .add = TRUE) %>%
  summarise(prop = round(sum(abund) / sum_abund[1], digits = 4)) %>%
  filter(measure %in% c("ASVs", "OTUs", "read count"))

## `summarise()` regrouping output by 'measure', 'locus' (override with `.groups` argument)

datatable(group_props)

Check that proportions sum to 1 (there will be a bit of rounding error):

group_props %>%
  summarise(sum = sum(prop))

## `summarise()` regrouping output by 'measure' (override with `.groups` argument)

## # A tibble: 6 x 3
## # Groups:   measure [3]
##   measure    locus   sum
##   <chr>      <chr> <dbl>
## 1 ASVs       ITS    1   
## 2 ASVs       rps10  1   
## 3 OTUs       ITS    1   
## 4 OTUs       rps10  1   
## 5 read count ITS    1.00
## 6 read count rps10  1.00

Lets take a look at what proportion of oomycetes was found for each metric:

filter(group_props, group == "Oomycetes")

## # A tibble: 6 x 4
## # Groups:   measure, locus [6]
##   measure    locus group       prop
##   <chr>      <chr> <chr>      <dbl>
## 1 ASVs       ITS   Oomycetes 0.107 
## 2 ASVs       rps10 Oomycetes 0.848 
## 3 OTUs       ITS   Oomycetes 0.0552
## 4 OTUs       rps10 Oomycetes 0.533 
## 5 read count ITS   Oomycetes 0.630 
## 6 read count rps10 Oomycetes 0.995

Finally, lets plot this information:

nontarget_plot <- group_abund %>% 
  filter(measure %in% c('Reads', 'ASVs', 'OTUs')) %>%
  mutate(dna_type = fct_collapse(dna_type,
                                 Water = c('drip'),
                                 Plant = c('leaf'),
                                 Soil = c('ag.soil', 'Pan.soil', 'WR.soil')),
         measure = factor(measure, levels = c('Reads', 'ASVs', 'OTUs'), ordered = TRUE),
         locus = ordered(c(rps10 = 'rps10', ITS = 'ITS1')[locus], levels = c('rps10', 'ITS1'))) %>%
  group_by(measure, locus, dna_type) %>%
  mutate(abund = abund / sum(abund)) %>%
  mutate(group = fct_collapse(group,
                              Oomycetes = c('Oomycetes'),
                              Fungi = c('Fungi'),
                              Unknown = c('Unknown'),
                              Other = c('Protist', 'Other', 'Plant', 'Bacteria', 'Animals'))) %>%
  mutate(group = factor(group, levels = c("Unknown", "Other", "Fungi", "Oomycetes"), ordered = TRUE)) %>%
  # ggplot(aes(x = locus, y = abund, fill = group)) +
  ggplot(aes(x = dna_type, y = abund, fill = group)) +
  geom_bar(stat = 'identity') +
  # facet_grid(measure ~ dna_type) +
  facet_grid(measure ~ locus) +
  scale_fill_viridis_d(begin = 0.8, end = 0.2) +
  labs(x = NULL, y = 'Proportion', fill = NULL) +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = 'bottom') 
ggsave(nontarget_plot, filename = 'non_target_amplification.pdf', path = 'results', width = 5, height = 5)
nontarget_plot

Caption:

Figure #: Target vs non-target amplification using oomycete-specific primers for the ITS1 and RPS10 loci. Reads and ASVs are from a variety of environmental samples, grouped into soil, water, and plant tissue samples. ASV sequences were given a coarse taxonomic assignment based on BLAST searches against the NCBI nucleotide sequence database. Those assigned “Unknown” did not have a match with a E-value of at least 0.001. Sequences in “Other” include plant, animal, bacterial, and protist sequences.

Look at unknown rps10 sequences

unknown_asv_data <- abundance_asv %>% 
  gather(key = 'sample_id', value = 'count', !!! metadata$sample_id) %>%
  left_join(metadata, by = 'sample_id') %>%
  filter(locus == 'rps10', group == 'Unknown', count > 0) %>%
  select(sequence, sample_id, count, dna_type, sample_type)
datatable(unknown_asv_data)

It seems a lot these are short:

hist(nchar(unknown_asv_data$sequence))

And somewhat low abundance:

hist(unknown_asv_data$count)

The really short and low abundance ones are probably errors, so lets remove those:

unknown_asv_data <- filter(unknown_asv_data, nchar(sequence) >= 100, count >= 100)

I will get a random sample of oomycete reference sequences and see if they cluster together with any in particular or form separate clades

rps10_seqs <- read_fasta(file.path('intermediate_data', 'reference_databases', 'rps10_reference_db.fa'))
names(rps10_seqs) <- gsub(names(rps10_seqs), pattern = ';oodb_[0-9]+;$', replacement = '') %>%
  gsub(pattern = '^.+;', replacement = '') %>%
  gsub(pattern = '_', replacement = ' ')
ref_subsample <- rps10_seqs[!duplicated(names(rps10_seqs))]
ref_subsample <- ref_subsample[sample(length(ref_subsample), 10)]

And make a multiple sequence alignment with both the reference sequences and unknown sequences

unknown_asv_seqs <- setNames(unknown_asv_data$sequence, paste0('ASV ', seq_along(unknown_asv_data$sequence), ' (', unknown_asv_data$count, ')'))
aligned <- c(ref_subsample, unknown_asv_seqs) %>%
  char2dna() %>%
  mafft(method = 'localpair', exec = 'mafft')

It turns out that the unknown sequences are very different from eachother and the reference sequences; so much so that I could not calculate a distance matrix to make a tree.

image(aligned)

I manually BLASTed a few of the sequences on NCBI’s website to verify the BLAST on all of the ASVs worked properly and did not find any close matches. These seem to be some kind of errors or mispriming against unknown organisms.

Software used

sessioninfo::session_info()

## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 4.0.3 (2020-10-10)
##  os       Pop!_OS 20.04 LTS           
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language en_US:en                    
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Vancouver           
##  date     2021-05-20                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package     * version    date       lib source        
##  ade4          1.7-16     2020-10-28 [1] CRAN (R 4.0.3)
##  ape         * 5.4-1      2020-08-13 [1] CRAN (R 4.0.2)
##  aphid         1.3.3      2019-05-08 [1] CRAN (R 4.0.3)
##  askpass       1.1        2019-01-13 [1] CRAN (R 4.0.2)
##  assertthat    0.2.1      2019-03-21 [1] CRAN (R 4.0.2)
##  bold          1.1.0      2020-06-17 [1] CRAN (R 4.0.2)
##  cli           2.1.0      2020-10-12 [1] CRAN (R 4.0.3)
##  cluster       2.1.0      2019-06-19 [4] CRAN (R 4.0.0)
##  codetools     0.2-16     2018-12-24 [4] CRAN (R 4.0.0)
##  colorspace    1.4-1      2019-03-18 [1] CRAN (R 4.0.2)
##  conditionz    0.1.0      2019-04-24 [1] CRAN (R 4.0.2)
##  crayon        1.3.4      2017-09-16 [1] CRAN (R 4.0.2)
##  crosstalk     1.1.0.1    2020-03-13 [1] CRAN (R 4.0.2)
##  crul          1.0.0      2020-07-30 [1] CRAN (R 4.0.2)
##  curl          4.3        2019-12-02 [1] CRAN (R 4.0.2)
##  data.table    1.13.2     2020-10-19 [1] CRAN (R 4.0.3)
##  digest        0.6.27     2020-10-24 [1] CRAN (R 4.0.3)
##  dplyr       * 1.0.2      2020-08-18 [1] CRAN (R 4.0.2)
##  DT          * 0.16       2020-10-13 [1] CRAN (R 4.0.3)
##  ellipsis      0.3.1      2020-05-15 [1] CRAN (R 4.0.2)
##  evaluate      0.14       2019-05-28 [1] CRAN (R 4.0.2)
##  fansi         0.4.1      2020-01-08 [1] CRAN (R 4.0.2)
##  farver        2.0.3      2020-01-16 [1] CRAN (R 4.0.2)
##  fastmatch     1.1-0      2017-01-28 [1] CRAN (R 4.0.2)
##  forcats     * 0.5.0      2020-03-01 [1] CRAN (R 4.0.2)
##  foreach       1.5.1      2020-10-15 [1] CRAN (R 4.0.3)
##  furrr       * 0.2.1      2020-10-21 [1] CRAN (R 4.0.3)
##  future      * 1.19.1     2020-09-22 [1] CRAN (R 4.0.3)
##  generics      0.1.0      2020-10-31 [1] CRAN (R 4.0.3)
##  ggplot2     * 3.3.2      2020-06-19 [1] CRAN (R 4.0.2)
##  globals       0.13.1     2020-10-11 [1] CRAN (R 4.0.3)
##  glue          1.4.2      2020-08-27 [1] CRAN (R 4.0.2)
##  gridExtra     2.3        2017-09-09 [1] CRAN (R 4.0.3)
##  gtable        0.3.0      2019-03-25 [1] CRAN (R 4.0.2)
##  hms           0.5.3      2020-01-08 [1] CRAN (R 4.0.2)
##  htmltools     0.5.1.1    2021-01-22 [1] CRAN (R 4.0.3)
##  htmlwidgets   1.5.2      2020-10-03 [1] CRAN (R 4.0.3)
##  httpcode      0.3.0      2020-04-10 [1] CRAN (R 4.0.2)
##  igraph        1.2.6      2020-10-06 [1] CRAN (R 4.0.3)
##  insect      * 1.2.0      2018-11-25 [1] CRAN (R 4.0.3)
##  ips         * 0.0.11     2019-07-04 [1] CRAN (R 4.0.3)
##  iterators     1.0.13     2020-10-15 [1] CRAN (R 4.0.3)
##  jsonlite      1.7.1      2020-09-07 [1] CRAN (R 4.0.2)
##  kmer          1.1.2      2019-05-20 [1] CRAN (R 4.0.3)
##  knitr         1.30       2020-09-22 [1] CRAN (R 4.0.2)
##  labeling      0.4.2      2020-10-20 [1] CRAN (R 4.0.3)
##  lattice     * 0.20-41    2020-04-02 [4] CRAN (R 4.0.0)
##  lifecycle     0.2.0      2020-03-06 [1] CRAN (R 4.0.2)
##  listenv       0.8.0      2019-12-05 [1] CRAN (R 4.0.3)
##  magrittr      1.5        2014-11-22 [1] CRAN (R 4.0.2)
##  MASS          7.3-53     2020-09-09 [4] CRAN (R 4.0.2)
##  Matrix        1.2-18     2019-11-27 [4] CRAN (R 4.0.0)
##  metacoder   * 0.3.4      2020-04-29 [1] CRAN (R 4.0.3)
##  mgcv          1.8-33     2020-08-27 [4] CRAN (R 4.0.2)
##  munsell       0.5.0      2018-06-12 [1] CRAN (R 4.0.2)
##  nlme          3.1-149    2020-08-23 [4] CRAN (R 4.0.2)
##  openssl       1.4.3      2020-09-18 [1] CRAN (R 4.0.2)
##  permute     * 0.9-5      2019-03-12 [1] CRAN (R 4.0.2)
##  phangorn    * 2.5.5      2019-06-19 [1] CRAN (R 4.0.2)
##  phylogram     2.1.0      2018-06-25 [1] CRAN (R 4.0.3)
##  pillar        1.4.6      2020-07-10 [1] CRAN (R 4.0.2)
##  pkgconfig     2.0.3      2019-09-22 [1] CRAN (R 4.0.2)
##  plyr          1.8.6      2020-03-03 [1] CRAN (R 4.0.2)
##  prettyunits   1.1.1      2020-01-24 [1] CRAN (R 4.0.2)
##  progress      1.2.2      2019-05-16 [1] CRAN (R 4.0.2)
##  purrr       * 0.3.4      2020-04-17 [1] CRAN (R 4.0.2)
##  qsubmitter  * 0.1        2020-11-11 [1] local         
##  quadprog      1.5-8      2019-11-20 [1] CRAN (R 4.0.2)
##  R6            2.5.0      2020-10-28 [1] CRAN (R 4.0.3)
##  RANN          2.6.1      2019-01-08 [1] CRAN (R 4.0.3)
##  Rcpp          1.0.5      2020-07-06 [1] CRAN (R 4.0.2)
##  readr       * 1.4.0      2020-10-05 [1] CRAN (R 4.0.3)
##  reshape       0.8.8      2018-10-23 [1] CRAN (R 4.0.2)
##  rlang         0.4.10     2020-12-30 [1] CRAN (R 4.0.3)
##  rmarkdown     2.5        2020-10-21 [1] CRAN (R 4.0.3)
##  rstudioapi    0.11       2020-02-07 [1] CRAN (R 4.0.2)
##  scales        1.1.1      2020-05-11 [1] CRAN (R 4.0.2)
##  seqinr        4.2-4      2020-10-10 [1] CRAN (R 4.0.3)
##  sessioninfo * 1.1.1      2018-11-05 [1] CRAN (R 4.0.2)
##  sharedbib     0.1.0.9003 2020-10-16 [1] local         
##  stringi       1.5.3      2020-09-09 [1] CRAN (R 4.0.2)
##  stringr     * 1.4.0      2019-02-10 [1] CRAN (R 4.0.2)
##  taxa        * 0.3.4      2020-04-29 [1] CRAN (R 4.0.3)
##  taxize      * 0.9.99     2020-10-30 [1] CRAN (R 4.0.3)
##  tibble        3.0.4      2020-10-12 [1] CRAN (R 4.0.3)
##  tidyr       * 1.1.2      2020-08-27 [1] CRAN (R 4.0.2)
##  tidyselect    1.1.0      2020-05-11 [1] CRAN (R 4.0.2)
##  utf8          1.1.4      2018-05-24 [1] CRAN (R 4.0.2)
##  uuid          0.1-4      2020-02-26 [1] CRAN (R 4.0.2)
##  vctrs         0.3.4      2020-08-29 [1] CRAN (R 4.0.2)
##  vegan       * 2.5-6      2019-09-01 [1] CRAN (R 4.0.2)
##  viridis     * 0.5.1      2018-03-29 [1] CRAN (R 4.0.3)
##  viridisLite * 0.3.0      2018-02-01 [1] CRAN (R 4.0.2)
##  withr         2.3.0      2020-09-22 [1] CRAN (R 4.0.3)
##  xfun          0.19       2020-10-30 [1] CRAN (R 4.0.3)
##  XML           3.99-0.5   2020-07-23 [1] CRAN (R 4.0.2)
##  xml2          1.3.2      2020-04-23 [1] CRAN (R 4.0.2)
##  yaml          2.2.1      2020-02-01 [1] CRAN (R 4.0.2)
##  zoo           1.8-8      2020-05-02 [1] CRAN (R 4.0.2)
## 
## [1] /home/fosterz/R/x86_64-pc-linux-gnu-library/4.0
## [2] /usr/local/lib/R/site-library
## [3] /usr/lib/R/site-library
## [4] /usr/lib/R/library