Prepare

Packages used

library(dplyr)
library(purrr)
library(furrr)
library(tidyr)
library(readr)
library(ggplot2)
library(sessioninfo)
library(metacoder)
library(vegan)
library(viridis)
library(DT)
library(stringr)
library(qsubmitter)
library(taxize)
library(forcats)
library(Biostrings)
library(ips)
library(insect)
library(ggpubr)
library(knitr)

Parameters

seed <- 1
only_shared_species <- TRUE # If TRUE, only use species that are present in both databases
subsample_to_same_num <- TRUE # If TRUE, subsample the amplicons used to the same number for each database
set.seed(seed)

Parallel processing

Commands that have “future” in them are run on multiple cores using the furrr and future packages.

plan(multiprocess)
## Warning: [ONE-TIME WARNING] Forked processing ('multicore') is disabled
## in future (>= 1.13.0) when running R from RStudio, because it is
## considered unstable. Because of this, plan("multicore") will fall
## back to plan("sequential"), and plan("multiprocess") will fall back to
## plan("multisession") - not plan("multicore") as in the past. For more details,
## how to control forked processing or not, and how to silence this warning in
## future R sessions, see ?future::supportsMulticore

Introduction

This analysis will attempt to estimate how good each locus is at distinguishing species, what I am calling “taxonomic resolution”. I will use the primer sequences and reference databases for each method to extract the portion of the sequence that would be amplified. I then align each sequence to all other sequences and record how many base pairs different each sequence is from the most similar sequence from a different species. The method with the highest numbers should be the one with the highest taxonomic resolution. This is similar to a barcode gap analysis, except that only the most similar comparisons are considered.

Extract all oomycete from reference databases

rps_ref <- read_fasta(file.path('intermediate_data', 'reference_databases', 'rps10_reference_db.fa'))
its_ref <- read_fasta(file.path('intermediate_data', 'reference_databases', 'its1_reference_db.fa'))
rps_ref <- rps_ref[grepl(names(rps_ref), pattern = 'Oomycetes')]
its_ref <- its_ref[grepl(names(its_ref), pattern = 'Oomycetes')]

Remove spaces from sequences (not sure why there would be spaces).

rps_ref <- trimws(gsub(rps_ref, pattern = ' ', replacement = ''))
its_ref <- trimws(gsub(its_ref, pattern = ' ', replacement = ''))

Replace names with just species names

names(rps_ref) <- str_match(names(rps_ref), ';([a-zA-Z0-9_\\-\'". ]+);oodb_.+;$')[,2]
names(its_ref) <- str_match(names(its_ref), ';([a-zA-Z0-9_\\-\'". ]+);(unite|rob2011|phytodb|mock)_.+;$')[,2]

Replace underscores with spaces

names(rps_ref) <- gsub(names(rps_ref), pattern = '_', replacement = ' ')
names(its_ref) <- gsub(names(its_ref), pattern = '_', replacement = ' ')

Remove species with ambiguous names or numbers

is_ambiguous <- function(n) {
  grepl(n, pattern = ' sp$') |
    grepl(n, pattern = ' spp$') | 
    grepl(n, pattern = ' spp.') |
    grepl(n, pattern = ' sp.') |
    grepl(n, pattern = ' aff. ')  | 
    grepl(n, pattern = ' cf.') |
    grepl(n, pattern = '[0-9]+') 
}
rps_ref <- rps_ref[! is_ambiguous(names(rps_ref))]
its_ref <- its_ref[! is_ambiguous(names(its_ref))]

Remove information below species

names(rps_ref) <- sub(names(rps_ref), pattern = ' var\\.?.+$', replacement = '')
names(rps_ref) <- sub(names(rps_ref), pattern = ' subsp\\.?.+$', replacement = '')
names(its_ref) <- sub(names(its_ref), pattern = ' var\\.?.+$', replacement = '')
names(its_ref) <- sub(names(its_ref), pattern = ' subsp\\.?.+$', replacement = '')

Now the names should be cleaned up enough that they can be compared to eachother:

unique(names(rps_ref))
##   [1] "Achlya hypogyna"                  "Albugo laibachii"                
##   [3] "Aphanomyces astaci"               "Aphanomyces cladogamus"          
##   [5] "Aphanomyces cochlioides"          "Aphanomyces euteiches"           
##   [7] "Aphanomyces frigidophilus"        "Aphanomyces invadans"            
##   [9] "Aphanomyces laevis"               "Aphanomyces stellatus"           
##  [11] "Bremia lactucae"                  "Bremia sonchicola"               
##  [13] "Halophytophthora bahamensis"      "Halophytophthora batemanensis"   
##  [15] "Halophytophthora polymorphica"    "Halophytophthora porrigovesica"  
##  [17] "Hyaloperonospora parasitica"      "Peronosclerospora philippinensis"
##  [19] "Peronosclerospora sacchari"       "Peronosclerospora sorghi"        
##  [21] "Peronospora effusa"               "Peronospora atriplicis-hastatae" 
##  [23] "Peronospora belbahrii"            "Peronospora boni-henrici"        
##  [25] "Peronospora lepigoni"             "Peronospora litoralis"           
##  [27] "Peronospora minor"                "Peronospora obovata"             
##  [29] "Peronospora polycarpi"            "Peronospora rumicis"             
##  [31] "Peronospora schachtii"            "Peronospora tabacina"            
##  [33] "Peronospora destructor"           "Phytophthora agathidicida"       
##  [35] "Phytophthora alni"                "Phytophthora amnicola"           
##  [37] "Phytophthora andina"              "Phytophthora aquimorbida"        
##  [39] "Phytophthora arecae"              "Phytophthora arenaria"           
##  [41] "Phytophthora asiatica"            "Phytophthora asparagi"           
##  [43] "Phytophthora austrocedri"         "Phytophthora betacei"            
##  [45] "Phytophthora bilorbang"           "Phytophthora bishii"             
##  [47] "Phytophthora bisheria"            "Phytophthora boehmeriae"         
##  [49] "Phytophthora boodjera"            "Phytophthora borealis"           
##  [51] "Phytophthora botryosa"            "Phytophthora brassicae"          
##  [53] "Phytophthora cactorum"            "Phytophthora cajani"             
##  [55] "Phytophthora cambivora"           "Phytophthora capensis"           
##  [57] "Phytophthora capsici"             "Phytophthora captiosa"           
##  [59] "Phytophthora chlamydospora"       "Phytophthora cinnamomi"          
##  [61] "Phytophthora citrophthora"        "Phytophthora clandestina"        
##  [63] "Phytophthora colocasiae"          "Phytophthora condilina"          
##  [65] "Phytophthora constricta"          "Phytophthora cooljarloo"         
##  [67] "Phytophthora crassamura"          "Phytophthora cryptogea"          
##  [69] "Phytophthora cuyabensis"          "Phytophthora cyperi"             
##  [71] "Phytophthora drechsleri"          "Phytophthora elongata"           
##  [73] "Phytophthora erythroseptica"      "Phytophthora europaea"           
##  [75] "Phytophthora fallax"              "Phytophthora fluvialis"          
##  [77] "Phytophthora foliorum"            "Phytophthora fragariae"          
##  [79] "Phytophthora frigida"             "Phytophthora gallica"            
##  [81] "Phytophthora gemini"              "Phytophthora gibbosa"            
##  [83] "Phytophthora glovera"             "Phytophthora gonapodyides"       
##  [85] "Phytophthora gondwanensis"        "Phytophthora gregata"            
##  [87] "Phytophthora hedraiandra"         "Phytophthora heveae"             
##  [89] "Phytophthora hibernalis"          "Phytophthora himalsilva"         
##  [91] "Phytophthora humicola"            "Phytophthora hydrogena"          
##  [93] "Phytophthora hydropathica"        "Phytophthora idaei"              
##  [95] "Phytophthora ilicis"              "Phytophthora infestans"          
##  [97] "Phytophthora insolita"            "Phytophthora ipomoeae"           
##  [99] "Phytophthora iranica"             "Phytophthora irrigata"           
## [101] "Phytophthora katsurae"            "Phytophthora kelmania"           
## [103] "Phytophthora kernoviae"           "Phytophthora kwongonina"         
## [105] "Phytophthora lacustris"           "Phytophthora lagoariana"         
## [107] "Phytophthora lateralis"           "Phytophthora litoralis"          
## [109] "Phytophthora macrochlamydospora"  "Phytophthora meadii"             
## [111] "Phytophthora medicaginis"         "Phytophthora megakarya"          
## [113] "Phytophthora megasperma"          "Phytophthora melonis"            
## [115] "Phytophthora mengei"              "Phytophthora mexicana"           
## [117] "Phytophthora mirabilis"           "Phytophthora mirablis"           
## [119] "Phytophthora moyootj"             "Phytophthora multivesiculata"    
## [121] "Phytophthora multivora"           "Phytophthora nicotianae"         
## [123] "Phytophthora niederhauserii"      "Phytophthora oreophila"          
## [125] "Phytophthora ornamentata"         "Phytophthora cacuminis"          
## [127] "Phytophthora palmivora"           "Phytophthora parsiana"           
## [129] "Phytophthora parvispora"          "Phytophthora phaseoli"           
## [131] "Phytophthora pini"                "Phytophthora pinifolia"          
## [133] "Phytophthora pistaciae"           "Phytophthora plurivora"          
## [135] "Phytophthora pluvialis"           "Phytophthora polonica"           
## [137] "Phytophthora porri"               "Phytophthora primulae"           
## [139] "Phytophthora pseudocryptogea"     "Phytophthora pseudorosacearum"   
## [141] "Phytophthora pseudosyringae"      "Phytophthora pseudotsugae"       
## [143] "Phytophthora psychrophila"        "Phytophthora quercetorum"        
## [145] "Phytophthora quercina"            "Phytophthora quininea"           
## [147] "Phytophthora ramorum"             "Phytophthora richardiae"         
## [149] "Phytophthora riparia"             "Phytophthora rosacearum"         
## [151] "Phytophthora rubi"                "Phytophthora sansomeana"         
## [153] "Phytophthora siskiyouensis"       "Phytophthora sulawesiensis"      
## [155] "Phytophthora syringae"            "Phytophthora tentaculata"        
## [157] "Phytophthora thermophila"         "Phytophthora trifolii"           
## [159] "Phytophthora tropicalis"          "Phytophthora uliginosa"          
## [161] "Phytophthora versiformis"         "Phytophthora vignae"             
## [163] "Phytophthora x stagnum"           "Phytophthora citricola"          
## [165] "Phytophthora nemorosa"            "Phytophthora sojae"              
## [167] "Phytopythium aichiense"           "Phytopythium carbonicum"         
## [169] "Phytopythium chamaehyphon"        "Phytopythium citrinum"           
## [171] "Phytopythium kandeliae"           "Phytopythium litorale"           
## [173] "Phytopythium montanum"            "Phytopythium oedochilum"         
## [175] "Phytopythium sindhum"             "Phytopythium vexans"             
## [177] "Plasmopara halstedii"             "Plasmopara obducens"             
## [179] "Plasmopara viticola"              "Pseudoperonospora cubensis"      
## [181] "Pseudoperonospora humuli"         "Pythium abappressorium"          
## [183] "Pythium acanthophoron"            "Pythium acrogynum"               
## [185] "Pythium adhaerens"                "Pythium alternatum"              
## [187] "Pythium anandrum"                 "Pythium angustatum"              
## [189] "Pythium aphanidermatum"           "Pythium apiculatum"              
## [191] "Pythium apleroticum"              "Pythium barbulae"                
## [193] "Pythium biforme"                  "Pythium brachiatum"              
## [195] "Pythium camurandrum"              "Pythium canariense"              
## [197] "Pythium carolinianum"             "Pythium cederbergense"           
## [199] "Pythium chondricola"              "Pythium citrinum"                
## [201] "Pythium conidiophorum"            "Pythium contiguanum"             
## [203] "Pythium cryptoirregulare"         "Pythium cylindrosporum"          
## [205] "Pythium cystogenes"               "Pythium debaryanum"              
## [207] "Pythium dimorphum"                "Pythium dissimile"               
## [209] "Pythium dissotocum"               "Pythium echinulatum"             
## [211] "Pythium emineosum"                "Pythium folliculosum"            
## [213] "Pythium glomeratum"               "Pythium heterothallicum"         
## [215] "Pythium hypogynum"                "Pythium insidiosum"              
## [217] "Pythium irregulare"               "Pythium iwayamai"                
## [219] "Pythium junctum"                  "Pythium kashmirense"             
## [221] "Pythium kunmingense"              "Pythium longandrum"              
## [223] "Pythium longisporangium"          "Pythium lucens"                  
## [225] "Pythium lutarium"                 "Pythium lycopersicum"            
## [227] "Pythium mamillatum"               "Pythium mastophorum"             
## [229] "Phytopythium megacarpum"          "Pythium middletoni"              
## [231] "Pythium monospermum"              "Pythium mercuriale"              
## [233] "Pythium nagaii"                   "Pythium oligandrum"              
## [235] "Pythium oopapillum"               "Pythium ornacarpum"              
## [237] "Pythium ornamentatum"             "Pythium pachycaule"              
## [239] "Pythium paddicum"                 "Pythium parvum"                  
## [241] "Pythium pectinolyticum"           "Pythium periilum"                
## [243] "Pythium perplexum"                "Pythium phragmiticola"           
## [245] "Pythium phragmitis"               "Pythium pleroticum"              
## [247] "Pythium prolatum"                 "Pythium radiosum"                
## [249] "Pythium rhizo-oryzae"             "Pythium rhizosaccharum"          
## [251] "Pythium rishiriense"              "Pythium rostratifingens"         
## [253] "Pythium scleroteichum"            "Pythium segnitium"               
## [255] "Pythium selbyi"                   "Pythium senticosum"              
## [257] "Pythium sukuiense"                "Pythium takayamanum"             
## [259] "Pythium terrestris"               "Pythium tracheiphilum"           
## [261] "Pythium ultimum"                  "Pythium undulatum"               
## [263] "Pythium utonaiense"               "Pythium viniferum"               
## [265] "Pythium violae"                   "Pythium aristosporum"            
## [267] "Pythium arrhenomanes"             "Pythium catenulatum"             
## [269] "Pythium coloratum"                "Pythium graminicola"             
## [271] "Pythium myriotylum"               "Pythium nunn"                    
## [273] "Pythium pyrilobum"                "Pythium sylvaticum"              
## [275] "Pythium vanterpoolii"             "Pythium volutum"                 
## [277] "Salisapilia tartarea"             "Salisapilia nakagirii"           
## [279] "Salisapilia sapeloensis"          "Saprolegnia diclina"             
## [281] "Saprolegnia ferax"                "Saprolegnia parasitica"          
## [283] "Sclerospora graminicola"          "Thraustotheca clavata"           
## [285] "Pythium rhizoterrae"              "Pythium aquasilvae"              
## [287] "Halophytophthora exoprolifera"    "Halophytophthora kandeliae"      
## [289] "Lagenidium pulchrum"              "Lagenidium sinuatum"             
## [291] "Phytopythium boreale"             "Phytopythium delwarense"         
## [293] "Phytopythium helicoides"          "Phytopythium mercuriale"         
## [295] "Phytopythium mirpurense"          "Phytopythium ostracodes"         
## [297] "Pythium amasculinum"              "Pythium apinafurcum"             
## [299] "Pythium aquatile"                 "Pythium attrantheridium"         
## [301] "Pythium brassicum"                "Pythium capillosum"              
## [303] "Pythium cucurbitacearum"          "Pythium diclinum"                
## [305] "Pythium erinaceum"                "Pythium flevoense"               
## [307] "Pythium grandisporangium"         "Pythium hydnosporum"             
## [309] "Pythium inflatum"                 "Pythium intermedium"             
## [311] "Pythium iriomotense"              "Pythium jasmonium"               
## [313] "Pythium marsipum"                 "Pythium megalacanthum"           
## [315] "Pythium middletonii"              "Pythium minus"                   
## [317] "Pythium nodosum"                  "Pythium okanoganense"            
## [319] "Pythium orthogonon"               "Pythium paroecandrum"            
## [321] "Pythium periplocum"               "Pythium plurisporium"            
## [323] "Pythium porphyrae"                "Pythium recalcitrans"            
## [325] "Pythium salpingophorum"           "Pythium schmitthenneri"          
## [327] "Pythium solare"                   "Pythium tardicrescens"           
## [329] "Pythium terretris"                "Pythium torulosum"               
## [331] "Pythium uncinulatum"              "Pythium volotum"
unique(names(its_ref))
##   [1] "Bremia lactucae"                    
##   [2] "Hyaloperonospora erophilae"         
##   [3] "Hyaloperonospora thlaspeos-arvensis"
##   [4] "Hyaloperonospora parasitica"        
##   [5] "Hyaloperonospora cheiranthi"        
##   [6] "Phytophthora cryptogea"             
##   [7] "Peronospora myosotidis"             
##   [8] "Peronospora trifoliorum"            
##   [9] "Peronospora trifolii-minoris"       
##  [10] "Peronospora astragalina"            
##  [11] "Aphanomyces euteiches"              
##  [12] "Aphanomyces cochlioides"            
##  [13] "Peronospora conglomerata"           
##  [14] "Peronospora agrestis"               
##  [15] "Peronospora flava"                  
##  [16] "Phytophthora infestans"             
##  [17] "Pythium ultimum"                    
##  [18] "Albugo candida"                     
##  [19] "Plasmopara halstedii"               
##  [20] "Plasmopara obducens"                
##  [21] "Plasmopara nivea"                   
##  [22] "Peronospora potentillae"            
##  [23] "Perofascia lepidii"                 
##  [24] "Peronospora lamii"                  
##  [25] "Peronospora manshurica"             
##  [26] "Peronospora meliloti"               
##  [27] "Peronospora alsinearum"             
##  [28] "Peronospora ervi"                   
##  [29] "Peronospora polygoni"               
##  [30] "Pseudoperonospora urticae"          
##  [31] "Phytophthora nicotianae"            
##  [32] "Peronospora farinosa"               
##  [33] "Phytophthora capsici"               
##  [34] "Peronospora parva"                  
##  [35] "Peronospora radii"                  
##  [36] "Peronospora chenopodii-polyspermi"  
##  [37] "Pythium graminicola"                
##  [38] "Peronospora corydalis"              
##  [39] "Plasmopara pusilla"                 
##  [40] "Peronospora viciae"                 
##  [41] "Peronospora coronillae"             
##  [42] "Peronospora aestivalis"             
##  [43] "Peronospora sordida"                
##  [44] "Peronospora boni-henrici"           
##  [45] "Peronospora chlorae"                
##  [46] "Pseudoperonospora cannabina"        
##  [47] "Peronospora destructor"             
##  [48] "Achlya ambisexualis"                
##  [49] "Achlya americana"                   
##  [50] "Achlya aquatica"                    
##  [51] "Achlya bisexualis"                  
##  [52] "Achlya caroliniana"                 
##  [53] "Achlya colorata"                    
##  [54] "Achlya conspicua"                   
##  [55] "Achlya dubia"                       
##  [56] "Achlya flagellata"                  
##  [57] "Achlya glomerata"                   
##  [58] "Achlya heterosexualis"              
##  [59] "Achlya oligacantha"                 
##  [60] "Achlya papillosa"                   
##  [61] "Achlya racemosa"                    
##  [62] "Achlya radiosa"                     
##  [63] "Achlya recurva"                     
##  [64] "Aphanomyces cladogamus"             
##  [65] "Aphanomyces iridis"                 
##  [66] "Aphanomyces laevis"                 
##  [67] "Apodachlya brachynema"              
##  [68] "Apodachlya minima"                  
##  [69] "Basidiophora entospora"             
##  [70] "Brevilegnia gracilis"               
##  [71] "Brevilegnia macrospora"             
##  [72] "Brevilegnia unisperma"              
##  [73] "Brevilegnia"                        
##  [74] "Eurychasma dicksonii"               
##  [75] "Halophytophthora exoprolifera"      
##  [76] "Halophytophthora kandeliae"         
##  [77] "Salisapilia tartarea"               
##  [78] "Hyaloperonospora nesliae"           
##  [79] "Hyaloperonospora sisymbrii-sophiae" 
##  [80] "Lagenidium caudatum"                
##  [81] "Leptolegnia caudata"                
##  [82] "Peronospora aparines"               
##  [83] "Peronospora calotheca"              
##  [84] "Peronospora sherardiae"             
##  [85] "Peronospora valerianellae"          
##  [86] "Peronospora violae"                 
##  [87] "Phytophthora citricola"             
##  [88] "Phytophthora alni"                  
##  [89] "Phytophthora alticola"              
##  [90] "Phytophthora arecae"                
##  [91] "Phytophthora austrocedrae"          
##  [92] "Halophytophthora avicenniae"        
##  [93] "Phytophthora batemanensis"          
##  [94] "Phytophthora bisheria"              
##  [95] "Phytophthora boehmeriae"            
##  [96] "Phytophthora botryosa"              
##  [97] "Phytophthora brassicae"             
##  [98] "Phytophthora cactorum"              
##  [99] "Phytophthora cajani"                
## [100] "Phytophthora cambivora"             
## [101] "Phytophthora captiosa"              
## [102] "Phytophthora cinnamomi"             
## [103] "Phytophthora citrophthora"          
## [104] "Phytophthora clandestina"           
## [105] "Phytophthora colocasiae"            
## [106] "Phytophthora drechsleri"            
## [107] "Halophytophthora epistomium"        
## [108] "Phytophthora erythroseptica"        
## [109] "Phytophthora europaea"              
## [110] "Phytophthora fallax"                
## [111] "Phytophthora foliorum"              
## [112] "Phytophthora fragariae"             
## [113] "Phytophthora frigida"               
## [114] "Phytophthora gonapodyides"          
## [115] "Phytophthora hedraiandra"           
## [116] "Phytophthora heveae"                
## [117] "Phytophthora hibernalis"            
## [118] "Phytophthora himalayensis"          
## [119] "Phytophthora humicola"              
## [120] "Phytophthora idaei"                 
## [121] "Phytophthora ilicis"                
## [122] "Phytophthora inflata"               
## [123] "Phytophthora insolita"              
## [124] "Phytophthora inundata"              
## [125] "Phytophthora ipomoeae"              
## [126] "Phytophthora iranica"               
## [127] "Phytophthora katsurae"              
## [128] "Phytophthora kernoviae"             
## [129] "Phytophthora lateralis"             
## [130] "Phytophthora litchii"               
## [131] "Phytophthora macrochlamydospora"    
## [132] "Phytophthora meadii"                
## [133] "Phytophthora medicaginis"           
## [134] "Phytophthora megakarya"             
## [135] "Phytophthora megasperma"            
## [136] "Phytophthora melonis"               
## [137] "Phytophthora mengei"                
## [138] "Phytophthora mexicana"              
## [139] "Phytophthora mirabilis"             
## [140] "Phytophthora multivesiculata"       
## [141] "Phytophthora multivora"             
## [142] "Phytophthora nemorosa"              
## [143] "Phytophthora palmivora"             
## [144] "Phytophthora parsiana"              
## [145] "Phytophthora phaseoli"              
## [146] "Phytophthora pini"                  
## [147] "Phytophthora pinifolia"             
## [148] "Phytophthora pistaciae"             
## [149] "Phytophthora plurivora"             
## [150] "Phytophthora polonica"              
## [151] "Halophytophthora polymorphica"      
## [152] "Phytophthora porri"                 
## [153] "Phytophthora primulae"              
## [154] "Phytophthora pseudosyringae"        
## [155] "Phytophthora pseudotsugae"          
## [156] "Phytophthora psychrophila"          
## [157] "Phytophthora quercetorum"           
## [158] "Phytophthora quercina"              
## [159] "Phytophthora quininea"              
## [160] "Phytophthora ramorum"               
## [161] "Phytophthora rosacearum"            
## [162] "Phytophthora rubi"                  
## [163] "Phytophthora sansomea"              
## [164] "Phytophthora sinensis"              
## [165] "Phytophthora siskiyouensis"         
## [166] "Phytophthora sojae"                 
## [167] "Phytophthora andina"                
## [168] "Phytophthora asparagi"              
## [169] "Phytophthora sulawesiensis"         
## [170] "Phytophthora syringae"              
## [171] "Phytophthora tabaci"                
## [172] "Phytophthora tentaculata"           
## [173] "Phytophthora trifolii"              
## [174] "Phytophthora tropicalis"            
## [175] "Phytophthora uliginosa"             
## [176] "Phytophthora vignae"                
## [177] "Pythium vexans"                     
## [178] "Phytopythium boreale"               
## [179] "Phytopythium carbonicum"            
## [180] "Pythium chamaihyphon"               
## [181] "Ovatisporangium citrinum"           
## [182] "Phytopythium cucurbitacearum"       
## [183] "Phytopythium helicoides"            
## [184] "Ovatisporangium litorale"           
## [185] "Phytopythium megacarpum"            
## [186] "Ovatisporangium montanum"           
## [187] "Phytopythium oedochilum"            
## [188] "Pythium ostracodes"                 
## [189] "Phytopythium sindhum"               
## [190] "Plasmopara euphrasiae"              
## [191] "Plasmoverna anemones-ranunculoides" 
## [192] "Plectospira myriandra"              
## [193] "Protoachlya paradoxa"               
## [194] "Pseudoperonospora cubensis"         
## [195] "Pythiogeton zeae"                   
## [196] "Pythiopsis terrestris"              
## [197] "Pythium abappressorium"             
## [198] "Pythium acanthicum"                 
## [199] "Pythium acanthophoron"              
## [200] "Pythium acrogynum"                  
## [201] "Pythium adhaerens"                  
## [202] "Pythium afertile"                   
## [203] "Pythium attrantheridium"            
## [204] "Pythium macrosporum"                
## [205] "Pythium oopapillum"                 
## [206] "Pythium pleroticum"                 
## [207] "Pythium polymastum"                 
## [208] "Pythium volutum"                    
## [209] "Pythium amasculinum"                
## [210] "Elongisporangium anandrum"          
## [211] "Pythium angustatum"                 
## [212] "Pythium aphanidermatum"             
## [213] "Pythium apiculatum"                 
## [214] "Pythium apleroticum"                
## [215] "Pythium aquatile"                   
## [216] "Pythium aristosporum"               
## [217] "Pythium arrhenomanes"               
## [218] "Pythium buismaniae"                 
## [219] "Pythium camurandrum"                
## [220] "Pythium canariense"                 
## [221] "Pythium capillosum"                 
## [222] "Pythium carolinianum"               
## [223] "Pythium catenulatum"                
## [224] "Pythium chondricola"                
## [225] "Pythium coloratum"                  
## [226] "Pythium conidiophorum"              
## [227] "Pythium contiguanum"                
## [228] "Pythium cryptoirregulare"           
## [229] "Pythium cylindrosporum"             
## [230] "Pythium cystogenes"                 
## [231] "Pythium debaryanum"                 
## [232] "Pythium deliense"                   
## [233] "Pythium diclinum"                   
## [234] "Pythium dimorphum"                  
## [235] "Pythium dissimile"                  
## [236] "Pythium dissotocum"                 
## [237] "Pythium echinulatum"                
## [238] "Pythium emineosum"                  
## [239] "Pythium erinaceum"                  
## [240] "Pythium flevoense"                  
## [241] "Pythium folliculosum"               
## [242] "Pythium glomeratum"                 
## [243] "Pythium grandisporangium"           
## [244] "Pythium helicandrum"                
## [245] "Pythium heterothallicum"            
## [246] "Pythium hydnosporum"                
## [247] "Pythium hypogynum"                  
## [248] "Pythium inflatum"                   
## [249] "Pythium insidiosum"                 
## [250] "Pythium intermedium"                
## [251] "Pythium irregulare"                 
## [252] "Pythium iwayamai"                   
## [253] "Pythium kashmirense"                
## [254] "Pythium kunmingense"                
## [255] "Pythium longandrum"                 
## [256] "Pythium longisporangium"            
## [257] "Pythium lucens"                     
## [258] "Pythium lutarium"                   
## [259] "Pythium lycopersicum"               
## [260] "Pythium mamillatum"                 
## [261] "Pythium marsipium"                  
## [262] "Pythium mastophorum"                
## [263] "Pythium megalacanthum"              
## [264] "Pythium middletonii"                
## [265] "Pythium minus"                      
## [266] "Pythium monospermum"                
## [267] "Pythium multisporum"                
## [268] "Pythium myriotylum"                 
## [269] "Pythium nagaii"                     
## [270] "Pythium nodosum"                    
## [271] "Pythium nunn"                       
## [272] "Pythium okanoganense"               
## [273] "Pythium oligandrum"                 
## [274] "Pythium ornacarpum"                 
## [275] "Pythium ornamentatum"               
## [276] "Pythium orthogonon"                 
## [277] "Pythium pachycaule"                 
## [278] "Pythium paddicum"                   
## [279] "Pythium paroecandrum"               
## [280] "Pythium parvum"                     
## [281] "Pythium pectinolyticum"             
## [282] "Pythium periilum"                   
## [283] "Pythium periplocum"                 
## [284] "Pythium perplexum"                  
## [285] "Pythium phragmitis"                 
## [286] "Pythium plurisporium"               
## [287] "Pythium porphyrae"                  
## [288] "Pythium prolatum"                   
## [289] "Pythium pyrilobum"                  
## [290] "Pythium radiosum"                   
## [291] "Pythium rhizo-oryzae"               
## [292] "Pythium rhizosaccharum"             
## [293] "Pythium rostratifingens"            
## [294] "Pythium rostratum"                  
## [295] "Pythium salpingophorum"             
## [296] "Pythium scleroteichum"              
## [297] "Pythium segnitium"                  
## [298] "Pythium senticosum"                 
## [299] "Pythium jasmonium"                  
## [300] "Pythium sukuiense"                  
## [301] "Pythium sulcatum"                   
## [302] "Pythium sylvaticum"                 
## [303] "Pythium takayamanum"                
## [304] "Pythium tardicrescens"              
## [305] "Pythium terrestris"                 
## [306] "Pythium torulosum"                  
## [307] "Pythium tracheiphilum"              
## [308] "Pythium uncinulatum"                
## [309] "Elongisporangium undulatum"         
## [310] "Pythium vanterpoolii"               
## [311] "Pythium viniferum"                  
## [312] "Pythium violae"                     
## [313] "Pythium zingiberis"                 
## [314] "Saprolegnia anisospora"             
## [315] "Saprolegnia asterophora"            
## [316] "Saprolegnia delica"                 
## [317] "Saprolegnia diclina"                
## [318] "Saprolegnia eccentrica"             
## [319] "Saprolegnia ferax"                  
## [320] "Saprolegnia hypogyna"               
## [321] "Saprolegnia lapponica"              
## [322] "Saprolegnia litoralis"              
## [323] "Saprolegnia megasperma"             
## [324] "Saprolegnia mixta"                  
## [325] "Saprolegnia monilifera"             
## [326] "Saprolegnia monoica"                
## [327] "Saprolegnia parasitica"             
## [328] "Achlya rodrigueziana"               
## [329] "Saprolegnia subterranea"            
## [330] "Saprolegnia terrestris"             
## [331] "Saprolegnia turfosa"                
## [332] "Saprolegnia unispora"               
## [333] "Thraustotheca clavata"              
## [334] "Thraustotheca terrestris"           
## [335] "Phytophthora glovera"               
## [336] "Phytophthora capensis"              
## [337] "Phytophthora lacustris"             
## [338] "Phytophthora sansomeana"            
## [339] "Phytophthora richardiae"            
## [340] "Phytophthora thermophila"           
## [341] "Phytophthora irrigata"              
## [342] "Phytophthora hydropathica"          
## [343] "Phytophthora gallica"               
## [344] "Peronospora effusa"                 
## [345] "Peronospora schachtii"              
## [346] "Phytophthora himalsilva"            
## [347] "Phytophthora hydrogena"             
## [348] "Phytophthora pluvialis"             
## [349] "Phytopythium citrinum"              
## [350] "Pythium undulatum"

Although there are a similar number of species, each database has a distinct set of species:

table(unique(names(rps_ref)) %in% unique(names(its_ref)))
## 
## FALSE  TRUE 
##   105   227
table(unique(names(its_ref)) %in% unique(names(rps_ref)))
## 
## FALSE  TRUE 
##   123   227

This could influence the comparison, so we can use only species present in both to avoid that:

if (only_shared_species) {
  common_species <- intersect(unique(names(rps_ref)), unique(names(its_ref)))
  rps_ref <- rps_ref[names(rps_ref) %in% common_species]
  its_ref <- its_ref[names(its_ref) %in% common_species]
}

Extract predicted amplicons

Only the amplified sequence contribute to the taxonomic resolution of the locus, so I will have to find that region in reference database sequences. Not all reference sequences will have the entire region however. Some will have the region but not the primer sites, making things more complicated.

primer_data <- read_csv(file.path("raw_data", "primer_data.csv"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   primer_id = col_character(),
##   locus = col_character(),
##   direction = col_character(),
##   sequence = col_character()
## )
primer_data <- filter(primer_data, primer_id != 'rps10_f-F')
primer_data
## # A tibble: 4 x 4
##   primer_id locus direction sequence             
##   <chr>     <chr> <chr>     <chr>                
## 1 rps10-F   rps10 Forward   GTTGGTTAGAGYARAAGACT 
## 2 rps10-R   rps10 Reverse   ATRYYTAGAAAGAYTYGAACT
## 3 ITS6      ITS   Forward   GAAGGTGAAGTCGTAACAAGG
## 4 ITS7      ITS   Reverse   AGCGTTCTTCATCGATGTGC

The matchProbePair function from the Biostrings package can extract a region predicted to be amplified by primers, but for some reason it does not allow for ambiguity codes, even though it is a trivial change to make it do so. Below is a modified version of the function that does allow ambiguity codes by providing the fixed parameter and passing it to the matchPattern calls.

matchProbePair <- function(Fprobe, Rprobe, subject, algorithm="auto",
             logfile=NULL, verbose=FALSE, fixed = FALSE)
    {
        ## This won't copy the data if Fprobe and Rprobe are already DNAString objects
        F <- DNAString(Fprobe)
        R <- DNAString(Rprobe)

        ## F and R hits on the + strand
        Fp_hits <- start(matchPattern(F, subject, algorithm=algorithm, fixed = fixed))
        Rp_hits <- start(matchPattern(R, subject, algorithm=algorithm, fixed = fixed))

        ## F and R hits on the - strand
        Fm_hits <- end(matchPattern(reverseComplement(F), subject, algorithm=algorithm, fixed = fixed))
        Rm_hits <- end(matchPattern(reverseComplement(R), subject, algorithm=algorithm, fixed = fixed))

        if (verbose) {
            cat("Fp_hits:", Fp_hits, "  Rp_hits:", Rp_hits,
                "  Fm_hits:", Fm_hits, "  Rm_hits:", Rm_hits, "\n")
        }

        matches0 <- Biostrings:::reduceProbePairMatches(c(Fp_hits, Rp_hits), c(Fm_hits, Rm_hits))
        ans <- Views(subject, start=matches0$start, end=matches0$end)

        if (!is.null(logfile)) {
            nFp <- length(Fp_hits)
            nRp <- length(Rp_hits)
            nFm <- length(Fm_hits)
            nRm <- length(Rm_hits)
            nmatches0 <- length(ans)
            ## cat("", ..., sep="\t") is a trick to get an extra tab
            cat("", nFp, nRp, nFm, nRm, nmatches0, file=logfile, sep="\t")
        }
        ans
}

The code below extracts predicted amplicons using primer matches, removes the primers, and aligns the trimmed amplicons to the sequences that did not have primer matches to try to find other sequences in the reference databases that do not have primer sequences but are full length matches to those that do.

get_amplicon_chr <- function(seq) {
  map2_chr(seq@ranges@start, seq@ranges@width, function(s, w) {
    as.character(seq@subject[seq(from = s, length.out = w)])
  })
}

calc_amplicons <- function(seqs, forward, reverse, min_coverage = 0.9, ...) {
  # Get simulated amplicons using primers
  full_amps <- future_map(seqs, function(s) {
    get_amplicon_chr(matchProbePair(DNAString(s), Fprobe = forward, Rprobe = reverse))
  })
  
  # Check for multiple possible amplicons per input
  if (any(map_int(full_amps, length) > 1)) {
    stop('Some inputs have more than one amplicons.')
  }
  full_amps <- unlist(full_amps[map_int(full_amps, length) == 1])
  
  # Remove primers from amplicons
  full_amps <- substr(full_amps, start = nchar(forward) + 1, stop = nchar(full_amps) - nchar(reverse))
  
  # Align unamplified sequences with best matching amplicons
  unamped <- seqs[! names(seqs) %in% names(full_amps)]
  unamped_aligned <- future_map_chr(unamped, function(s) {
    aligned <- pairwiseAlignment(pattern = full_amps, subject = s, 
                                 type = 'global-local',
                                 # type = 'overlap',
                                 gapOpening = 10, gapExtension = 4)
    best_score_i <- which.max(aligned@score)
    best_align <- aligned[best_score_i]
    aligned_subject_char <- gsub(as.character(best_align@subject), pattern = '-', replacement = '')
    aligned_amp_char <- gsub(as.character(best_align@pattern), pattern = '-', replacement = '')
    if (best_align@pattern@range@width < best_align@pattern@unaligned@ranges@width * min_coverage || nchar(aligned_subject_char) == 0) {
      aligned_subject_char <- NA
    }
    aligned_subject_char
  })
  
  # Add full amplicons to inferred amplicons from alignment
  all_amps <- c(full_amps, unamped_aligned[! is.na(unamped_aligned)])
  all_amps <- all_amps[is.na(all_amps) | nchar(all_amps) > 0]
  setNames(all_amps[names(seqs)], names(seqs))
}

rps_amps <- calc_amplicons(rps_ref, 
                           forward = primer_data$sequence[primer_data$direction == 'Forward' & primer_data$locus == 'rps10'],
                           reverse = primer_data$sequence[primer_data$direction == 'Reverse' & primer_data$locus == 'rps10'])
its_amps <- calc_amplicons(its_ref, 
                           forward = primer_data$sequence[primer_data$direction == 'Forward' & primer_data$locus == 'ITS'],
                           reverse = primer_data$sequence[primer_data$direction == 'Reverse' & primer_data$locus == 'ITS'])

The amplicon could not be extracted from many sequences, probably because it was incomplete, but enough can be to compare the two methods. Here is the proportion of reference sequences with full length amplicons extracted:

table(is.na(rps_amps))
## 
## FALSE  TRUE 
##   649    48
table(is.na(its_amps))
## 
## FALSE  TRUE 
##   706   544

Lets remove those from the databases:

rps_amps <- rps_amps[! is.na(rps_amps)]
its_amps <- its_amps[! is.na(its_amps)]

Lets also remove any sequences that are duplicates (same sequence and same species name):

rps_amps <- rps_amps[! duplicated(paste0(names(rps_amps), rps_amps))]
its_amps <- its_amps[! duplicated(paste0(names(its_amps), its_amps))]

And sort them by name:

rps_amps <- rps_amps[order(names(rps_amps))]
its_amps <- its_amps[order(names(its_amps))]

And here is the distribution of amplicon lengths with and without primers:

amp_data <- tibble(locus = rep(c('rps10', 'ITS'), c(length(rps_amps), length(its_amps))),
                   seq = c(rps_amps, its_amps),
                   name = names(c(rps_amps, its_amps)),
                   without_primers = nchar(seq))
amp_data$locus <- ordered(amp_data$locus, levels = c('rps10', 'ITS')) # order
its_primer_length <- sum(nchar(primer_data$sequence[primer_data$locus == 'ITS']))
rps_primer_length <- sum(nchar(primer_data$sequence[primer_data$locus == 'rps10']))
amp_data <- mutate(amp_data, with_primers = without_primers + ifelse(locus == 'ITS', its_primer_length, rps_primer_length))

amp_len_plot <- amp_data %>%
  gather(key = 'type', value = 'length', without_primers, with_primers) %>%
  mutate(type = c(without_primers = 'Without primers', with_primers = 'With primers')[type]) %>%
  ggplot(aes(x = length)) +
  geom_histogram(binwidth = 1) +
  facet_grid(type ~ locus, scales = 'free_x') +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank()) 
amp_len_plot

ggsave(amp_len_plot, filename = 'predicted_amplicon_length.pdf', path = file.path('results'), width = 10, height = 5)

Lets look at which species have the largest amplicons:

amp_data %>%
  filter(locus == 'rps10', with_primers > 480) %>%
  arrange(desc(with_primers)) %>%
  select(name, without_primers, with_primers, seq) %>%
  group_by(name) %>%
  filter(with_primers == max(with_primers)) %>%
  filter(! duplicated(name)) %>%
  datatable()

and which have the shortest:

amp_data %>%
  filter(locus == 'rps10', with_primers < 460) %>%
  arrange(with_primers) %>%
  select(name, without_primers, with_primers, seq) %>%
  group_by(name) %>%
  filter(with_primers == max(with_primers)) %>%
  filter(! duplicated(name)) %>%
  datatable()

I will save FASTA files of the amplicons for use in other analyses

paste0(">", names(rps_amps), "\n", rps_amps) %>%
  write_lines(file = file.path('intermediate_data', 'reference_databases', 'predicted_amplicons_rps10.fa'))
paste0(">", names(its_amps), "\n", its_amps) %>%
  write_lines(file = file.path('intermediate_data', 'reference_databases', 'predicted_amplicons_its1.fa'))

I will also save a CSV version of this:

amp_data %>%
  filter(locus == 'rps10') %>%
  arrange(desc(with_primers)) %>%
  select(name, without_primers, with_primers, amplicon = seq) %>%
  group_by(name) %>%
  filter(with_primers == max(with_primers)) %>%
  filter(! duplicated(name)) %>%
  write_csv(file = file.path('results', 'rps10_predicted_amplicon_length.csv'))

Again, lets only use species present in both

if (only_shared_species) {
  common_species <- intersect(unique(names(rps_amps)), unique(names(its_amps)))
  rps_amps <- rps_amps[names(rps_amps) %in% common_species]
  its_amps <- its_amps[names(its_amps) %in% common_species]
}
unique(names(rps_amps))
##  [1] "Hyaloperonospora parasitica" "Phytophthora andina"        
##  [3] "Phytophthora brassicae"      "Phytophthora cactorum"      
##  [5] "Phytophthora cambivora"      "Phytophthora capensis"      
##  [7] "Phytophthora capsici"        "Phytophthora cinnamomi"     
##  [9] "Phytophthora citricola"      "Phytophthora citrophthora"  
## [11] "Phytophthora cryptogea"      "Phytophthora drechsleri"    
## [13] "Phytophthora erythroseptica" "Phytophthora gonapodyides"  
## [15] "Phytophthora hydropathica"   "Phytophthora infestans"     
## [17] "Phytophthora kernoviae"      "Phytophthora megakarya"     
## [19] "Phytophthora megasperma"     "Phytophthora melonis"       
## [21] "Phytophthora multivora"      "Phytophthora nicotianae"    
## [23] "Phytophthora palmivora"      "Phytophthora plurivora"     
## [25] "Phytophthora quercetorum"    "Phytophthora sojae"         
## [27] "Phytophthora syringae"       "Phytopythium carbonicum"    
## [29] "Phytopythium citrinum"       "Phytopythium megacarpum"    
## [31] "Phytopythium sindhum"        "Pseudoperonospora cubensis" 
## [33] "Pythium abappressorium"      "Pythium acrogynum"          
## [35] "Pythium apiculatum"          "Pythium attrantheridium"    
## [37] "Pythium camurandrum"         "Pythium cryptoirregulare"   
## [39] "Pythium cylindrosporum"      "Pythium cystogenes"         
## [41] "Pythium debaryanum"          "Pythium emineosum"          
## [43] "Pythium glomeratum"          "Pythium hydnosporum"        
## [45] "Pythium irregulare"          "Pythium jasmonium"          
## [47] "Pythium kashmirense"         "Pythium kunmingense"        
## [49] "Pythium lucens"              "Pythium mamillatum"         
## [51] "Pythium megalacanthum"       "Pythium middletonii"        
## [53] "Pythium minus"               "Pythium monospermum"        
## [55] "Pythium ornacarpum"          "Pythium paroecandrum"       
## [57] "Pythium pectinolyticum"      "Pythium pleroticum"         
## [59] "Pythium rhizo-oryzae"        "Pythium rhizosaccharum"     
## [61] "Pythium rostratifingens"     "Pythium segnitium"          
## [63] "Pythium senticosum"          "Pythium sylvaticum"         
## [65] "Pythium takayamanum"         "Pythium terrestris"         
## [67] "Pythium ultimum"             "Pythium viniferum"

Align predicted amplicons

To make a distance matrix, I will align the amplicons:

nj_tree <- function(seqs, ...) {
  # Align sequences:
  aligned <- seqs %>%
    insect::char2dna() %>%
    ips::mafft(method = 'localpair', exec = '/usr/bin/mafft')
  
  # Make distance matrix
  dist <- ape::dist.dna(aligned, ...)
  
  # Make tree
  tree <- ape::nj(dist)
  tree <- ape::ladderize(tree)
  tree <- phangorn::midpoint(tree)
  
  tree
}

calc_align <- function(amps) {
  amps %>%
    gsub(pattern = '[^AGCTN-]', replacement = 'N') %>%
    char2dna() %>%
    mafft(method = "globalpair", exec = 'mafft')
}

plot_align <- function(amps, aligned, title, spot = "top", start = 1, end = ncol(aligned)) {
  # aligned <- rps_amp_aligned; amps <- rps_amps; title <- "Rps10"; spot = "top"; start = 50; end = ncol(aligned)
  
  # Subset alignment to region to plot       horiz = TRUE, xpd = TRUE)
  aligned <- aligned[, start:end]
  
  # Make tree and order alignments to match tree
  tree <- nj_tree(amps)
  tree <- ladderize(tree)
  aligned <- aligned[rev(tree$tip.label[tree$edge[tree$edge[,2] <= length(tree$tip.label), 2]]), ]
  
  # Get sequence conservation at each position 
  major_allele_prop <- unlist(lapply(1:ncol(aligned), function(index) {
    chars <- as.character(aligned[, index, drop = TRUE])
    most_common_base <- sort(table(chars), decreasing = TRUE)[1]
    return(most_common_base / nrow(aligned))
  }))
  barplot_data <- data.frame(prop = major_allele_prop,
                             fill = 1 - major_allele_prop)
  
  # Get consensus sequence
  consensus <- as.DNAbin(matrix(names(major_allele_prop), nrow = 1))
  rownames(consensus) <- "Consensus  "
  
  make_one_plot <- function(y_offset) {
    # The barplot at the top
    par(fig=c(0.206, 1, 0.95 - y_offset, 1 - y_offset), new = TRUE, mar = c(0.5, 4, 1, 0.37))
    barplot(t(as.matrix(barplot_data)), cex.axis = 1, ylab = "PID", col = c('grey', 'red'), border = NA, space = 0, at = c(0, 1))
    # The consensus
    par(fig=c(0.3, 1, 0.94 - y_offset, 0.95 - y_offset), new = TRUE, mar = c(0.1, 0, 0, 2))
    image(consensus, cex.lab = 0.7, show.label = TRUE, yaxt = "n", xaxt = "n", legend = FALSE)
    # The alignment 
    par(fig=c(0.3, 1, 0.5 - y_offset, 0.95 - y_offset), new = TRUE, mar = c(5, 0, 1, 2))
    image(aligned, cex.lab = 0.5, show.label = FALSE, yaxt = "n", xlab = "Alignment position", legend = FALSE)
    # The tree and labels 
    par(fig=c(0, 0.35, 0.5 - y_offset, 0.951 - y_offset), new = TRUE, mar = c(3.9, 4, 0, 0))
    plot(tree, use.edge.length = FALSE, adj = 1, cex = 0.6)
    # The subfigure label
    mtext(title, cex = 2, adj = 0, padj = -1.5)
  }
  
  # Make plots
  if (spot == "top") {
    make_one_plot(0)
  } else {
    make_one_plot(.5)
  }
}

rps_amp_aligned <- calc_align(rps_amps)
its_amp_aligned <- calc_align(its_amps)

pdf(file = file.path('results', 'alignment_plot.pdf'), width = 8, height = 11)
# png(file = file.path('results', 'alignment_plot.png'), width = 2000, height = 3000, pointsize = 28)
plot_align(rps_amps, rps_amp_aligned, title = expression(italic("rps10")), start = 34)
## Warning in par(fig = c(0.206, 1, 0.95 - y_offset, 1 - y_offset), new = TRUE, :
## calling par(new=TRUE) with no plot
## Warning in plot.window(xlim, ylim, log = log, ...): "at" is not a graphical
## parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "at" is
## not a graphical parameter
plot_align(its_amps, its_amp_aligned, title = "ITS1", start = 39, spot = "bottom")
## Warning in plot.window(xlim, ylim, log = log, ...): "at" is not a graphical
## parameter

## Warning in plot.window(xlim, ylim, log = log, ...): "at" is not a graphical
## parameter
# Add legend
par(fig=c(0.15, 0.5, 0.51, 0.61), new = TRUE, mar = c(0, 0, 0, 0))
legend(0, 0, legend = c('A', 'G', 'C', 'T', '-'), pch = 22, pt.bg = c("red", "yellow", "green", "blue", "black"), 
       pt.cex = 2, bty = "n", xjust = 0.5, yjust = 0.5, 
       horiz = TRUE, xpd = TRUE)
dev.off()
## png 
##   2
include_graphics(file.path('results', 'alignment_plot.pdf'))

Caption: Multiple sequence alignments of the region predicted to be amplified by each method, not including the primer binding sites. The sequences used represent the subset of species present in both reference databases. The sequences are ordered vertically based on a neighbour-joining tree. Along the top of each alignment there is a barchart representing the proportion of sequences matching the consensus sequence at each alignment position, with gray representing matches and red representing mismatches.

Calculate distance to nearest non-target

Being able to differentiate different species is key to metabarcoding. Therefore, the number of differences between a sequence and the nearest sequence assigned to a different species is useful information. First I will find which species share identical sequences;

calc_identical_sp <- function(aligned, threshold = 1) {
  my_dist <- dist.dna(aligned, model = 'N', as.matrix = TRUE)
  identical_sp <- map(1:nrow(my_dist), function(i) {
    sp_dists <- my_dist[i, -i]
    identical_sp <- unique(names(sp_dists[sp_dists < threshold]))
    identical_sp[identical_sp != rownames(my_dist)[i]]
  })
  names(identical_sp) <- rownames(my_dist)
  unique_identical_sp <- map(unique(names(identical_sp)), function(x) {
    unique(unlist(identical_sp[names(identical_sp) == x]))
  })
  names(unique_identical_sp) <- unique(names(identical_sp))
  out <- lapply(seq_along(unique_identical_sp), function(index) {
  sort(c(names(unique_identical_sp)[index], unique_identical_sp[[index]]))
  })
  out[map_dbl(out, length) > 1 & ! duplicated(out)]
}

rps_ident_sp <- calc_identical_sp(rps_amp_aligned)
its_ident_sp <- calc_identical_sp(its_amp_aligned)

and save the results in a table:

tibble(locus = rep(c('RPS10', 'ITS1'), c(length(rps_ident_sp), length(its_ident_sp))), 
       identical_seqs = c(map_chr(rps_ident_sp, paste0, collapse = ' ; '), 
                          map_chr(its_ident_sp, paste0, collapse = ' ; '))) %>%
  write_csv(file = file.path('results', 'species_resolution_table.csv'))

I will also make a plot of the number of differences between each sequence and their nearest sequence from another species.

calc_nearest_seq_diff <- function(aligned) {
  my_dist <- dist.dna(aligned, model = 'N', as.matrix = TRUE)
  out <- map_dbl(1:nrow(my_dist), function(i) {
    sp_dists <- my_dist[i, -i]
    min(sp_dists[names(sp_dists) != rownames(my_dist)[i]], na.rm = TRUE)
  })
  names(out) <- rownames(my_dist)
  out
}

calc_nearest_seq_diff_dist <- function(aligned, max = 5) {
  diffs <- calc_nearest_seq_diff(aligned)
  uniq_sp_names <- unique(names(diffs))
  diffs <- map_dbl(uniq_sp_names, function(n) {
    max(diffs[names(diffs) == n], na.rm = TRUE)
  })
  names(diffs) <- uniq_sp_names
  out <- map_int(1:max - 1, function(n) {
    sum(diffs == n)
  })
  out[max + 1] <- sum(diffs >= max) 
  names(out) <- c(1:max - 1, paste0(max, '+'))
  out / length(diffs)
}

and make a table of that as well:

diff_props <- c(calc_nearest_seq_diff_dist(its_amp_aligned), calc_nearest_seq_diff_dist(rps_amp_aligned))
diff_props_data <- tibble(prop = diff_props,
       n = names(diff_props),
       locus = ordered(rep(c('ITS1', 'rps10'), each = 6), levels =  c('rps10', 'ITS1')))
print(diff_props_data)
## # A tibble: 12 x 3
##      prop n     locus
##     <dbl> <chr> <ord>
##  1 0.294  0     ITS1 
##  2 0.235  1     ITS1 
##  3 0.162  2     ITS1 
##  4 0.103  3     ITS1 
##  5 0.0441 4     ITS1 
##  6 0.162  5+    ITS1 
##  7 0.147  0     rps10
##  8 0.0147 1     rps10
##  9 0.0735 2     rps10
## 10 0.0441 3     rps10
## 11 0.0441 4     rps10
## 12 0.676  5+    rps10

We can plot this in a few ways:

sp_diff_plot <- diff_props_data %>%
  ggplot(aes(x = n, y = prop, fill = locus)) +
  geom_bar(stat = 'identity', position = "dodge2") +
  geom_hline(aes(yintercept = 1), linetype = "dashed") +
  scale_fill_viridis_d(begin = 0.8, end = 0.2) +
  labs(x = 'Base pairs different from most similar species', y = 'Proportion of species', fill = 'Locus') +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = c(0.1, 0.8),
        legend.background = element_rect(fill=NA))
sp_diff_plot

ggsave(sp_diff_plot, filename = 'sp_resolution_barchart.pdf', path = file.path('results'), width = 5, height = 5)

Figure #: The distribution of the number of base pairs different each species is to the most similar species base on pairwise alignments of predicted amplicons in the Rps10 and ITS1 databases. Only sequences with unambiguous, species-level taxonomic classifications that contain the entire amplicon are included. Zero differences for a species mean that there is at least one other species that is predicted to have an identical amplicon sequence.

rps_diffs <- calc_nearest_seq_diff(rps_amp_aligned)
its_diffs <- calc_nearest_seq_diff(its_amp_aligned)
diff_boxplot <- tibble(locus = ordered(rep(c('ITS1', 'rps10'), c(length(its_diffs), length(rps_diffs))), levels =  c('rps10', 'ITS1')),
       species = c(names(its_diffs), names(rps_diffs)),
       diff = c(its_diffs, rps_diffs)) %>%
  ggplot(aes(x = locus, y = diff, fill = locus)) +
  geom_boxplot() +
  scale_fill_viridis_d(begin = 0.8, end = 0.2) +
  guides(fill = FALSE) +
  labs(x = 'Locus', y = 'Base pairs different from most similar species', fill = NULL) +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "bottom")
diff_boxplot

ggsave(diff_boxplot, filename = 'sp_resolution_boxplot.pdf', path = file.path('results'), width = 2, height = 4)

We can combine the two plots like so:

diff_plot_combined <- ggarrange(diff_boxplot, sp_diff_plot,
          labels = c("A", "B"),
          widths = c(0.3, 1),
          ncol = 2, nrow = 1)
diff_plot_combined

ggsave(diff_plot_combined, filename = 'sp_resolution_combined.pdf', path = file.path('results'), width = 7, height = 5)

Figure #: The distribution of the number of base pairs different each species is to the most similar species based on pairwise alignments of predicted amplicons in the Rps10 and ITS1 databases. Only sequences for species present in both databases are included. A) The distribution of differences for each locus. B) Proportion of species with each difference count. Zero differences for a species mean that there is at least one other species that is predicted to have an identical amplicon sequence.

Software used

sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 4.0.3 (2020-10-10)
##  os       Pop!_OS 20.04 LTS           
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language en_US:en                    
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/Vancouver           
##  date     2021-05-20                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package      * version    date       lib source        
##  abind          1.4-5      2016-07-21 [1] CRAN (R 4.0.3)
##  ade4           1.7-16     2020-10-28 [1] CRAN (R 4.0.3)
##  ape          * 5.4-1      2020-08-13 [1] CRAN (R 4.0.2)
##  aphid          1.3.3      2019-05-08 [1] CRAN (R 4.0.3)
##  askpass        1.1        2019-01-13 [1] CRAN (R 4.0.2)
##  assertthat     0.2.1      2019-03-21 [1] CRAN (R 4.0.2)
##  backports      1.2.0      2020-11-02 [1] CRAN (R 4.0.3)
##  BiocGenerics * 0.34.0     2020-04-27 [1] Bioconductor  
##  Biostrings   * 2.56.0     2020-04-27 [1] Bioconductor  
##  bold           1.1.0      2020-06-17 [1] CRAN (R 4.0.2)
##  broom          0.7.2      2020-10-20 [1] CRAN (R 4.0.3)
##  car            3.0-10     2020-09-29 [1] CRAN (R 4.0.3)
##  carData        3.0-4      2020-05-22 [1] CRAN (R 4.0.3)
##  cellranger     1.1.0      2016-07-27 [1] CRAN (R 4.0.2)
##  cli            2.1.0      2020-10-12 [1] CRAN (R 4.0.3)
##  cluster        2.1.0      2019-06-19 [4] CRAN (R 4.0.0)
##  codetools      0.2-16     2018-12-24 [4] CRAN (R 4.0.0)
##  colorspace     1.4-1      2019-03-18 [1] CRAN (R 4.0.2)
##  conditionz     0.1.0      2019-04-24 [1] CRAN (R 4.0.2)
##  cowplot        1.1.0      2020-09-08 [1] CRAN (R 4.0.2)
##  crayon         1.3.4      2017-09-16 [1] CRAN (R 4.0.2)
##  crosstalk      1.1.0.1    2020-03-13 [1] CRAN (R 4.0.2)
##  crul           1.0.0      2020-07-30 [1] CRAN (R 4.0.2)
##  curl           4.3        2019-12-02 [1] CRAN (R 4.0.2)
##  data.table     1.13.2     2020-10-19 [1] CRAN (R 4.0.3)
##  digest         0.6.27     2020-10-24 [1] CRAN (R 4.0.3)
##  dplyr        * 1.0.2      2020-08-18 [1] CRAN (R 4.0.2)
##  DT           * 0.16       2020-10-13 [1] CRAN (R 4.0.3)
##  ellipsis       0.3.1      2020-05-15 [1] CRAN (R 4.0.2)
##  evaluate       0.14       2019-05-28 [1] CRAN (R 4.0.2)
##  fansi          0.4.1      2020-01-08 [1] CRAN (R 4.0.2)
##  farver         2.0.3      2020-01-16 [1] CRAN (R 4.0.2)
##  fastmatch      1.1-0      2017-01-28 [1] CRAN (R 4.0.2)
##  forcats      * 0.5.0      2020-03-01 [1] CRAN (R 4.0.2)
##  foreach        1.5.1      2020-10-15 [1] CRAN (R 4.0.3)
##  foreign        0.8-80     2020-05-24 [1] CRAN (R 4.0.2)
##  furrr        * 0.2.1      2020-10-21 [1] CRAN (R 4.0.3)
##  future       * 1.19.1     2020-09-22 [1] CRAN (R 4.0.3)
##  generics       0.1.0      2020-10-31 [1] CRAN (R 4.0.3)
##  ggplot2      * 3.3.2      2020-06-19 [1] CRAN (R 4.0.2)
##  ggpubr       * 0.4.0      2020-06-27 [1] CRAN (R 4.0.3)
##  ggsignif       0.6.0      2019-08-08 [1] CRAN (R 4.0.3)
##  globals        0.13.1     2020-10-11 [1] CRAN (R 4.0.3)
##  glue           1.4.2      2020-08-27 [1] CRAN (R 4.0.2)
##  gridExtra      2.3        2017-09-09 [1] CRAN (R 4.0.3)
##  gtable         0.3.0      2019-03-25 [1] CRAN (R 4.0.2)
##  haven          2.3.1      2020-06-01 [1] CRAN (R 4.0.2)
##  hms            0.5.3      2020-01-08 [1] CRAN (R 4.0.2)
##  htmltools      0.5.1.1    2021-01-22 [1] CRAN (R 4.0.3)
##  htmlwidgets    1.5.2      2020-10-03 [1] CRAN (R 4.0.3)
##  httpcode       0.3.0      2020-04-10 [1] CRAN (R 4.0.2)
##  igraph         1.2.6      2020-10-06 [1] CRAN (R 4.0.3)
##  insect       * 1.2.0      2018-11-25 [1] CRAN (R 4.0.3)
##  ips          * 0.0.11     2019-07-04 [1] CRAN (R 4.0.3)
##  IRanges      * 2.22.2     2020-05-21 [1] Bioconductor  
##  iterators      1.0.13     2020-10-15 [1] CRAN (R 4.0.3)
##  jsonlite       1.7.1      2020-09-07 [1] CRAN (R 4.0.2)
##  kmer           1.1.2      2019-05-20 [1] CRAN (R 4.0.3)
##  knitr        * 1.30       2020-09-22 [1] CRAN (R 4.0.2)
##  labeling       0.4.2      2020-10-20 [1] CRAN (R 4.0.3)
##  lattice      * 0.20-41    2020-04-02 [4] CRAN (R 4.0.0)
##  lifecycle      0.2.0      2020-03-06 [1] CRAN (R 4.0.2)
##  listenv        0.8.0      2019-12-05 [1] CRAN (R 4.0.3)
##  magrittr       1.5        2014-11-22 [1] CRAN (R 4.0.2)
##  MASS           7.3-53     2020-09-09 [4] CRAN (R 4.0.2)
##  Matrix         1.2-18     2019-11-27 [4] CRAN (R 4.0.0)
##  metacoder    * 0.3.4      2020-04-29 [1] CRAN (R 4.0.3)
##  mgcv           1.8-33     2020-08-27 [4] CRAN (R 4.0.2)
##  munsell        0.5.0      2018-06-12 [1] CRAN (R 4.0.2)
##  nlme           3.1-149    2020-08-23 [4] CRAN (R 4.0.2)
##  openssl        1.4.3      2020-09-18 [1] CRAN (R 4.0.2)
##  openxlsx       4.2.3      2020-10-27 [1] CRAN (R 4.0.3)
##  permute      * 0.9-5      2019-03-12 [1] CRAN (R 4.0.2)
##  phangorn       2.5.5      2019-06-19 [1] CRAN (R 4.0.2)
##  phylogram      2.1.0      2018-06-25 [1] CRAN (R 4.0.3)
##  pillar         1.4.6      2020-07-10 [1] CRAN (R 4.0.2)
##  pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.0.2)
##  plyr           1.8.6      2020-03-03 [1] CRAN (R 4.0.2)
##  prettyunits    1.1.1      2020-01-24 [1] CRAN (R 4.0.2)
##  progress       1.2.2      2019-05-16 [1] CRAN (R 4.0.2)
##  purrr        * 0.3.4      2020-04-17 [1] CRAN (R 4.0.2)
##  qsubmitter   * 0.1        2020-11-11 [1] local         
##  quadprog       1.5-8      2019-11-20 [1] CRAN (R 4.0.2)
##  R6             2.5.0      2020-10-28 [1] CRAN (R 4.0.3)
##  RANN           2.6.1      2019-01-08 [1] CRAN (R 4.0.3)
##  Rcpp           1.0.5      2020-07-06 [1] CRAN (R 4.0.2)
##  readr        * 1.4.0      2020-10-05 [1] CRAN (R 4.0.3)
##  readxl         1.3.1      2019-03-13 [1] CRAN (R 4.0.2)
##  reshape        0.8.8      2018-10-23 [1] CRAN (R 4.0.2)
##  rio            0.5.16     2018-11-26 [1] CRAN (R 4.0.3)
##  rlang          0.4.10     2020-12-30 [1] CRAN (R 4.0.3)
##  rmarkdown      2.5        2020-10-21 [1] CRAN (R 4.0.3)
##  rstatix        0.6.0      2020-06-18 [1] CRAN (R 4.0.3)
##  rstudioapi     0.11       2020-02-07 [1] CRAN (R 4.0.2)
##  S4Vectors    * 0.26.1     2020-05-16 [1] Bioconductor  
##  scales         1.1.1      2020-05-11 [1] CRAN (R 4.0.2)
##  seqinr         4.2-4      2020-10-10 [1] CRAN (R 4.0.3)
##  sessioninfo  * 1.1.1      2018-11-05 [1] CRAN (R 4.0.2)
##  sharedbib      0.1.0.9003 2020-10-16 [1] local         
##  stringi        1.5.3      2020-09-09 [1] CRAN (R 4.0.2)
##  stringr      * 1.4.0      2019-02-10 [1] CRAN (R 4.0.2)
##  taxa         * 0.3.4      2020-04-29 [1] CRAN (R 4.0.3)
##  taxize       * 0.9.99     2020-10-30 [1] CRAN (R 4.0.3)
##  tibble         3.0.4      2020-10-12 [1] CRAN (R 4.0.3)
##  tidyr        * 1.1.2      2020-08-27 [1] CRAN (R 4.0.2)
##  tidyselect     1.1.0      2020-05-11 [1] CRAN (R 4.0.2)
##  utf8           1.1.4      2018-05-24 [1] CRAN (R 4.0.2)
##  uuid           0.1-4      2020-02-26 [1] CRAN (R 4.0.2)
##  vctrs          0.3.4      2020-08-29 [1] CRAN (R 4.0.2)
##  vegan        * 2.5-6      2019-09-01 [1] CRAN (R 4.0.2)
##  viridis      * 0.5.1      2018-03-29 [1] CRAN (R 4.0.3)
##  viridisLite  * 0.3.0      2018-02-01 [1] CRAN (R 4.0.2)
##  withr          2.3.0      2020-09-22 [1] CRAN (R 4.0.3)
##  xfun           0.19       2020-10-30 [1] CRAN (R 4.0.3)
##  XML            3.99-0.5   2020-07-23 [1] CRAN (R 4.0.2)
##  xml2           1.3.2      2020-04-23 [1] CRAN (R 4.0.2)
##  XVector      * 0.28.0     2020-04-27 [1] Bioconductor  
##  yaml           2.2.1      2020-02-01 [1] CRAN (R 4.0.2)
##  zip            2.1.1      2020-08-27 [1] CRAN (R 4.0.3)
##  zlibbioc       1.34.0     2020-04-27 [1] Bioconductor  
##  zoo            1.8-8      2020-05-02 [1] CRAN (R 4.0.2)
## 
## [1] /home/fosterz/R/x86_64-pc-linux-gnu-library/4.0
## [2] /usr/local/lib/R/site-library
## [3] /usr/lib/R/site-library
## [4] /usr/lib/R/library