input_folder <- "raw_input" # Where all the large input files are. Ignored by git.
output_folder <- "results" # Where plots will be saved
output_format <- "pdf" # The file format of saved plots
pub_fig_folder <- "publication"
revision_n <- 1
result_path <- function(name) {
file.path(output_folder, paste0(name, ".", output_format))
}
save_publication_fig <- function(name, figure_number) {
file.path(result_path(name), paste0("revision_", revision_n), paste0("figure_", figure_number, "--", name, ".", output_format))
}
min_p_value <- 0.05
This analysis requires Bioconductor packages to work. These packages are not available on CRAN and must be installed using the following code. The code below is not run during the compilation of this page, but is provided so others can quickly find the right software to replicate this analysis.
source("https://bioconductor.org/biocLite.R")
source("http://bioconductor.org/workflows.R")
workflowInstall("rnaseqGene")
biocLite("GO.db")
biocLite("org.Hs.eg.db")
biocLite("airway")
biocLite("DESeq2")
http://www.bioconductor.org/help/workflows/rnaseqGene/
library(rnaseqGene)
library("airway")
data("airway")
se <- airway
se$dex <- relevel(se$dex, "untrt")
se$dex
## [1] untrt trt untrt trt untrt trt untrt trt
## Levels: untrt trt
library("DESeq2")
dds <- DESeqDataSet(se, design = ~ cell + dex)
nrow(dds)
## [1] 64102
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
## [1] 29391
rld <- rlog(dds, blind=FALSE)
head(assay(rld), 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
## ENSG00000000003 9.385683 9.052608 9.516875 9.285338 9.839085 9.530311 9.663255
## ENSG00000000419 8.869616 9.138271 9.036116 9.075295 8.972126 9.131824 8.861534
## ENSG00000000457 7.961369 7.881385 7.824079 7.921490 7.751699 7.886441 7.957121
## SRR1039521
## ENSG00000000003 9.277699
## ENSG00000000419 9.060905
## ENSG00000000457 7.912123
dds <- estimateSizeFactors(dds)
dds <- DESeq(dds)
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
(res <- results(dds))
## log2 fold change (MLE): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 29391 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 708.6021697 -0.38125388 0.1006560 -3.7876928 0.0001520527 0.001281516
## ENSG00000000419 520.2979006 0.20681271 0.1122218 1.8428925 0.0653447023 0.196230403
## ENSG00000000457 237.1630368 0.03792050 0.1434532 0.2643405 0.7915175170 0.911219223
## ENSG00000000460 57.9326331 -0.08816322 0.2871677 -0.3070095 0.7588361136 0.894672849
## ENSG00000000938 0.3180984 -1.37823397 3.4998753 -0.3937952 0.6937322727 NA
## ... ... ... ... ... ... ...
## ENSG00000273485 1.2864477 -0.1271363 1.6005571 -0.07943253 0.9366886 NA
## ENSG00000273486 15.4525365 -0.1509944 0.4865490 -0.31033758 0.7563043 0.8936787
## ENSG00000273487 8.1632350 1.0464169 0.6990336 1.49694792 0.1344068 0.3291460
## ENSG00000273488 8.5844790 0.1078190 0.6381646 0.16895172 0.8658346 0.9451750
## ENSG00000273489 0.2758994 1.4837258 3.5139452 0.42223933 0.6728503 NA
library(GO.db)
library(org.Hs.eg.db)
res$go_id <- mapIds(org.Hs.eg.db,
keys=rownames(res),
column="GO",
keytype="ENSEMBL",
multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
Remove genes with no GO annotation
nrow(res)
## [1] 29391
res <- res[!is.na(res$go_id), ]
res <- res[res$go_id %in% keys(org.Hs.eg.db, keytype = "GO"), ]
nrow(res)
## [1] 15514
Remove insignificant genes
nrow(res)
res <- res[(! is.na(res$padj)) & res$padj <= 0.05, ]
nrow(res)
Remove genes with small changes
nrow(res)
res <- res[abs(res$log2FoldChange) >= 0.5, ]
nrow(res)
term_class <- function(x, current = x, all_paths = TRUE, type = GOCCPARENTS, verbose = TRUE,
valid_relationships = c("is_a")) {
# Get immediate children of current taxon
parents = tryCatch({
possible_parents <- as.list(type[x[1]])[[1]]
if (! is.null(valid_relationships)) {
possible_parents <- possible_parents[names(possible_parents) %in% valid_relationships]
}
names(AnnotationDbi::Term(possible_parents))
}, error = function(e) {
c()
})
# only go down one path if desired
if (! all_paths) {
parents <- parents[1]
}
parents <- parents[parents != "all"]
if (is.null(parents)) {
return(c())
} else if (length(parents) == 0) {
return(paste0(collapse = "|", AnnotationDbi::Term(x)))
} else {
next_x <- lapply(parents, function(y) c(y, x))
# Run this function on them to get their output
child_output <- lapply(next_x, term_class, all_paths = all_paths, type = type)
output <- unlist(child_output, recursive = FALSE)
return(output)
}
}
cc_class <- lapply(res$go_id, term_class, all_paths = FALSE, type = GOCCPARENTS)
mf_class <- lapply(res$go_id, term_class, all_paths = FALSE, type = GOMFPARENTS)
bp_class <- lapply(res$go_id, term_class, all_paths = FALSE, type = GOBPPARENTS)
cc_res <- res[rep(1:nrow(res), sapply(cc_class, length)), ]
cc_res$class <- unlist(cc_class)
library(metacoder)
obj <- parse_tax_data(as.data.frame(cc_res), class_cols = "class", class_sep = "|")
obj$funcs <- c(obj$funcs,
change = function(x, subset = NULL) {
vapply(obs(x, "tax_data"),
function(i) {
obs_change <- obj$data$tax_data[i, ]$log2FoldChange[obj$data$tax_data[i, ]$padj <= min_p_value]
mean(obs_change, na.rm = TRUE)
},
numeric(1))
},
num_changed = function(x, subset = NULL) {
vapply(obs(x, "tax_data"),
function(i) {
sum(obj$data$tax_data[i, ]$padj <= min_p_value, na.rm = TRUE)
},
numeric(1))
})
set.seed(3)
obj %>%
filter_taxa(num_changed > 0) %>%
filter_taxa(n_supertaxa <= 4) %>%
# filter_taxa(n_supertaxa >= 1) %>%
# filter_taxa(nchar(taxon_names) <= 40) %>%
heat_tree(node_label = ifelse(nchar(taxon_names) <= 50, taxon_names, ""),
node_size = num_changed,
# node_size_trans = "log10",
node_size_range = c(0.01, 0.03),
# node_label_size_trans = "log10",
node_label_size_range = c(0.01, 0.02),
# edge_size_trans = "log10",
edge_size_range = c(0.008, 0.03) / 2,
node_color = 2^abs(change) * sign(change),
node_color_trans = "linear",
node_color_range = diverging_palette(),
node_color_interval = c(-4, 4),
# edge_color_trans = "linear",
# edge_color_range = diverging_palette(),
# edge_color_interval = c(-4, 4),
# node_label_max = 500,
node_color_axis_label = "Factor change",
node_size_axis_label = "Number of genes",
layout = "da", initial_layout = "re",
output_file = result_path("gene_expression--cellular_component"))
bp_res <- res[rep(1:nrow(res), sapply(bp_class, length)), ]
bp_res$class <- unlist(bp_class)
library(metacoder)
obj <- parse_tax_data(as.data.frame(bp_res), class_cols = "class", class_sep = "|")
obj$funcs <- c(obj$funcs,
change = function(x, subset = NULL) {
vapply(obs(x, "tax_data"),
function(i) {
obs_change <- obj$data$tax_data[i, ]$log2FoldChange[obj$data$tax_data[i, ]$padj <= min_p_value]
mean(obs_change, na.rm = TRUE)
},
numeric(1))
},
num_changed = function(x, subset = NULL) {
vapply(obs(x, "tax_data"),
function(i) {
sum(obj$data$tax_data[i, ]$padj <= min_p_value, na.rm = TRUE)
},
numeric(1))
})
set.seed(7) #2, 4, 5, 7*, 19
mgsub <- function(pattern, replacement, x, ...) { # from: http://stackoverflow.com/questions/15253954/replace-multiple-arguments-with-gsub
if (length(pattern)!=length(replacement)) {
stop("pattern and replacement do not have the same length.")
}
result <- x
for (i in 1:length(pattern)) {
result <- gsub(pattern[i], replacement[i], result, ...)
}
result
}
to_replace <- matrix(ncol = 2, byrow = TRUE,
c("regulation of growth", "",
"activation of innate immune response", "",
"system development", "",
"regulation of response to stimulus", "",
"lipid metabolic process", "",
"selenium compound metabolic process", "selenium metabolic process"
))
output_path <- file.path(output_folder,
paste0("gene_expression--biological_process",
output_format))
obj %>%
filter_taxa(num_changed > 0) %>%
filter_taxa(n_supertaxa <= 3) %>%
mutate_obs("plot_data",
taxon_id = taxon_ids,
plotted_name = gsub("_", " ", taxon_names),
f_change = 2^abs(change) * sign(change)) %>%
mutate_obs("plot_data",
short_name = vapply(plotted_name, FUN.VALUE = character(1), function(x) {
mgsub(pattern = to_replace[, 1], replacement = to_replace[, 2], x, fixed = TRUE)
})) %>%
heat_tree(node_label = ifelse(abs(f_change) > 1, short_name, NA),
node_size = num_changed,
# node_size_trans = "log10",
node_size_range = c(0.008, 0.03),
# node_label_size_trans = "log10",
node_label_size_range = c(0.012, 0.02),
# edge_size_trans = "log10",
edge_size_range = c(0.008, 0.03) / 2,
node_color = f_change,
node_color_trans = "linear",
node_color_range = diverging_palette(),
node_color_interval = c(-5, 5),
edge_color_trans = "linear",
edge_color_range = diverging_palette(),
edge_color_interval = c(-5, 5),
node_color_axis_label = "Fold change",
node_size_axis_label = "Number of genes",
layout = "da", initial_layout = "re",
output_file = result_path("gene_expression--biological_process"))
## Adding a new 176 x 3 table called "plot_data"
mf_res <- res[rep(1:nrow(res), sapply(mf_class, length)), ]
mf_res$class <- unlist(mf_class)
library(metacoder)
obj <- parse_tax_data(as.data.frame(mf_res), class_cols = "class", class_sep = "|")
obj$funcs <- c(obj$funcs,
change = function(x, subset = NULL) {
vapply(obs(x, "tax_data"),
function(i) {
obs_change <- obj$data$tax_data[i, ]$log2FoldChange[obj$data$tax_data[i, ]$padj <= min_p_value]
mean(obs_change, na.rm = TRUE)
},
numeric(1))
},
num_changed = function(x, subset = NULL) {
vapply(obs(x, "tax_data"),
function(i) {
sum(obj$data$tax_data[i, ]$padj <= min_p_value, na.rm = TRUE)
},
numeric(1))
})
obj %>%
filter_taxa(num_changed > 0) %>%
filter_taxa(n_supertaxa <= 3) %>%
# filter_taxa(n_supertaxa >= 1) %>%
# filter_taxa(nchar(taxon_names) <= 40) %>%
heat_tree(node_label = ifelse(nchar(taxon_names) <= 50, taxon_names, ""),
node_size = num_changed,
# node_size_trans = "log10",
node_size_range = c(0.01, 0.03),
# node_label_size_trans = "log10",
node_label_size_range = c(0.01, 0.015),
# edge_size_trans = "log10",
edge_size_range = c(0.008, 0.03) / 2,
node_color = 2^abs(change) * sign(change),
node_color_trans = "linear",
node_color_range = diverging_palette(),
node_color_interval = c(-4, 4),
edge_color_trans = "linear",
edge_color_range = diverging_palette(),
edge_color_interval = c(-4, 4),
node_color_axis_label = "Factor change",
node_size_axis_label = "Number of genes",
layout = "da", initial_layout = "re",
output_file = result_path("gene_expression--molecular_function"))
sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
## [4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils datasets methods
## [10] base
##
## other attached packages:
## [1] GO.db_3.5.0 rnaseqGene_1.0.2 fission_0.112.0
## [4] sva_3.26.0 mgcv_1.8-23 nlme_3.1-137
## [7] Gviz_1.22.3 ReportingTools_2.17.3 org.Hs.eg.db_3.5.0
## [10] genefilter_1.60.0 ggbeeswarm_0.6.0 PoiClaClu_1.0.2
## [13] RColorBrewer_1.1-2 pheatmap_1.0.10 ggplot2_3.1.0
## [16] dplyr_0.7.8 vsn_3.46.0 DESeq2_1.18.1
## [19] magrittr_1.5 BiocParallel_1.12.0 GenomicAlignments_1.14.2
## [22] GenomicFeatures_1.30.3 AnnotationDbi_1.40.0 Rsamtools_1.30.0
## [25] Biostrings_2.46.0 XVector_0.18.0 airway_0.112.0
## [28] SummarizedExperiment_1.8.1 DelayedArray_0.4.1 matrixStats_0.54.0
## [31] Biobase_2.38.0 GenomicRanges_1.30.3 GenomeInfoDb_1.14.0
## [34] IRanges_2.12.0 S4Vectors_0.16.0 BiocGenerics_0.24.0
## [37] BiocStyle_2.6.1 metacoder_0.3.0.1 taxa_0.3.1
## [40] stringr_1.3.1 glossary_0.1.0 knitcitations_1.0.8
## [43] knitr_1.20
##
## loaded via a namespace (and not attached):
## [1] utf8_1.1.4 R.utils_2.7.0 tidyselect_0.2.5
## [4] RSQLite_2.1.1 htmlwidgets_1.3 munsell_0.5.0
## [7] codetools_0.2-15 preprocessCore_1.40.0 withr_2.1.2
## [10] colorspace_1.3-2 BiocInstaller_1.28.0 Category_2.44.0
## [13] OrganismDbi_1.20.0 rstudioapi_0.8 labeling_0.3
## [16] GenomeInfoDbData_1.0.0 hwriter_1.3.2 bit64_0.9-7
## [19] rprojroot_1.3-2 biovizBase_1.26.0 R6_2.3.0
## [22] locfit_1.5-9.1 AnnotationFilter_1.2.0 bitops_1.0-6
## [25] reshape_0.8.8 assertthat_0.2.0 promises_1.0.1
## [28] scales_1.0.0 nnet_7.3-12 beeswarm_0.2.3
## [31] gtable_0.2.0 affy_1.56.0 ggbio_1.26.1
## [34] ensembldb_2.2.2 rlang_0.3.0.1 splines_3.4.4
## [37] rtracklayer_1.38.3 lazyeval_0.2.1 acepack_1.4.1
## [40] dichromat_2.0-0 checkmate_1.8.5 yaml_2.2.0
## [43] reshape2_1.4.3 backports_1.1.2 httpuv_1.4.5
## [46] Hmisc_4.1-1 RMySQL_0.10.15 RBGL_1.54.0
## [49] tools_3.4.4 affyio_1.48.0 Rcpp_1.0.0
## [52] plyr_1.8.4 base64enc_0.1-3 progress_1.2.0
## [55] zlibbioc_1.24.0 purrr_0.2.5 RCurl_1.95-4.11
## [58] prettyunits_1.0.2 rpart_4.1-13 cluster_2.0.7-1
## [61] data.table_1.11.8 ProtGenerics_1.10.0 hms_0.4.2
## [64] mime_0.6 evaluate_0.12 xtable_1.8-3
## [67] XML_3.98-1.16 gridExtra_2.3 compiler_3.4.4
## [70] biomaRt_2.34.2 tibble_1.4.2 crayon_1.3.4
## [73] R.oo_1.22.0 htmltools_0.3.6 GOstats_2.44.0
## [76] later_0.7.5 Formula_1.2-3 geneplotter_1.56.0
## [79] lubridate_1.7.4 DBI_1.0.0 Matrix_1.2-14
## [82] readr_1.2.1 cli_1.0.1 R.methodsS3_1.7.1
## [85] igraph_1.2.2 bindr_0.1.1 pkgconfig_2.0.2
## [88] RefManageR_1.2.0 foreign_0.8-70 xml2_1.2.0
## [91] annotate_1.56.2 vipor_0.4.5 AnnotationForge_1.20.0
## [94] bibtex_0.4.2 VariantAnnotation_1.24.5 digest_0.6.18
## [97] graph_1.56.0 rmarkdown_1.10 htmlTable_1.12
## [100] edgeR_3.20.9 GSEABase_1.40.1 curl_3.2
## [103] shiny_1.2.0 jsonlite_1.5 PFAM.db_3.5.0
## [106] bindrcpp_0.2.2 limma_3.34.9 BSgenome_1.46.0
## [109] fansi_0.4.0 pillar_1.3.0 lattice_0.20-35
## [112] GGally_1.4.0 httr_1.3.1 survival_2.42-3
## [115] interactiveDisplayBase_1.16.0 glue_1.3.0 bit_1.1-14
## [118] Rgraphviz_2.22.0 stringi_1.2.4 ggfittext_0.6.0
## [121] blob_1.1.1 AnnotationHub_2.10.1 latticeExtra_0.6-28
## [124] memoise_1.1.0