# **MODIFY THIS CHUNK**
library(here)
doc_id <- "05b"
figout <- here("figures/03-chrombpnet/03-syntax/", doc_id, "/"); dir.create(figout, recursive = TRUE)
out <- here("output/03-chrombpnet/03-syntax/", doc_id); dir.create(out, recursive = TRUE)For specific examples of composite motifs with predicted synergy, we examine the context-specificity of this synergy by running in silico marginalizations for the motif pair at their optimal syntax for every cell type, and then compare predicted effects with expression levels of TFs that may bind these sites.
library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(scales)
library(glue)
library(purrr)
library(stringr)
library(ggrepel)
library(cowplot)
library(pheatmap)
library(ggseqlogo)
script_path <- here("code/utils/")
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))
source(file.path(script_path, "chrombpnet_utils.R"))
ggplot2::theme_set(theme_BOR())cluster_meta <- read_csv(here("output/05-misc/03/TableS2_cluster_meta_qc.csv"))## Rows: 203 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Cluster, organ, organ_code, compartment, L1_annot, L2_annot, L3_an...
## dbl (8): cluster_id, dend_order, ncell, median_numi, median_ngene, median_n...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
chrombpnet_models_keep <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_models_keep2.tsv"),
col_names = c("Cluster", "Folds_keep", "Cluster_ID"))## Rows: 189 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Cluster, Folds_keep, Cluster_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cluster_order <- cluster_meta %>% arrange(organ, L1_annot) %>% pull(Cluster_ChromBPNet)
organs <- unique(cluster_meta$organ) %>% sort()
organs[organs == "StomachEsophagus"] <- "Stomach"
head(chrombpnet_models_keep)## # A tibble: 6 × 3
## Cluster Folds_keep Cluster_ID
## <chr> <chr> <chr>
## 1 Adrenal_c0 fold_0,fold_1,fold_2,fold_3,fold_4 AG_0
## 2 Adrenal_c1 fold_0,fold_1,fold_2,fold_3,fold_4 AG_1
## 3 Adrenal_c2 fold_0,fold_1,fold_2,fold_3,fold_4 AG_2
## 4 Adrenal_c3 fold_0,fold_1,fold_2,fold_3,fold_4 AG_3
## 5 Brain_c0 fold_0,fold_1,fold_2,fold_3,fold_4 BR_0
## 6 Brain_c1 fold_0,fold_1,fold_2,fold_3,fold_4 BR_1
all_results <- read_tsv(here("output/03-chrombpnet/03-syntax/04c/composite_motif_ISM_results.tsv"))## Rows: 120 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): motif_name, Cluster, annotation_broad, component_motifA, component...
## dbl (19): n_hits_per_cluster, idx_uniq, best_spacing, joint_effect, joint_vs...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ism_dir <- here("output/03-chrombpnet/03-syntax/05a/in_silico_marginalization/")all_gex_decontx.mean <- readRDS(here("output/02-global_analysis/04/tf_gex_decontx.mean.rds"))
all_gex_decontx.detection <- readRDS(here("output/02-global_analysis/04/tf_gex_decontx.detectionrate.rds"))Wrangle into a dataframe for bubble plots:
meanexp_scaled_decontx <- all_gex_decontx.mean %>%
tibble::column_to_rownames(var = "cluster") %>%
apply(2, scales::rescale) %>%
data.frame() %>%
tibble::rownames_to_column(var = "cluster") %>%
gather(TF, Expression, 2:ncol(.))
# see without scaling?
# meanexp_scaled_decontx <- all_gex_decontx.mean %>%
# gather(TF, Expression, 2:ncol(.))
pct1_decontx <- all_gex_decontx.detection %>%
gather(TF, Pct1, 2:ncol(.))
max(pct1_decontx$Pct1, na.rm = TRUE)## [1] 1
bubbleplot_data_decontx <- left_join(meanexp_scaled_decontx, pct1_decontx, by = c("cluster", "TF")) %>%
# NOTE: filter to observations where a gene is detected in at least 1% of cells
filter(Pct1 > 0) %>%
left_join(cluster_meta, by = c("cluster" = "Cluster")) %>%
filter(cluster %in% cluster_meta$Cluster) %>%
replace_na(list(Expression = 0, Pct1 = 0)) %>%
dplyr::select(L1_annot, TF, Expression, Pct1)Helper function:
cluster_meta_subset <- cluster_meta %>% dplyr::select(Cluster_ChromBPNet, L1_annot, organ)
plot_motif_across_celltypes <- function(motif, filter_statement, tf_rel_width = 1, top_n = 40) {
# motif <- "255|FOX_NR:HNF4A/HNF4G"
comp_motif_name <- motif
comp_motif_name_safe <- gsub("\\/|\\|", ".", comp_motif_name)
agg_results <- read_tsv(file.path(ism_dir, comp_motif_name_safe, "aggregated_results.tsv")) %>%
left_join(cluster_meta_subset, by = c("cluster" = "Cluster_ChromBPNet"))
agg_preds_diff <- read_tsv(file.path(ism_dir, comp_motif_name_safe, "aggregated_mean_profile_predictions_diff.tsv"))
cluster_order <- agg_results %>% slice_max(order_by = effect_mean, n = 30) %>% pull(L1_annot) %>% rev()
# fold changes
p2 <- agg_results %>%
filter(L1_annot %in% cluster_order) %>%
mutate(L1_annot = factor(L1_annot, levels = cluster_order)) %>%
mutate(effect_lower = effect_mean - effect_sd,
effect_upper = effect_mean + effect_sd) %>%
ggplot(aes(x = L1_annot, y = effect_mean)) +
geom_bar(aes(fill = organ), stat = "identity") +
geom_errorbar(aes(ymin = effect_lower, ymax = effect_upper)) +
geom_hline(yintercept = log2(1.5), linetype = "dashed") +
scale_fill_manual(values = cmap_organ) +
guides(fill = guide_legend(ncol = 2)) +
xlab(NULL) + ylab("Mean effect") +
coord_flip() +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom") +
ggtitle("effect (log counts)")
# heatmap of profile predictions (difference between edited and background)
p1 <- agg_preds_diff %>%
gather(position, value, 2:ncol(.)) %>%
dplyr::rename(cluster = index) %>%
left_join(cluster_meta_subset, by = c("cluster" = "Cluster_ChromBPNet")) %>%
filter(L1_annot %in% cluster_order) %>%
mutate(L1_annot = factor(L1_annot, levels = cluster_order),
position = as.numeric(position)) %>%
ggplot(aes(x = position, y = L1_annot)) +
geom_tile(aes(fill = value)) +
theme(legend.position = "bottom",
axis.text.y = element_text(size = 10)) + #, color = rev(cluster_meta$Color))) +
scale_x_continuous(breaks = seq(0, 1000, by = 100),
labels = seq(-500, 500, by = 100)) +
scale_fill_gradientn(colors = ylrd) +
xlab("Genomic position (bp)") +
ggtitle(glue("{comp_motif_name} \npredicted profile "))
p3 <- bubbleplot_data_decontx %>%
filter(eval(rlang::parse_expr(filter_statement))) %>%
filter(L1_annot %in% cluster_order) %>%
mutate(L1_annot = factor(L1_annot, levels = cluster_order)) %>%
ggplot(aes(y = L1_annot, x = TF)) +
geom_point(aes(size = Pct1, color = Expression), alpha = 0.8) +
scale_radius(limits = c(0, 1), range = c(1, 5)) +
rotate_x() +
# use the RNA color map without the darkest colors
scale_color_gradientn(colors = whrd) +
# scale_color_gradient(low = "white", high = "red") +
ylab(NULL) + xlab(NULL) +
theme(panel.grid.major = element_line(colour = "grey90", linewidth = 0.2),
# panel.border = element_blank(),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
axis.text.x = element_text(angle = 70, vjust = 0.5, hjust = 0),
axis.text.y = element_blank(),
legend.position = "bottom") +
ggtitle("TF expression") +
scale_x_discrete(position = "top")
plot_grid(p1, p2, p3, nrow = 1, align = "h", axis = "tb", rel_widths = c(1.5, 1, tf_rel_width))
}plot_motif_across_celltypes(motif = "255|FOX_NR:HNF4A/HNF4G",
filter_statement = 'grepl("FOX", TF) | grepl("HNF4", TF)',
tf_rel_width = 2)## Rows: 184 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): motif_name, cluster, orientation
## dbl (5): spacing, counts_after_mean, counts_after_sd, effect_mean, effect_sd
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 184 Columns: 1001
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): index
## dbl (1000): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plot_motif_across_celltypes(motif = "339|IKZF1_RUNX",
filter_statement = 'grepl("IKZF", TF) | grepl("RUNX", TF)',
tf_rel_width = 0.8)## Rows: 185 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): motif_name, cluster, orientation
## dbl (5): spacing, counts_after_mean, counts_after_sd, effect_mean, effect_sd
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 185 Columns: 1001
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): index
## dbl (1000): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggthemes_4.2.4 RColorBrewer_1.1-3 ggrastr_1.0.1
## [4] magrittr_2.0.3 GenomicFeatures_1.46.5 AnnotationDbi_1.56.2
## [7] Biobase_2.54.0 GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [10] IRanges_2.28.0 S4Vectors_0.32.4 BiocGenerics_0.40.0
## [13] ggseqlogo_0.1 pheatmap_1.0.12 cowplot_1.1.3
## [16] ggrepel_0.9.3 stringr_1.5.0 purrr_1.0.2
## [19] glue_1.8.0 scales_1.3.0 readr_2.1.4
## [22] ggplot2_3.5.1 tidyr_1.3.0 dplyr_1.1.3
## [25] here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] matrixStats_1.0.0 bitops_1.0-7
## [3] bit64_4.0.5 filelock_1.0.2
## [5] progress_1.2.2 httr_1.4.7
## [7] rprojroot_2.0.3 tools_4.1.2
## [9] bslib_0.5.1 utf8_1.2.3
## [11] R6_2.5.1 vipor_0.4.5
## [13] DBI_1.1.3 colorspace_2.1-0
## [15] withr_2.5.0 tidyselect_1.2.0
## [17] prettyunits_1.1.1 bit_4.0.5
## [19] curl_5.0.2 compiler_4.1.2
## [21] cli_3.6.1 xml2_1.3.5
## [23] DelayedArray_0.20.0 labeling_0.4.3
## [25] rtracklayer_1.54.0 sass_0.4.7
## [27] rappdirs_0.3.3 digest_0.6.33
## [29] Rsamtools_2.10.0 rmarkdown_2.24
## [31] XVector_0.34.0 pkgconfig_2.0.3
## [33] htmltools_0.5.6 MatrixGenerics_1.6.0
## [35] highr_0.10 dbplyr_2.3.3
## [37] fastmap_1.1.1 rlang_1.1.1
## [39] rstudioapi_0.15.0 RSQLite_2.3.1
## [41] farver_2.1.1 jquerylib_0.1.4
## [43] BiocIO_1.4.0 generics_0.1.3
## [45] jsonlite_1.8.7 vroom_1.6.3
## [47] BiocParallel_1.28.3 RCurl_1.98-1.12
## [49] GenomeInfoDbData_1.2.7 Matrix_1.6-3
## [51] ggbeeswarm_0.7.2 Rcpp_1.0.11
## [53] munsell_0.5.0 fansi_1.0.4
## [55] lifecycle_1.0.3 stringi_1.7.12
## [57] yaml_2.3.7 SummarizedExperiment_1.24.0
## [59] zlibbioc_1.40.0 BiocFileCache_2.2.1
## [61] blob_1.2.4 parallel_4.1.2
## [63] crayon_1.5.2 lattice_0.20-45
## [65] Biostrings_2.62.0 hms_1.1.3
## [67] KEGGREST_1.34.0 knitr_1.43
## [69] pillar_1.9.0 rjson_0.2.21
## [71] biomaRt_2.50.3 XML_3.99-0.14
## [73] evaluate_0.21 png_0.1-8
## [75] vctrs_0.6.3 tzdb_0.4.0
## [77] gtable_0.3.4 cachem_1.0.8
## [79] xfun_0.40 restfulr_0.0.15
## [81] tibble_3.2.1 beeswarm_0.4.0
## [83] GenomicAlignments_1.30.0 memoise_2.0.1