# **MODIFY THIS CHUNK**
library(here)
doc_id <- "05b"
figout <- here("figures/03-chrombpnet/03-syntax/", doc_id, "/"); dir.create(figout, recursive = TRUE)
out    <- here("output/03-chrombpnet/03-syntax/", doc_id); dir.create(out, recursive = TRUE)

1 Overview

For specific examples of composite motifs with predicted synergy, we examine the context-specificity of this synergy by running in silico marginalizations for the motif pair at their optimal syntax for every cell type, and then compare predicted effects with expression levels of TFs that may bind these sites.

2 Set up

library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(scales)
library(glue)
library(purrr)
library(stringr)
library(ggrepel)
library(cowplot)
library(pheatmap)
library(ggseqlogo)

script_path <- here("code/utils/")
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))
source(file.path(script_path, "chrombpnet_utils.R"))

ggplot2::theme_set(theme_BOR())

3 Load data

3.1 Cluster metadata

cluster_meta <- read_csv(here("output/05-misc/03/TableS2_cluster_meta_qc.csv"))
## Rows: 203 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Cluster, organ, organ_code, compartment, L1_annot, L2_annot, L3_an...
## dbl  (8): cluster_id, dend_order, ncell, median_numi, median_ngene, median_n...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
chrombpnet_models_keep <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_models_keep2.tsv"),
                                   col_names = c("Cluster", "Folds_keep", "Cluster_ID"))
## Rows: 189 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Cluster, Folds_keep, Cluster_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cluster_order <- cluster_meta %>% arrange(organ, L1_annot) %>% pull(Cluster_ChromBPNet)
organs <- unique(cluster_meta$organ) %>% sort()
organs[organs == "StomachEsophagus"] <- "Stomach"

head(chrombpnet_models_keep)
## # A tibble: 6 × 3
##   Cluster    Folds_keep                         Cluster_ID
##   <chr>      <chr>                              <chr>     
## 1 Adrenal_c0 fold_0,fold_1,fold_2,fold_3,fold_4 AG_0      
## 2 Adrenal_c1 fold_0,fold_1,fold_2,fold_3,fold_4 AG_1      
## 3 Adrenal_c2 fold_0,fold_1,fold_2,fold_3,fold_4 AG_2      
## 4 Adrenal_c3 fold_0,fold_1,fold_2,fold_3,fold_4 AG_3      
## 5 Brain_c0   fold_0,fold_1,fold_2,fold_3,fold_4 BR_0      
## 6 Brain_c1   fold_0,fold_1,fold_2,fold_3,fold_4 BR_1

3.2 Composite test results

all_results <- read_tsv(here("output/03-chrombpnet/03-syntax/04c/composite_motif_ISM_results.tsv"))
## Rows: 120 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): motif_name, Cluster, annotation_broad, component_motifA, component...
## dbl (19): n_hits_per_cluster, idx_uniq, best_spacing, joint_effect, joint_vs...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

3.3 In silico marginalization results

ism_dir <- here("output/03-chrombpnet/03-syntax/05a/in_silico_marginalization/")

3.4 TF expression per cluster

all_gex_decontx.mean <- readRDS(here("output/02-global_analysis/04/tf_gex_decontx.mean.rds"))
all_gex_decontx.detection <- readRDS(here("output/02-global_analysis/04/tf_gex_decontx.detectionrate.rds"))

Wrangle into a dataframe for bubble plots:

meanexp_scaled_decontx <- all_gex_decontx.mean %>% 
  tibble::column_to_rownames(var = "cluster") %>% 
  apply(2, scales::rescale) %>%
  data.frame() %>%
  tibble::rownames_to_column(var = "cluster") %>%
  gather(TF, Expression, 2:ncol(.))

# see without scaling?
# meanexp_scaled_decontx <- all_gex_decontx.mean %>%
#   gather(TF, Expression, 2:ncol(.))

pct1_decontx <- all_gex_decontx.detection %>% 
  gather(TF, Pct1, 2:ncol(.))

max(pct1_decontx$Pct1, na.rm = TRUE)
## [1] 1
bubbleplot_data_decontx <- left_join(meanexp_scaled_decontx, pct1_decontx, by = c("cluster", "TF")) %>%
  # NOTE: filter to observations where a gene is detected in at least 1% of cells
  filter(Pct1 > 0) %>% 
  left_join(cluster_meta, by = c("cluster" = "Cluster")) %>% 
  filter(cluster %in% cluster_meta$Cluster) %>% 
  replace_na(list(Expression = 0, Pct1 = 0)) %>% 
  dplyr::select(L1_annot, TF, Expression, Pct1)

4 Plots per motif

Helper function:

cluster_meta_subset <- cluster_meta %>% dplyr::select(Cluster_ChromBPNet, L1_annot, organ)

plot_motif_across_celltypes <- function(motif, filter_statement, tf_rel_width = 1, top_n = 40) {
  
  # motif <- "255|FOX_NR:HNF4A/HNF4G"
  
  comp_motif_name <- motif
  comp_motif_name_safe <- gsub("\\/|\\|", ".", comp_motif_name)
  
  agg_results <- read_tsv(file.path(ism_dir, comp_motif_name_safe, "aggregated_results.tsv")) %>% 
    left_join(cluster_meta_subset, by = c("cluster" = "Cluster_ChromBPNet"))
  
  agg_preds_diff <- read_tsv(file.path(ism_dir, comp_motif_name_safe, "aggregated_mean_profile_predictions_diff.tsv"))
  
  cluster_order <- agg_results %>% slice_max(order_by = effect_mean, n = 30) %>% pull(L1_annot) %>% rev()
  
  # fold changes
  p2 <- agg_results %>% 
    filter(L1_annot %in% cluster_order) %>% 
    mutate(L1_annot = factor(L1_annot, levels = cluster_order)) %>% 
    mutate(effect_lower = effect_mean - effect_sd,
           effect_upper = effect_mean + effect_sd) %>% 
    ggplot(aes(x = L1_annot, y = effect_mean)) +
    geom_bar(aes(fill = organ), stat = "identity") +
    geom_errorbar(aes(ymin = effect_lower, ymax = effect_upper)) +
    geom_hline(yintercept = log2(1.5), linetype = "dashed") +
    scale_fill_manual(values = cmap_organ) +
    guides(fill = guide_legend(ncol = 2)) +
    xlab(NULL) + ylab("Mean effect") +
    coord_flip() +
    theme(axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          legend.position = "bottom") +
    ggtitle("effect (log counts)")
  
  # heatmap of profile predictions (difference between edited and background)
  p1 <- agg_preds_diff %>% 
    gather(position, value, 2:ncol(.)) %>%
    dplyr::rename(cluster = index) %>% 
    left_join(cluster_meta_subset, by = c("cluster" = "Cluster_ChromBPNet")) %>% 
    filter(L1_annot %in% cluster_order) %>% 
    mutate(L1_annot = factor(L1_annot, levels = cluster_order),
           position = as.numeric(position)) %>% 
    ggplot(aes(x = position, y = L1_annot)) +
    geom_tile(aes(fill = value)) +
    theme(legend.position = "bottom",
          axis.text.y = element_text(size = 10)) + #, color = rev(cluster_meta$Color))) +
    scale_x_continuous(breaks = seq(0, 1000, by = 100),
                       labels = seq(-500, 500, by = 100)) +
    scale_fill_gradientn(colors = ylrd) +
    xlab("Genomic position (bp)") +
    ggtitle(glue("{comp_motif_name} \npredicted profile "))
  
  p3 <- bubbleplot_data_decontx %>%
    filter(eval(rlang::parse_expr(filter_statement))) %>% 
    filter(L1_annot %in% cluster_order) %>%
    mutate(L1_annot = factor(L1_annot, levels = cluster_order)) %>% 
    ggplot(aes(y = L1_annot, x = TF)) +
    geom_point(aes(size = Pct1, color = Expression), alpha = 0.8) +
    scale_radius(limits = c(0, 1), range = c(1, 5)) +
    rotate_x() +
    # use the RNA color map without the darkest colors
    scale_color_gradientn(colors = whrd) +
    # scale_color_gradient(low = "white", high = "red") +
    ylab(NULL) + xlab(NULL) +
    theme(panel.grid.major = element_line(colour = "grey90", linewidth = 0.2),
          # panel.border = element_blank(),
          axis.ticks.y = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.x = element_text(angle = 70, vjust = 0.5, hjust = 0),
          axis.text.y = element_blank(),
          legend.position = "bottom") +
    ggtitle("TF expression") +
    scale_x_discrete(position = "top")
  
  plot_grid(p1, p2, p3, nrow = 1, align = "h", axis = "tb", rel_widths = c(1.5, 1, tf_rel_width))
  
}

4.1 255|FOX_NR:HNF4A/HNF4G

plot_motif_across_celltypes(motif = "255|FOX_NR:HNF4A/HNF4G",
                            filter_statement = 'grepl("FOX", TF) | grepl("HNF4", TF)',
                            tf_rel_width = 2)
## Rows: 184 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): motif_name, cluster, orientation
## dbl (5): spacing, counts_after_mean, counts_after_sd, effect_mean, effect_sd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 184 Columns: 1001
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr    (1): index
## dbl (1000): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

4.2 339|IKZF1_RUNX

plot_motif_across_celltypes(motif = "339|IKZF1_RUNX",
                            filter_statement = 'grepl("IKZF", TF) | grepl("RUNX", TF)',
                            tf_rel_width = 0.8)
## Rows: 185 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): motif_name, cluster, orientation
## dbl (5): spacing, counts_after_mean, counts_after_sd, effect_mean, effect_sd
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 185 Columns: 1001
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr    (1): index
## dbl (1000): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

5 Session info

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggthemes_4.2.4         RColorBrewer_1.1-3     ggrastr_1.0.1         
##  [4] magrittr_2.0.3         GenomicFeatures_1.46.5 AnnotationDbi_1.56.2  
##  [7] Biobase_2.54.0         GenomicRanges_1.46.1   GenomeInfoDb_1.30.1   
## [10] IRanges_2.28.0         S4Vectors_0.32.4       BiocGenerics_0.40.0   
## [13] ggseqlogo_0.1          pheatmap_1.0.12        cowplot_1.1.3         
## [16] ggrepel_0.9.3          stringr_1.5.0          purrr_1.0.2           
## [19] glue_1.8.0             scales_1.3.0           readr_2.1.4           
## [22] ggplot2_3.5.1          tidyr_1.3.0            dplyr_1.1.3           
## [25] here_1.0.1            
## 
## loaded via a namespace (and not attached):
##  [1] matrixStats_1.0.0           bitops_1.0-7               
##  [3] bit64_4.0.5                 filelock_1.0.2             
##  [5] progress_1.2.2              httr_1.4.7                 
##  [7] rprojroot_2.0.3             tools_4.1.2                
##  [9] bslib_0.5.1                 utf8_1.2.3                 
## [11] R6_2.5.1                    vipor_0.4.5                
## [13] DBI_1.1.3                   colorspace_2.1-0           
## [15] withr_2.5.0                 tidyselect_1.2.0           
## [17] prettyunits_1.1.1           bit_4.0.5                  
## [19] curl_5.0.2                  compiler_4.1.2             
## [21] cli_3.6.1                   xml2_1.3.5                 
## [23] DelayedArray_0.20.0         labeling_0.4.3             
## [25] rtracklayer_1.54.0          sass_0.4.7                 
## [27] rappdirs_0.3.3              digest_0.6.33              
## [29] Rsamtools_2.10.0            rmarkdown_2.24             
## [31] XVector_0.34.0              pkgconfig_2.0.3            
## [33] htmltools_0.5.6             MatrixGenerics_1.6.0       
## [35] highr_0.10                  dbplyr_2.3.3               
## [37] fastmap_1.1.1               rlang_1.1.1                
## [39] rstudioapi_0.15.0           RSQLite_2.3.1              
## [41] farver_2.1.1                jquerylib_0.1.4            
## [43] BiocIO_1.4.0                generics_0.1.3             
## [45] jsonlite_1.8.7              vroom_1.6.3                
## [47] BiocParallel_1.28.3         RCurl_1.98-1.12            
## [49] GenomeInfoDbData_1.2.7      Matrix_1.6-3               
## [51] ggbeeswarm_0.7.2            Rcpp_1.0.11                
## [53] munsell_0.5.0               fansi_1.0.4                
## [55] lifecycle_1.0.3             stringi_1.7.12             
## [57] yaml_2.3.7                  SummarizedExperiment_1.24.0
## [59] zlibbioc_1.40.0             BiocFileCache_2.2.1        
## [61] blob_1.2.4                  parallel_4.1.2             
## [63] crayon_1.5.2                lattice_0.20-45            
## [65] Biostrings_2.62.0           hms_1.1.3                  
## [67] KEGGREST_1.34.0             knitr_1.43                 
## [69] pillar_1.9.0                rjson_0.2.21               
## [71] biomaRt_2.50.3              XML_3.99-0.14              
## [73] evaluate_0.21               png_0.1-8                  
## [75] vctrs_0.6.3                 tzdb_0.4.0                 
## [77] gtable_0.3.4                cachem_1.0.8               
## [79] xfun_0.40                   restfulr_0.0.15            
## [81] tibble_3.2.1                beeswarm_0.4.0             
## [83] GenomicAlignments_1.30.0    memoise_2.0.1