# **MODIFY THIS CHUNK**
library(here)
proj_dir    <- here()
doc_id      <- "03"
out         <- here("output/03-chrombpnet/01-models/qc"); dir.create(out, recursive = TRUE)
figout      <- here("figures/03-chrombpnet/01-models", doc_id, "/"); dir.create(figout, recursive = TRUE)

1 Overview

Here we load and aggregate the QC metrics for the ChromBPNet models to evaluate them prior to interpretation. We set thresholds to define the set of models which will be used for downstream analysis.

2 Set up

library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(scales)
library(glue)
library(purrr)
library(stringr)
library(cowplot)

script_path <- here("code/utils/")
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))

ggplot2::theme_set(theme_min())

3 Load QC and visualize

Load general cluster info (number of fragments, number of cells):

# get number of cells
cluster_meta <- read_csv(here("output/05-misc/03/TableS2_cluster_meta_qc.csv"))
## Rows: 203 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Cluster, organ, organ_code, compartment, L1_annot, L2_annot, L3_an...
## dbl  (8): cluster_id, dend_order, ncell, median_numi, median_ngene, median_n...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# get number of fragments
n_frags <- read_tsv(here("output/01-preprocessing/03/fragmentsPerCluster.tsv")) %>% 
  dplyr::rename(total_frags = total_reads)
## Rows: 203 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): RNA_Clusters
## dbl (1): total_reads
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# put these together
cluster_meta <- cluster_meta %>% 
  left_join(n_frags, by = c("Cluster" = "RNA_Clusters")) %>% 
  # in the ChromBPNet analyses, StomachEsophagus is referred to as Stomach
  mutate(organ = ifelse(organ == "StomachEsophagus", "Stomach", organ))

sorted_cluster_order <- cluster_meta %>% arrange(organ, cluster_id) %>% pull(Cluster_ChromBPNet)

Then, we load the various ChromBPNet QC metrics:

bias_model <- "Heart_c0_thresh0.4"
model_dir  <- here("output/03-chrombpnet/01-models/models/", paste0("bias_", bias_model))
fs::dir_exists(model_dir)
## /oak/stanford/groups/wjg/skim/projects/HDMA-public/output/03-chrombpnet/01-models/models//bias_Heart_c0_thresh0.4 
##                                                                                                              TRUE
metrics    <- fs::dir_ls(model_dir, glob = "*/*/evaluation/chrombpnet_metrics.json", recurse = TRUE, type = "file")

# extract out cell type and fold from the path
clusters   <- stringr::str_split(metrics, "/") %>% sapply(getElement, 14)
folds      <- stringr::str_split(metrics, "/") %>% sapply(getElement, 15)

# load the response of the bias model
bias_resp  <- fs::dir_ls(model_dir, glob = "*/*/evaluation/chrombpnet_nobias_max_bias_response.txt",
                         recurse = TRUE, type = "file")

# loop over model performance metrics and read them in
chrombpnet_metrics <- pmap_dfr(list(metrics, clusters, folds),
                               ~ jsonlite::fromJSON(..1) %>%
                                 as.data.frame() %>% 
                                 tibble::add_column("Cluster" = ..2, .before = 1) %>% 
                                 tibble::add_column("Fold" = ..3, .after = 1))

# get tn5 motif response
read_bias <- function(response_file) {
  
  responses <- read_lines(response_file) %>% str_split("_") %>% .[[1]] %>% getElement(3) %>% str_split("/") %>% .[[1]]
  data.frame("tn5_motif" = paste0("tn5_motif_", 1:5),
             "response" = responses) %>% 
    pivot_wider(names_from = "tn5_motif", values_from = "response") %>% 
    mutate(Cluster = response_file %>% str_split("/") %>% .[[1]] %>% getElement(14),
           Fold = response_file %>% str_split("/") %>% .[[1]] %>% getElement(15))
  
}

bias_metrics <- map_dfr(bias_resp, read_bias)

Put all metrics together:

all_metrics <- right_join(cluster_meta, chrombpnet_metrics, by = c("Cluster_ChromBPNet" = "Cluster")) %>% 
  left_join(bias_metrics, by = c("Cluster_ChromBPNet" = "Cluster", "Fold" = "Fold")) %>% 
  mutate(Model = paste0(Cluster_ChromBPNet, "_", Fold)) %>% 
  dplyr::relocate(Model, .before = 1) %>% 
  arrange(organ, L1_annot) %>% 
  rowwise() %>% 
  mutate(max_tn5_response = max(tn5_motif_1, tn5_motif_2, tn5_motif_3, tn5_motif_4, tn5_motif_5))

head(all_metrics)
## # A tibble: 6 × 33
## # Rowwise: 
##   Model        Cluster organ organ_code cluster_id compartment L1_annot L2_annot
##   <chr>        <chr>   <chr> <chr>           <dbl> <chr>       <chr>    <chr>   
## 1 Adrenal_c0_… AG_0    Adre… AG                  0 epi         AG_0_ad… Adrenal…
## 2 Adrenal_c0_… AG_0    Adre… AG                  0 epi         AG_0_ad… Adrenal…
## 3 Adrenal_c0_… AG_0    Adre… AG                  0 epi         AG_0_ad… Adrenal…
## 4 Adrenal_c0_… AG_0    Adre… AG                  0 epi         AG_0_ad… Adrenal…
## 5 Adrenal_c0_… AG_0    Adre… AG                  0 epi         AG_0_ad… Adrenal…
## 6 Adrenal_c1_… AG_1    Adre… AG                  1 epi         AG_1_ad… Adrenal…
## # ℹ 25 more variables: L3_annot <chr>, dend_order <dbl>, ncell <dbl>,
## #   median_numi <dbl>, median_ngene <dbl>, median_nfrags <dbl>,
## #   median_tss <dbl>, median_frip <dbl>, note <chr>, organ_color <chr>,
## #   compartment_color <chr>, Cluster_ChromBPNet <chr>, total_frags <dbl>,
## #   Fold <chr>, counts_metrics.peaks.spearmanr <dbl>,
## #   counts_metrics.peaks.pearsonr <dbl>, counts_metrics.peaks.mse <dbl>,
## #   profile_metrics.peaks.median_jsd <dbl>, …

Sanity checks:

n_clusters <- 203
length(unique(all_metrics$Model)) == 5 * n_clusters
## [1] TRUE
length(unique(all_metrics$Cluster)) == n_clusters
## [1] TRUE
length(unique(all_metrics$organ)) == 12
## [1] TRUE

Filter out cell types where any of the folds has < 0.5 Spearman counts correlation.

celltypes_drop <- all_metrics %>%
  filter(counts_metrics.peaks.spearmanr < 0.5) %>% 
  pull(Cluster) %>% 
  unique()
celltypes_drop
## [1] "AG_7"  "AG_8"  "AG_9"  "EY_21"
all_metrics <- all_metrics %>% 
  mutate(Keep_cluster = ifelse(Cluster %in% celltypes_drop, FALSE, TRUE),
         Keep_fold    = case_when(
           Cluster %in% celltypes_drop ~ FALSE,
           TRUE ~ TRUE
         ))

models_to_keep <- all_metrics %>%
  dplyr::select(Model, Cluster_ChromBPNet, Cluster, Fold, Keep_cluster, Keep_fold) %>%
  group_by(Cluster_ChromBPNet, Cluster) %>%
  mutate(Keep_cluster = ifelse(all(Keep_cluster), TRUE, FALSE)) %>%
  filter(Keep_fold) %>%
  summarize(Folds_to_keep = glue_collapse(Fold, ",")) %>% 
  relocate(Cluster, .after = Folds_to_keep)
## `summarise()` has grouped output by 'Cluster_ChromBPNet'. You can override
## using the `.groups` argument.
models_to_keep %>% write_tsv(glue("{out}/chrombpnet_models_keep.tsv"), col_names = FALSE)

nrow(models_to_keep)
## [1] 199

Thus, at this stage, we’re keeping 199 models.

Prepare supplementary table:

all_metrics %>% 
  dplyr::select(organ, Cluster_ChromBPNet, L1_annot, number_of_cells = ncell, total_frags, Model, Fold, matches("metrics"), matches("tn5")) %>% 
  write_tsv(glue("{out}/chrombpnet_metrics.tsv"))

Correlation on the counts task:

summary(all_metrics$counts_metrics.peaks.pearsonr)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3879  0.7469  0.7802  0.7720  0.8042  0.8645
summary(all_metrics$counts_metrics.peaks.spearmanr)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.4079  0.6869  0.7320  0.7227  0.7686  0.8518
summary(all_metrics$profile_metrics.peaks.median_jsd)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3154  0.4879  0.5534  0.5594  0.6275  0.8118
cluster_meta <- cluster_meta %>% 
  mutate(Cluster_ChromBPNet = factor(Cluster_ChromBPNet, levels = sorted_cluster_order))

all_metrics_avg <- all_metrics %>% 
  group_by(Cluster_ChromBPNet, Cluster, dend_order, organ) %>% 
  summarize_at(.vars = vars(counts_metrics.peaks.pearsonr, counts_metrics.peaks.spearmanr, profile_metrics.peaks.median_jsd), .funs = mean) %>% 
  gather(stat, value, counts_metrics.peaks.pearsonr, counts_metrics.peaks.spearmanr, profile_metrics.peaks.median_jsd)

all_metrics_long <- all_metrics %>%
  gather(stat, value, counts_metrics.peaks.pearsonr, counts_metrics.peaks.spearmanr, profile_metrics.peaks.median_jsd)

p1 <- cluster_meta %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = total_frags)) +
  geom_bar(aes(fill = organ), stat = "identity") +
  geom_hline(yintercept = c(5e6), color = "black", linetype = "dashed") +
  scale_fill_manual(values = cmap_organ) +
  # put the scale on a log10 scale, but use non-logged numbers
  scale_y_log10(label = comma) +
  coord_cartesian(ylim = c(1e5, NA)) +
  facet_grid(. ~ organ, scales = "free_x", space = "free") +
  rotate_x() + no_legend() +
  theme(axis.text.x = element_blank(),
        panel.grid.major.y = element_line(color = "gray90")) +
  xlab(NULL) + ylab("Total # fragments") +
  ggtitle("ChromBPNet model performance")

p2 <- cluster_meta %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = ncell)) +
  geom_bar(aes(fill = organ), stat = "identity") +
  scale_fill_manual(values = cmap_organ) +
  # put the scale on a log10 scale, but use non-logged numbers
  scale_y_log10(label = comma) +
  facet_grid(. ~ organ, scales = "free_x", space = "free") +
  rotate_x() + no_legend() +
  theme(axis.text.x = element_blank(),
        strip.text.x = element_blank(),
        panel.grid.major.y = element_line(color = "gray90")) +
  xlab(NULL) + ylab("# cells")

p3 <- all_metrics_avg %>% 
  mutate(Cluster_ChromBPNet = factor(Cluster_ChromBPNet, levels = sorted_cluster_order)) %>% 
  filter(stat %in% c("counts_metrics.peaks.pearsonr")) %>% 
  mutate(stat = recode(stat, counts_metrics.peaks.spearmanr = "SpearmanR", counts_metrics.peaks.pearsonr = "PearsonR")) %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = value)) +
  geom_bar(aes(fill = organ), stat = "identity") +
  geom_point(data = all_metrics_long %>%
               filter(stat %in% c("counts_metrics.peaks.pearsonr")) %>% 
               mutate(stat = recode(stat, counts_metrics.peaks.spearmanr = "SpearmanR", counts_metrics.peaks.pearsonr = "PearsonR")), 
             aes(x = Cluster_ChromBPNet, y = value), shape = 21, size = 0.5) +
  scale_fill_manual(values = cmap_organ) +
  # geom_text(aes(label = round(value, 2), y = 0.88), size = 2, angle = 90) +
  facet_grid(. ~ organ, scales = "free_x", space = "free") +
  rotate_x() +
  scale_y_continuous(breaks = seq(0.3, 0.9, 0.1)) +
  coord_cartesian(ylim = c(0.3, 0.9)) +
  ylab("Correlation between predicted / observed counts") + no_legend() +
  theme(strip.text.x = element_blank(),
        panel.grid.major.y = element_line(color = "gray90"))

p4 <- all_metrics_avg %>% 
  mutate(Cluster_ChromBPNet = factor(Cluster_ChromBPNet, levels = sorted_cluster_order)) %>% 
  filter(stat %in% c("profile_metrics.peaks.median_jsd")) %>%
  mutate(stat = "Median JSD") %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = value)) +
  geom_bar(aes(fill = organ), stat = "identity") +
  geom_point(data = all_metrics_long %>%
               filter(stat %in% c("profile_metrics.peaks.median_jsd")) %>% 
               mutate(stat = "Median JSD"),
             aes(x = Cluster_ChromBPNet, y = value), shape = 21, size = 0.5) +
  scale_fill_manual(values = cmap_organ) +
  facet_grid(stat ~ organ, scales = "free_x", space = "free") +
  rotate_x() +
  coord_cartesian(ylim = c(0.3, 0.85)) +
  scale_y_continuous(breaks = seq(0.3, 0.85, 0.1)) +
  ylab("Median JSD between observed / predicted profiles") + no_legend() +
  theme(strip.text.x = element_blank(),
        panel.grid.major.y = element_line(color = "gray90")) +
  xlab("Cluster")

plot_grid(p1, p2, p3 + theme(axis.text.x = element_blank()) + xlab(NULL),
          p4, ncol = 1, rel_heights = c(1.5, 1, 1, 2), align = "v", axis = "rl")

Plot response to Tn5 motifs:

all_metrics %>% 
  mutate(Cluster_ChromBPNet = factor(Cluster_ChromBPNet, levels = sorted_cluster_order)) %>% 
  dplyr::select(Cluster_ChromBPNet, organ, Fold, matches("tn5_motif")) %>% 
  gather(Tn5_motif, Response, matches("tn5_motif")) %>% 
  mutate(Response = as.numeric(Response),
         Response = ifelse(Response > 0.01, 0.01, Response)) %>% 
  ggplot(aes(x = Cluster_ChromBPNet, y = Response)) +
  geom_hline(yintercept = 0.003, color = "gray90") +
  geom_jitter(stat = "identity", aes(color = organ, shape = Fold), width = 0.1, alpha = 0.8) +
  scale_y_continuous(breaks = seq(0.001, 0.01, 0.002)) +
  facet_grid(Tn5_motif ~ organ, scales = "free_x", space = "free") +
  scale_color_manual(values = cmap_organ) + 
  rotate_x() +
  ggtitle("ChromBPNet Tn5 motif response, plotting capped at 0.01") +
  coord_cartesian(ylim = c(0, 0.01))

Distribution of metrics (mean across folds) per cell type:

p1 <- all_metrics_avg %>%
  filter(stat == "counts_metrics.peaks.pearsonr") %>%
  ggplot(aes(x = 1, y = value)) +
  geom_boxplot(outlier.colour = NA, fill = "gray90") +
  geom_jitter(fill = "#3953A4", width = 0.25,
              shape = 21, size = 3, alpha = 0.9) +
  theme(
    panel.grid.major.y = element_line(),
    panel.grid.minor.y = element_line(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()) +
  ggtitle("Pearson R") + ylab("Value") + xlab(NULL)

p2 <- all_metrics_avg %>%
  filter(stat == "counts_metrics.peaks.spearmanr") %>%
  ggplot(aes(x = 1, y = value)) +
  geom_boxplot(outlier.colour = NA, fill = "gray90") +
  geom_jitter(fill = "#3953A4", width = 0.25,
              shape = 21, size = 3, alpha = 0.9) +
  theme(
    panel.grid.major.y = element_line(),
    panel.grid.minor.y = element_line(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()) +
  ggtitle("Spearman R") + ylab("Value") + xlab(NULL)

p3 <- all_metrics_avg %>%
  filter(stat == "profile_metrics.peaks.median_jsd") %>%
  ggplot(aes(x = 1, y = value)) +
  geom_boxplot(outlier.colour = NA, fill = "gray90") +
  geom_jitter(fill = "#3953A4", width = 0.25,
              shape = 21, size = 3, alpha = 0.9) +
  theme(
    panel.grid.major.y = element_line(),
    panel.grid.minor.y = element_line(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()) +
  ggtitle("Median JSD") + ylab("Value") + xlab(NULL)

plot_grid(p1, p2, p3, nrow = 1)

4 Session info

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3     ggrastr_1.0.1          magrittr_2.0.3        
##  [4] GenomicFeatures_1.46.5 AnnotationDbi_1.56.2   Biobase_2.54.0        
##  [7] GenomicRanges_1.46.1   GenomeInfoDb_1.30.1    IRanges_2.28.0        
## [10] S4Vectors_0.32.4       BiocGenerics_0.40.0    cowplot_1.1.3         
## [13] stringr_1.5.0          purrr_1.0.2            glue_1.8.0            
## [16] scales_1.3.0           readr_2.1.4            ggplot2_3.5.1         
## [19] tidyr_1.3.0            dplyr_1.1.3            here_1.0.1            
## 
## loaded via a namespace (and not attached):
##  [1] fs_1.6.3                    bitops_1.0-7               
##  [3] matrixStats_1.0.0           bit64_4.0.5                
##  [5] filelock_1.0.2              progress_1.2.2             
##  [7] httr_1.4.7                  rprojroot_2.0.3            
##  [9] tools_4.1.2                 bslib_0.5.1                
## [11] utf8_1.2.3                  R6_2.5.1                   
## [13] vipor_0.4.5                 DBI_1.1.3                  
## [15] colorspace_2.1-0            withr_2.5.0                
## [17] tidyselect_1.2.0            prettyunits_1.1.1          
## [19] bit_4.0.5                   curl_5.0.2                 
## [21] compiler_4.1.2              cli_3.6.1                  
## [23] xml2_1.3.5                  DelayedArray_0.20.0        
## [25] labeling_0.4.3              rtracklayer_1.54.0         
## [27] sass_0.4.7                  rappdirs_0.3.3             
## [29] digest_0.6.33               Rsamtools_2.10.0           
## [31] rmarkdown_2.24              XVector_0.34.0             
## [33] pkgconfig_2.0.3             htmltools_0.5.6            
## [35] MatrixGenerics_1.6.0        highr_0.10                 
## [37] dbplyr_2.3.3                fastmap_1.1.1              
## [39] rlang_1.1.1                 rstudioapi_0.15.0          
## [41] RSQLite_2.3.1               farver_2.1.1               
## [43] jquerylib_0.1.4             BiocIO_1.4.0               
## [45] generics_0.1.3              jsonlite_1.8.7             
## [47] vroom_1.6.3                 BiocParallel_1.28.3        
## [49] RCurl_1.98-1.12             GenomeInfoDbData_1.2.7     
## [51] Matrix_1.6-3                ggbeeswarm_0.7.2           
## [53] munsell_0.5.0               fansi_1.0.4                
## [55] lifecycle_1.0.3             stringi_1.7.12             
## [57] yaml_2.3.7                  SummarizedExperiment_1.24.0
## [59] zlibbioc_1.40.0             BiocFileCache_2.2.1        
## [61] grid_4.1.2                  blob_1.2.4                 
## [63] parallel_4.1.2              crayon_1.5.2               
## [65] lattice_0.20-45             Biostrings_2.62.0          
## [67] hms_1.1.3                   KEGGREST_1.34.0            
## [69] knitr_1.43                  pillar_1.9.0               
## [71] rjson_0.2.21                biomaRt_2.50.3             
## [73] XML_3.99-0.14               evaluate_0.21              
## [75] png_0.1-8                   vctrs_0.6.3                
## [77] tzdb_0.4.0                  gtable_0.3.4               
## [79] cachem_1.0.8                xfun_0.40                  
## [81] restfulr_0.0.15             tibble_3.2.1               
## [83] GenomicAlignments_1.30.0    beeswarm_0.4.0             
## [85] memoise_2.0.1