# **MODIFY THIS CHUNK**
library(here)
kundaje_dir <- trimws(readr::read_lines(here("code/AK_PROJ_DIR.txt")))
doc_id      <- "01"
out         <- here("output/03-chrombpnet/03-syntax/", doc_id); dir.create(out, recursive = TRUE)
figout      <- here("figures/03-chrombpnet/03-syntax", doc_id, "/"); dir.create(figout, recursive = TRUE)
chrombpnet_dir <- here("output/03-chrombpnet")

1 Overview

In this notebook, we do global investigations of the de novo motifs:

computing stats on the motif instances such as frequencies per peak
summarize each motif’s distribution of distances to nucleosome dyads and peak flanks
distribution of motif instances in genomic features
distribution of motif instances in tissues and tissue compartments

2 Set up

library(glue)
library(scales)
library(ggplot2)
library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(stringr)
library(ggrepel)
library(cowplot)
library(plotly)
library(ggseqlogo)
library(BPCells)

script_path <- here("code/utils/")
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))
source(file.path(script_path, "chrombpnet_utils.R"))

ggplot2::theme_set(theme_BOR())

3 Load & wrangle data

3.1 Load cluster, motif, and hits data

Cluster metadata:

cluster_meta <- read_csv(here("output/05-misc/03/TableS2_cluster_meta_qc.csv"))

## Rows: 203 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Cluster, organ, organ_code, compartment, L1_annot, L2_annot, L3_an...
## dbl  (8): cluster_id, dend_order, ncell, median_numi, median_ngene, median_n...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

chrombpnet_models_keep <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_models_keep2.tsv"),
                                   col_names = c("Cluster", "Folds_keep", "Cluster_ID"))

## Rows: 189 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Cluster, Folds_keep, Cluster_ID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

cluster_order <- cluster_meta %>% arrange(organ, L1_annot) %>% pull(Cluster_ChromBPNet)
organs <- unique(cluster_meta$organ) %>% sort()
organs[organs == "StomachEsophagus"] <- "Stomach"

head(chrombpnet_models_keep)

## # A tibble: 6 × 3
##   Cluster    Folds_keep                         Cluster_ID
##   <chr>      <chr>                              <chr>     
## 1 Adrenal_c0 fold_0,fold_1,fold_2,fold_3,fold_4 AG_0      
## 2 Adrenal_c1 fold_0,fold_1,fold_2,fold_3,fold_4 AG_1      
## 3 Adrenal_c2 fold_0,fold_1,fold_2,fold_3,fold_4 AG_2      
## 4 Adrenal_c3 fold_0,fold_1,fold_2,fold_3,fold_4 AG_3      
## 5 Brain_c0   fold_0,fold_1,fold_2,fold_3,fold_4 BR_0      
## 6 Brain_c1   fold_0,fold_1,fold_2,fold_3,fold_4 BR_1

Load metrics:

chrombpnet_metrics <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_metrics.tsv")) %>% 
  dplyr::select(Cluster_ChromBPNet, total_frags) %>% 
  distinct()

## Rows: 1015 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (5): organ, Cluster_ChromBPNet, L1_annot, Model, Fold
## dbl (13): number_of_cells, total_frags, counts_metrics.peaks.spearmanr, coun...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Load number of peaks:

npeaks <- read_table(here("output/03-chrombpnet/01-models/qc/npeaks.tsv"),
                     col_names = c("n_peaks", "peak_file")) %>% 
  filter(peak_file != "total")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   n_peaks = col_double(),
##   peak_file = col_character()
## )

nrow(npeaks) == 203

## [1] TRUE

npeaks <- npeaks %>% 
  rowwise() %>% 
  mutate(peak_file = basename(peak_file)) %>% 
  separate(peak_file, into = c("Cluster_ChromBPNet", "drop"), sep = "__") %>% 
  dplyr::select(-drop)

cluster_meta <- cluster_meta %>%
  left_join(npeaks, by = "Cluster_ChromBPNet") %>% 
  left_join(chrombpnet_metrics, by = "Cluster_ChromBPNet")

Load merged modisco reports:

modisco_merged <- read_tsv(here("output/03-chrombpnet/02-compendium/modisco_compiled_anno/modisco_merged_reports.tsv")) %>% 
  # get short cluster ID
  left_join(cluster_meta %>% dplyr::select(Cluster, Cluster_ChromBPNet),
            by = c("component_celltype" = "Cluster_ChromBPNet"))

## Rows: 6362 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): merged_pattern, component_celltype, pattern_class, pattern, compone...
## dbl (1): n_seqlets
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(modisco_merged)

## # A tibble: 6 × 7
##   merged_pattern              component_celltype pattern_class pattern n_seqlets
##   <chr>                       <chr>              <chr>         <chr>       <dbl>
## 1 neg.Average_12__merged_pat… Spleen_c5          neg_patterns  patter…        88
## 2 neg.Average_12__merged_pat… Spleen_c0          neg_patterns  patter…       461
## 3 neg.Average_12__merged_pat… Skin_c5            neg_patterns  patter…       302
## 4 neg.Average_12__merged_pat… Heart_c13          neg_patterns  patter…       128
## 5 neg.Average_12__merged_pat… Skin_c7            neg_patterns  patter…       708
## 6 neg.Average_12__merged_pat… Heart_c5           neg_patterns  patter…        47
## # ℹ 2 more variables: component_pattern <chr>, Cluster <chr>

Load annotated patterns:

# manually annotated motif patterns
motifs_compiled_all <- read_tsv(here("code/03-chrombpnet/02-compendium/04d-ChromBPNet_de_novo_motifs.tsv")) %>% 
  mutate(motif_name = paste0(idx_uniq, "|", annotation)) %>% 
  dplyr::relocate(motif_name, .after = pattern) %>%
  dplyr::relocate(idx_uniq, .after = idx) %>% 
  # rename categories
  mutate(category = ifelse(annotation == "exclude", "exclude", category),
         category = case_match(category,
                               "composite_homo" ~ "homocomposite",
                               "composite_hetero" ~ "heterocomposite",
                               .default = category),
         category = factor(category, levels = names(cmap_category)),
         pattern_class = dplyr::recode(pattern_class, pos = "pos_patterns", neg = "neg_patterns"))

## Rows: 834 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (21): pattern_class, pattern, component_organs, component_celltypes, que...
## dbl (14): idx, total_n_seqlets, n_component_patterns, n_component_celltypes,...
## lgl  (6): modisco_cwm_fwd, modisco_cwm_rev, match0_logo, match0_logo_jolma, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

motifs_compiled <- motifs_compiled_all %>% 
  # filter out exclude motifs
  filter(category != "exclude") %>% 
  # extract the name of the known motif
  separate(match0, into = c("TF0", "motif0"), sep = ": ")

# for cases with multiple patterns merged into one motif, take the one with the most
# number of seqlets.
motifs_compiled_unique <- motifs_compiled %>% 
  group_by(idx_uniq) %>% 
  slice_max(n = 1, order_by = tibble(total_n_seqlets, n_component_celltypes),
            with_ties = FALSE)

dim(motifs_compiled_unique)

## [1] 508  43

# get positive and negative patterns
pos_patterns <- motifs_compiled_unique %>% filter(pattern_class == "pos_patterns") %>% pull(motif_name)
length(pos_patterns)

## [1] 493

neg_patterns <- motifs_compiled_unique %>% filter(pattern_class == "neg_patterns") %>% pull(motif_name)
length(neg_patterns)

## [1] 15

# initial breakdown of motif categories, prior manual collapsing
table(motifs_compiled_all$category)

## 
##            base base_with_flank   homocomposite heterocomposite      unresolved 
##             147             128             135             263              43 
##          repeat         partial         exclude 
##               7               4             107

# breakdown after manual collapsing & removing low-quality motifs
table(motifs_compiled_unique$category)

## 
##            base base_with_flank   homocomposite heterocomposite      unresolved 
##              86              51             107             222              38 
##          repeat         partial         exclude 
##               2               2               0

dim(motifs_compiled_unique)

## [1] 508  43

Thus, in total we have:

508 unique motifs learned from the dataset
493 positive patterns
15 negative patterns

NOTE: There are 509 unique motifs, and the labels go from 0 to 508 (which spans 509 values). The “missing label” is for idx_uniq 466, which corresponded to unresolved motifs which were filtered out.

Load reconciled hit calls:

finemo_param <- "counts_v0.23_a0.8_all"

# load hit counts per motif per cell type
hits <- map(chrombpnet_models_keep$Cluster,
            ~ data.table::fread(here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/{finemo_param}/hits_per_motif.tsv")),
                                data.table = FALSE) %>%
              dplyr::rename(n = count) %>% 
              tibble::add_column(Cluster = .x, .before = 1)) %>%
  setNames(chrombpnet_models_keep$Cluster)

# sanity check
n_cols <- map_dbl(hits, ncol)

if (any(n_cols < 12)) {
  
  missing_cols <- names(n_cols[n_cols < 12])
  stop("@ Missing columns in ", glue_collapse(missing_cols, ", "))
  
}

saveRDS(hits, file = glue("{out}/hits.Rds"))

# load summary stats on hits per peak per cell type
hits_per_peak <- map_dfr(chrombpnet_models_keep$Cluster,
                         ~ data.table::fread(here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/{finemo_param}/hits_per_peak.tsv")),
                                             data.table = FALSE) %>%
                           tibble::add_column(Cluster = .x, .before = 1))

Calculate total number of hits per pattern per cell type:

hits_all <- bind_rows(hits)

if (!all(motifs_compiled_unique$motif_name %in% hits_all$motif_name) | !all(hits_all$motif_name %in% motifs_compiled$motif_name)) {
  
  stop("@ Motif names in annotation don't match motif names in hits.")
  
}

hits_all_totals <- hits_all %>%
  group_by(motif_name) %>%
  summarize(total_hits = sum(n))

# organ w/ most hits
hits_top_organ <- hits_all %>% 
  left_join(cluster_meta %>% dplyr::select(Cluster = Cluster_ChromBPNet, organ)) %>% 
  group_by(motif_name, organ) %>% 
  summarize(hits_per_organ = sum(n)) %>% 
  group_by(motif_name) %>% 
  slice_max(n = 1, order_by = hits_per_organ, with_ties = FALSE) %>% 
  dplyr::select(motif_name, top_organ = organ)

## Joining with `by = join_by(Cluster)`
## `summarise()` has grouped output by 'motif_name'. You can override using the
## `.groups` argument.

dim(hits_all_totals)

## [1] 508   2

dim(hits_top_organ)

## [1] 508   2

motifs_compiled_unique <- hits_all_totals %>% 
  left_join(motifs_compiled_unique, by = "motif_name") %>% 
  left_join(hits_top_organ, by = "motif_name") %>% 
  mutate(motif_name_safe = gsub("\\/|\\|", ".", motif_name)) %>% 
  mutate(modisco_cwm_fwd = here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/trimmed_logos/{pattern_class}.{pattern}.cwm.fwd.png")),
         modisco_cwm_rev = here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/trimmed_logos/{pattern_class}.{pattern}.cwm.rev.png")))

# also subset to positive and neg patterns
motifs_compiled_unique_pos <- motifs_compiled_unique %>% 
  filter(pattern_class == "pos_patterns")

motifs_compiled_unique_neg <- motifs_compiled_unique %>% 
  filter(pattern_class == "neg_patterns")

dim(motifs_compiled_unique_pos)

## [1] 493  46

hits_all_anno <- hits_all %>% 
  left_join(cluster_meta %>% dplyr::rename(Cluster_short = Cluster), by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  left_join(motifs_compiled_unique %>% dplyr::select(motif_name, pattern, annotation_broad, category))

## Joining with `by = join_by(motif_name)`

head(hits_all_anno)

##      Cluster                           motif_name pattern_class      n Distal
## 1 Adrenal_c0                    392|NR:ESRRA/NR5A  pos_patterns 158975  52205
## 2 Adrenal_c0                         456|ZEB/SNAI  neg_patterns 122904  31275
## 3 Adrenal_c0                 132|BZIP:FOSL/JUND#1  pos_patterns  52506  17353
## 4 Adrenal_c0                           436|SP/KLF  pos_patterns  45828   6164
## 5 Adrenal_c0                           260|GATA#1  pos_patterns  33086  11270
## 6 Adrenal_c0 507|unresolved,repressive,YY1/2-like  neg_patterns  32037   8234
##   Exonic Intronic Promoter mean_distToTSS median_distToTSS
## 1   9379    84616    12775       20180.92           8859.0
## 2   8002    48500    35127       11567.57           2944.0
## 3   2527    28584     4042       20839.47           9456.0
## 4   2238     9426    28000        3203.65            140.0
## 5   1370    18616     1830       22932.57          10593.5
## 6   2288    12955     8560       12456.09           3584.0
##   mean_distToPeakSummit median_distToPeakSummit Cluster_short   organ
## 1                129.77                      67          AG_0 Adrenal
## 2                171.96                     128          AG_0 Adrenal
## 3                114.59                      50          AG_0 Adrenal
## 4                 88.78                      52          AG_0 Adrenal
## 5                128.59                      74          AG_0 Adrenal
## 6                150.93                     114          AG_0 Adrenal
##   organ_code cluster_id compartment              L1_annot
## 1         AG          0         epi AG_0_adrenal cortex 1
## 2         AG          0         epi AG_0_adrenal cortex 1
## 3         AG          0         epi AG_0_adrenal cortex 1
## 4         AG          0         epi AG_0_adrenal cortex 1
## 5         AG          0         epi AG_0_adrenal cortex 1
## 6         AG          0         epi AG_0_adrenal cortex 1
##                 L2_annot       L3_annot dend_order ncell median_numi
## 1 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 2 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 3 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 4 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 5 Adrenal adrenal cortex adrenal cortex          7   823        9498
## 6 Adrenal adrenal cortex adrenal cortex          7   823        9498
##   median_ngene median_nfrags median_tss median_frip
## 1         3280          6784      9.832   0.2906086
## 2         3280          6784      9.832   0.2906086
## 3         3280          6784      9.832   0.2906086
## 4         3280          6784      9.832   0.2906086
## 5         3280          6784      9.832   0.2906086
## 6         3280          6784      9.832   0.2906086
##                                                                         note
## 1 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 2 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 3 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 4 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 5 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 6 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
##   organ_color compartment_color n_peaks total_frags
## 1     #876941           #11A579  121634     7407892
## 2     #876941           #11A579  121634     7407892
## 3     #876941           #11A579  121634     7407892
## 4     #876941           #11A579  121634     7407892
## 5     #876941           #11A579  121634     7407892
## 6     #876941           #11A579  121634     7407892
##                             pattern annotation_broad   category
## 1 pos.Average_256__merged_pattern_0               NR       base
## 2  neg.Average_12__merged_pattern_0         ZEB/SNAI       base
## 3 pos.Average_287__merged_pattern_0             BZIP       base
## 4 pos.Average_212__merged_pattern_0           SP/KLF       base
## 5 pos.Average_248__merged_pattern_0             GATA       base
## 6  neg.Average_15__merged_pattern_1       unresolved unresolved

# breakdown of motif categories per pos patterns
table(motifs_compiled_unique_pos$category)

## 
##            base base_with_flank   homocomposite heterocomposite      unresolved 
##              81              51             102             221              34 
##          repeat         partial         exclude 
##               2               2               0

# save as TSV for re-use
motifs_compiled_unique %>% write_tsv(glue("{out}/motifs_compiled_unique.tsv"))

# short version
motifs_compiled_unique %>%
  arrange(idx_uniq) %>% 
  dplyr::select(motif_name, total_hits, idx, idx_uniq, pattern_class, pattern,
                annotation, annotation_broad, category,
                TF0, motif0, qval0, top_organ, component_organs, component_celltypes) %>% 
  write_tsv(glue("{out}/motifs_compiled_unique_short.tsv"))

hits_all_anno %>% write_tsv(glue("{out}/hits_per_cluster_annotated.tsv"))

Prep the hit counts matrices across cell types:

dim(hits_all)

## [1] 33054    12

mat_hits <- hits_all %>%
  filter(pattern_class == "pos_patterns") %>% 
  left_join(motifs_compiled_unique_pos, by = c("motif_name", "pattern_class")) %>%
  filter(category %in% c("base", "base_with_flank", "homocomposite", "heterocomposite")) %>%
  dplyr::select(Cluster, motif_name, n) %>%
  spread(motif_name, n, fill = 0) %>%
  tibble::column_to_rownames(var = "Cluster") %T>%
  write_tsv(glue("{out}/hit_matrix.pos_patterns_no_unresolved.tsv"))

dim(mat_hits)

## [1] 189 455

hits_all_broad <- hits_all %>%
  filter(pattern_class == "pos_patterns") %>% 
  left_join(motifs_compiled_unique_pos, by = "motif_name") %>%
  filter(category %in% c("base", "base_with_flank", "homocomposite", "heterocomposite")) %>%
  group_by(annotation_broad, Cluster) %>%
  summarize(n = sum(n))

## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

mat_hits_broad <- hits_all_broad %>%
  spread(annotation_broad, n, fill = 0) %>%
  tibble::column_to_rownames(var = "Cluster") %T>%
  write_tsv(glue("{out}/hit_matrix.broad_pos_patterns_no_unresolved.tsv"))
dim(mat_hits_broad)

## [1] 189 169

mat_hits_unresolved <- hits_all %>%
  filter(pattern_class == "pos_patterns") %>% 
  left_join(motifs_compiled_unique_pos, by = "motif_name") %>%
  filter(category == "unresolved") %>%
  dplyr::select(motif_name, n, Cluster) %>%
  spread(motif_name, n, fill = 0) %>%
  as.data.frame() %>%
  tibble::column_to_rownames(var = "Cluster")

dim(mat_hits_unresolved)

## [1] 189  34

Load hit counts for binned distances to nucleosomes:

# load motif hit counts per binned distance to nucleosome dyads
hit_dyad_dist <- map_dfr(chrombpnet_models_keep$Cluster,
                         ~ data.table::fread(
                           here(glue("output/03-chrombpnet/02-compendium/nucleoatac/{.x}/{.x}.motif_dist_to_dyad.tsv")),
                           data.table = FALSE) %>%
                           tibble::add_column(Cluster = .x, .before = 1))

hit_dyad_dist <- left_join(hit_dyad_dist, motifs_compiled_unique, by = "motif_name")

Load hit counts for binned distances to peak summits:

# load motif hit counts per binned distance to nucleosome dyads
hit_summit_dist <- map_dfr(chrombpnet_models_keep$Cluster,
                           ~ data.table::fread(
                             here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/counts_v0.23_a0.8_all/motif_dist_to_summit.tsv")),
                             data.table = FALSE) %>%
                             tibble::add_column(Cluster = .x, .before = 1))

hit_summit_dist <- left_join(hit_summit_dist, motifs_compiled_unique, by = "motif_name")

We also count the number of unique peaks in each cluster that contain at least one motif instance, separately for positive and negative patterns:


cd ../../../output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks
for class in ( pos_patterns neg_patterns ); do
for i in *; do
n_peaks=`zcat ${i}/counts_v0.23_a0.8_all/hits_unique.reconciled.annotated.tsv.gz | grep ${class} | cut -f 14 | sort | uniq | wc -l`
echo -e "${i}\t${n_peaks}" >> .unique_peaks_cluster.${class}.txt
done
done

Load the counts:

peaks_w_pos_hits <- read_tsv(here("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/.unique_peaks_cluster.pos_patterns.txt"),
                             col_names = c("Cluster", "N_peaks_w_hits")) %>% 
  mutate(pattern_class = "pos_patterns")

## Rows: 189 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Cluster
## dbl (1): N_peaks_w_hits
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

peaks_w_neg_hits <- read_tsv(here("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/.unique_peaks_cluster.neg_patterns.txt"),
                             col_names = c("Cluster", "N_peaks_w_hits")) %>% 
  mutate(pattern_class = "neg_patterns")

## Rows: 189 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Cluster
## dbl (1): N_peaks_w_hits
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

peaks_w_hits <- bind_rows(peaks_w_pos_hits,
                          peaks_w_neg_hits) %>%
  left_join(cluster_meta %>% dplyr::select(organ, Cluster = Cluster_ChromBPNet, n_peaks), by = "Cluster") %>% 
  mutate(Prop_peaks_w_hits = N_peaks_w_hits / n_peaks)

Load HOCOMOCO:

hocomoco <- read_tsv(here("output/02-global_analysis/04/hocomoco_unique.tsv"))

## Rows: 949 Columns: 662
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr  (45): name, original_motif.name, original_motif.tf, original_motif.moti...
## dbl (617): original_motif.subtype_info.datatype_counts.P, original_motif.sub...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

unique_classes <- unique(hocomoco$masterlist_info.tfclass_class)
cmap_class <- c(ArchR::paletteDiscrete(unique_classes, set = "stallion"), "N/A" = "gray90")

## Length of unique values greater than palette, interpolating..

Load CWMs for the compiled motifs:

# use custom helper
cwm_list <- read_cwm_meme(here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/modisco_compiled.memedb.txt")))

## Could not find strand info, assuming +.

length(cwm_list)

## [1] 834

cwm_list <- cwm_list[base::intersect(names(cwm_list), motifs_compiled_unique$pattern)]
length(cwm_list)

## [1] 508

names(cwm_list) <- plyr::mapvalues(names(cwm_list), from = motifs_compiled_unique$pattern, to = motifs_compiled_unique$motif_name)

# label matrix dimensions
cwm_list <- map(cwm_list, function(cwm) {
  
  rownames(cwm) <- c("A", "C", "G", "T")
  colnames(cwm) <- seq(1:30)
  
  return(cwm)
  
})

print(length(cwm_list))

## [1] 508

saveRDS(cwm_list, file = glue("{out}/denovo_cwm_list.Rds"))

3.2 Save intermediate objects

save(chrombpnet_models_keep, cluster_meta, motifs_compiled_unique, motifs_compiled_all,
     cwm_list, hit_dyad_dist, hit_summit_dist, hits_all_anno, hits_per_peak,
     file = glue("{out}/intermediate_obj.Rda"))

To reload for later analysis:

load(glue("{out}/intermediate_obj.Rda"))
hits <- readRDS(glue("{out}/hits.Rds"))
hits_all <- bind_rows(hits)

3.3 Rename trimmed CWM logos

fs::dir_create(file.path(out, "trimmed_cwm_logos"))

all(fs::file_exists(motifs_compiled_unique$modisco_cwm_fwd))
all(fs::file_exists(motifs_compiled_unique$modisco_cwm_rev))

for (i in 1:nrow(motifs_compiled_unique)) {
  
  new_name_fw <- paste0(motifs_compiled_unique$motif_name_safe[i], "__cwm.fwd.png")
  fs::file_copy(motifs_compiled_unique$modisco_cwm_fwd[i], file.path(out, "trimmed_cwm_logos", new_name_fw), overwrite = TRUE)
  
  new_name_rev <- paste0(motifs_compiled_unique$motif_name_safe[i], "__cwm.rev.png")
  fs::file_copy(motifs_compiled_unique$modisco_cwm_rev[i], file.path(out, "trimmed_cwm_logos", new_name_rev), overwrite = TRUE)
  
}

3.4 Prep table

motifs_compiled_unique %>% 
  mutate(
    class = case_when(
      pattern_class == "pos_patterns" ~ "positive",
      TRUE ~ "negative"
    ),
    cwm_logo_fwd = paste0(motif_name_safe, "__cwm.fwd.png"),
    cwm_logo_rev = paste0(motif_name_safe, "__cwm.rev.png"),
  ) %>% 
  dplyr::select(
    idx_uniq,
    motif_name,
    motif_name_safe,
    class,
    total_instances_across_celltypes = total_hits,
    cwm_logo_fwd,
    cwm_logo_rev,
    query_consensus,
    annotation,
    annotation_broad,
    category,
    best_match_TF = TF0,
    best_match_motif = motif0,
    best_match_TOMTOM_qval = qval0,
    best_match_TFs_in_family = match0_TFs_in_family,
    best_match_TFs_in_Vierstra_archetype = match0_TFs_in_archetype,
    total_seqlets_across_celltypes = total_n_seqlets,
    merged_pattern = pattern,
    n_constituent_motifs = n_component_patterns,
    n_constituent_celltypes = n_component_celltypes,
    constituent_organs = component_organs,
    constituent_celltypes = component_celltypes
  ) %>% 
  arrange(idx_uniq) %>% 
  write_csv(glue("{out}/TABLE_motif_compendium.csv"))

4 Global view

4.1 Categories of motifs

What kinds of patterns did we detect?

First, let’s look at all 834 motifs from the motif clustering workflow.

p1 <- motifs_compiled_all %>% 
  group_by(category) %>% 
  count() %>% 
  # reverse levels since we're flipping coordinates
  mutate(category = factor(category, levels = rev(names(cmap_category)))) %>% 
  ggplot(aes(x = category, y = n)) +
  geom_col(aes(fill = category)) +
  geom_text(aes(label = n), hjust = -0.5, fontface = "bold", size = 4) +
  coord_flip() +
  scale_fill_manual(values = cmap_category) +
  ggtitle("De novo deep learning derived motifs \n(after MoDISco agggregation step)") +
  no_legend() +
  theme(panel.grid.major.x = element_line(color = "gray90"),
        panel.grid.minor.x = element_line(color = "gray90")) +
  ylim(c(0, 280))

p2 <- motifs_compiled_all %>% 
  ggplot(aes(x = "breakdown")) +
  geom_bar(aes(fill = category), position = "fill") +
  scale_fill_manual(values = cmap_category)

plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(0.65, 0.35))

Next, let’s look at the unique motifs after QC and collapsing motifs:

# stacked barplot
p1 <- motifs_compiled_unique %>% 
  ggplot(aes(x = "breakdown")) +
  geom_bar(aes(fill = category), position = "fill") +
  scale_fill_manual(values = cmap_category) +
  xlab(NULL) +
  theme(legend.position = "bottom") +
  no_legend()

# barplot of counts per category
p2 <- motifs_compiled_unique %>% 
  group_by(category) %>% 
  count() %>% 
  # reverse levels since we're flipping coordinates
  mutate(category = factor(category, levels = rev(names(cmap_category)))) %>% 
  ggplot(aes(x = category, y = n)) +
  geom_col(aes(fill = category)) +
  geom_text(aes(label = n), hjust = -0.5, fontface = "bold", size = 4) +
  coord_flip() +
  scale_fill_manual(values = cmap_category) +
  ggtitle("De novo deep learning derived motifs \n(after manual collapsing & removing low quality motifs)") +
  no_legend() +
  theme(panel.grid.major.x = element_line(color = "gray90"),
        panel.grid.minor.x = element_line(color = "gray90")) +
  ylim(c(0, 280))


plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(1, 4, 3.5))

4.2 Positive and negative patterns

p1 <- motifs_compiled_unique %>% 
  group_by(pattern_class) %>% 
  count() %>% 
  ggplot(aes(x = "1", y = n)) +
  geom_col(aes(fill = pattern_class), alpha = 0.4) +
  coord_flip() +
  scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
  ggtitle("De novo deep learning derived motifs") +
  no_legend() +
  theme(panel.grid.major.x = element_line(color = "gray90"),
        panel.grid.minor.x = element_line(color = "gray90"))

p2 <- hits_all %>%
  left_join(cluster_meta %>% dplyr::select(Cluster = Cluster_ChromBPNet, organ)) %>% 
  group_by(organ, pattern_class) %>%
  summarize(n_hits = sum(n)) %>%
  mutate(organ = factor(organ, levels = rev(unique(.$organ)))) %>% 
  ggplot(aes(x = organ, y = n_hits)) +
  geom_col(aes(fill = pattern_class), alpha = 0.4, position = "fill") +
  coord_flip() +
  scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
  ggtitle("Breakdown of total hits") +
  no_legend() +
  theme(panel.grid.major.x = element_line(color = "gray90"),
        panel.grid.minor.x = element_line(color = "gray90"))

## Joining with `by = join_by(Cluster)`
## `summarise()` has grouped output by 'organ'. You can override using the
## `.groups` argument.

plot_grid(p1, p2, ncol = 1, align = "v", axis = "rl", rel_heights = c(1, 4))

4.3 Similarity to known motifs

Distribution of TOMTOM q-values for similarity to known motifs, and total # hits per motif:

p1 <- motifs_compiled_unique %>%
  mutate(similarity = qval0) %>%
  ggplot(aes(x = category, y = -log10(qval0))) +
  geom_boxplot(aes(fill = category), outliers = FALSE) +
  geom_jitter(shape = 21, width = 0.2, size = 0.5) +
  scale_fill_manual(values = cmap_category) +
  geom_hline(yintercept = -log10(0.05)) +
  no_legend() +
  xlab(NULL) +
  rotate_x()

p2 <- motifs_compiled_unique %>% 
  ggplot(aes(x = category, y = log10(total_hits))) +
  geom_boxplot(aes(fill = category), outlier.colour = NA) +
  geom_jitter(color = "black", width = 0.2, size = 0.5, shape = 21) +
  scale_fill_manual(values = cmap_category) +
  rotate_x() +
  no_legend() +
  # ggtitle("Total # of hits per unique motif \neach point is a motif (n=525)") +
  coord_cartesian(ylim = c(1, 7))

plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb")

Total number of hits per cell type:

hits_all %>%
  group_by(Cluster, pattern_class) %>%
  summarize(total_hits = sum(n)) %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  arrange(organ, L1_annot) %>% 
  mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>% 
  ggplot(aes(x = Cluster, y = total_hits)) +
  facet_wrap(~ pattern_class, ncol = 1) +
  geom_bar(stat = "identity", aes(fill = organ)) +
  scale_fill_manual(values = cmap_organ) +
  rotate_x() +
  ylab("Total number of hits") +
  theme(
    strip.background = element_blank(),
    axis.text.x = element_text(size = 5),
    panel.grid.major.y = element_line(color = "gray90"),
    panel.grid.minor.y = element_line(color = "gray90")) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Total hits per cell type")

## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.

4.4 Total hits

By class (positive/negative patterns):

hits_all %>%
  left_join(motifs_compiled_unique %>% dplyr::select(-pattern_class),
            by = c("motif_name")) %>%
  group_by(Cluster, pattern_class) %>%
  summarize(n_per_class = sum(n)) %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  arrange(organ, L1_annot) %>% 
  mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>% 
  ggplot(aes(x = Cluster, y = n_per_class)) +
  geom_bar(stat = "identity", aes(fill = pattern_class), alpha = 0.4, position = "fill") +
  scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
  rotate_x() +
  ylab("Total number of hits") +
  theme(
    axis.text.x = element_text(size = 5),
    panel.grid.major.y = element_line(color = "gray90"),
    panel.grid.minor.y = element_line(color = "gray90")) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Total hits, by pattern class")

## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.

Let’s plot total hits and mean hits per peak, grouped by organ:

peaks_w_hits %>% 
  group_by(pattern_class, organ) %>% 
  summarize(median = median(Prop_peaks_w_hits)) %>% 
  spread(pattern_class, median)

## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.

## # A tibble: 12 × 3
##    organ            neg_patterns pos_patterns
##    <chr>                   <dbl>        <dbl>
##  1 Adrenal                 0.609        0.907
##  2 Brain                   0.803        0.897
##  3 Eye                     0.808        0.880
##  4 Heart                   0.804        0.917
##  5 Liver                   0.551        0.919
##  6 Lung                    0.814        0.921
##  7 Muscle                  0.870        0.934
##  8 Skin                    0.891        0.945
##  9 Spleen                  0.794        0.922
## 10 StomachEsophagus        0.731        0.904
## 11 Thymus                  0.712        0.919
## 12 Thyroid                 0.707        0.870

p0 <- peaks_w_hits %>% 
  mutate(pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>% 
  ggplot(aes(x = organ, y = Prop_peaks_w_hits)) +
  geom_jitter(stat = "identity", aes(fill = organ), shape = 21, color = "white", size = 2, alpha = 0.6, width = 0.25) +
  scale_fill_manual(values = cmap_organ) +
  rotate_x() +
  facet_wrap(~ pattern_class, ncol = 1) +
  ylab("Proportion") +
  theme(
    strip.background = element_blank(),
    panel.grid.major.y = element_line(color = "gray90"),
    panel.grid.minor.y = element_line(color = "gray90")) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Peaks with hits") +
  no_legend()

p1 <- hits_all %>% 
  group_by(Cluster, pattern_class) %>%
  summarize(total_hits = sum(n)) %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  arrange(organ, L1_annot) %>% 
  mutate(Cluster = factor(Cluster, levels = unique(.$Cluster)),
         pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>% 
  ggplot(aes(x = organ, y = total_hits)) +
  geom_jitter(stat = "identity", aes(fill = organ), shape = 21, color = "white", size = 2, alpha = 0.6, width = 0.25) +
  scale_fill_manual(values = cmap_organ) +
  rotate_x() +
  facet_wrap(~ pattern_class, ncol = 1) +
  ylab("Total number of hits") +
  theme(
    strip.background = element_blank(),
    panel.grid.major.y = element_line(color = "gray90"),
    panel.grid.minor.y = element_line(color = "gray90")) +
  scale_y_continuous(labels = scales::comma) +
  ggtitle("Total hits") +
  no_legend()

## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.

p2 <- hits_per_peak %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>% 
  arrange(organ, L1_annot) %>% 
  mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>% 
  gather(pattern_class, median_n_hits, median_pos_patterns, median_neg_patterns) %>% 
  mutate(pattern_class = factor(pattern_class, levels = c("median_pos_patterns", "median_neg_patterns"))) %>% 
  ggplot(aes(x = organ, y = median_n_hits)) +
  ggbeeswarm::geom_quasirandom(
    aes(fill = organ), shape = 21, color = "white", size = 2, alpha = 0.6) + #, width = 0.25, height = 0) +
  scale_fill_manual(values = cmap_organ) +
  facet_wrap(~ pattern_class, ncol = 1) +
  rotate_x() +
  ylab("Median number of hits per peak") +
  theme(
    strip.background = element_blank(),
    panel.grid.major.y = element_line(color = "gray90"),
    panel.grid.minor.y = element_line(color = "gray90")) +
  scale_y_continuous(breaks = seq(0, 10, 1), labels = seq(0, 10), limits = c(0, 9)) +
  no_legend() +
  ggtitle("Median hits per peak")

plot_grid(p0, p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(1, 1.2, 1))

4.5 Effects of coverage/peaks on number of hits

Here we look at how total number of motif instances varies with total fragments and number of peaks:

# calculate total hist per cluster (incl. both pos/neg patterns)
total_hits_per_cluster <- hits_all %>% 
  group_by(Cluster) %>%
  summarize(total_hits = sum(n)) %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet"))

# this is for the 189 models which passed QC
total_hits_per_cluster$Cluster %>% unique %>% length

## [1] 189

summary(total_hits_per_cluster$total_hits)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   73500  457034  664835  839544 1110354 2593062

cor_frags <- lm(total_hits_per_cluster$total_hits ~ total_hits_per_cluster$total_frags)
summary(cor_frags)

## 
## Call:
## lm(formula = total_hits_per_cluster$total_hits ~ total_hits_per_cluster$total_frags)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1266989  -240087   -88317   214846  1610838 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        5.052e+05  3.600e+04   14.03   <2e-16 ***
## total_hits_per_cluster$total_frags 8.947e-03  6.326e-04   14.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 373300 on 187 degrees of freedom
## Multiple R-squared:  0.5169, Adjusted R-squared:  0.5143 
## F-statistic:   200 on 1 and 187 DF,  p-value: < 2.2e-16

cor_peaks <- lm(total_hits_per_cluster$total_hits ~ total_hits_per_cluster$n_peaks)
summary(cor_peaks)

## 
## Call:
## lm(formula = total_hits_per_cluster$total_hits ~ total_hits_per_cluster$n_peaks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -555859  -99658    6559  109954  652260 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -1.796e+05  3.305e+04  -5.435 1.69e-07 ***
## total_hits_per_cluster$n_peaks  8.802e+00  2.567e-01  34.294  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 198900 on 187 degrees of freedom
## Multiple R-squared:  0.8628, Adjusted R-squared:  0.8621 
## F-statistic:  1176 on 1 and 187 DF,  p-value: < 2.2e-16

p1 <- total_hits_per_cluster %>% 
  ggplot(aes(x = log10(total_frags), y = total_hits)) +
  geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = label_number(scale = 1/1e6)) +
  annotate("text", label = glue("cor = {round(sqrt(summary(cor_frags)$r.squared), 3)}, \nR^2 = {round(summary(cor_frags)$r.squared, 3)}"),
           x = 7, y = 2e6, size = 5) +
  xlab("log10(Total fragments)") + ylab("Total hits (x 1M)") +
  square() +
  ggtitle("# motif instances vs. \n# fragments") +
  no_legend()

p2 <- total_hits_per_cluster %>% 
  ggplot(aes(x = n_peaks, y = total_hits)) +
  geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = label_number(scale = 1/1e6)) +
  scale_x_continuous(labels = scales::comma) +
  annotate("text", label = glue("cor = {round(sqrt(summary(cor_peaks)$r.squared), 3)}, \nR^2 = {round(summary(cor_peaks)$r.squared, 3)}"),
           x = 7e4, y = 2e6, size = 5) +
  xlab("Total peaks") + ylab("Total hits (x 1M)") +
  square() +
  ggtitle("# motif instances vs. \n# peaks") +
  no_legend()

# median number of instances for positive motifs per peaks
n_hits_per_peaks <- hits_per_peak %>% 
  left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet"))

cor_frags <- lm(n_hits_per_peaks$median_pos_patterns ~ n_hits_per_peaks$total_frags)
summary(cor_frags)

## 
## Call:
## lm(formula = n_hits_per_peaks$median_pos_patterns ~ n_hits_per_peaks$total_frags)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0016 -0.9635 -0.0603  0.7646  3.3597 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  3.942e+00  9.692e-02   40.67  < 2e-16 ***
## n_hits_per_peaks$total_frags 8.634e-09  1.703e-09    5.07 9.52e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.005 on 187 degrees of freedom
## Multiple R-squared:  0.1209, Adjusted R-squared:  0.1162 
## F-statistic: 25.71 on 1 and 187 DF,  p-value: 9.516e-07

cor_peaks <- lm(n_hits_per_peaks$median_pos_pattern ~ n_hits_per_peaks$n_peaks)
summary(cor_peaks)

## 
## Call:
## lm(formula = n_hits_per_peaks$median_pos_pattern ~ n_hits_per_peaks$n_peaks)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.73318 -0.57254 -0.00856  0.49256  2.90880 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.991e+00  1.449e-01  20.645   <2e-16 ***
## n_hits_per_peaks$n_peaks 1.100e-05  1.125e-06   9.779   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8717 on 187 degrees of freedom
## Multiple R-squared:  0.3383, Adjusted R-squared:  0.3348 
## F-statistic: 95.63 on 1 and 187 DF,  p-value: < 2.2e-16

p3 <- n_hits_per_peaks %>% 
  ggplot(aes(x = log10(total_frags), y = median_pos_patterns)) +
  geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) +
  no_legend() +
  square() +
  annotate("text", label = glue("cor = {round(sqrt(summary(cor_frags)$r.squared), 3)}, \nR^2 = {round(summary(cor_frags)$r.squared, 3)}"),
           x = 7.5, y = 7, size = 5) +
  xlab("log10(Total fragments)") + ylab("median # positive hits per peak") +
  ggtitle("median positive hits per peak vs. \n# fragments")

p4 <- n_hits_per_peaks %>% 
  ggplot(aes(x = n_peaks, y = median_pos_patterns)) +
  geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) + 
  scale_x_continuous(labels = scales::comma) +
  no_legend() +
  square() +
  annotate("text", label = glue("cor = {round(sqrt(summary(cor_peaks)$r.squared), 3)}, \nR^2 = {round(summary(cor_peaks)$r.squared, 3)}"),
           x = 70000, y = 7, size = 5) +
  xlab("total number of peaks") + ylab("median # positive hits per peak") +
  ggtitle("median positive hits per peak vs. \n# peaks")

# calculate quartiles
n_hits_per_peaks$n_peak_quartile <- ntile(n_hits_per_peaks$n_peaks, n=4)
table(n_hits_per_peaks$n_peak_quartile)/sum(table(n_hits_per_peaks$n_peak_quartile))

## 
##         1         2         3         4 
## 0.2539683 0.2486772 0.2486772 0.2486772

n_hits_per_peaks$total_frags_quartile <- ntile(n_hits_per_peaks$total_frags, n=4)
table(n_hits_per_peaks$total_frags_quartile)/sum(table(n_hits_per_peaks$total_frags_quartile))

## 
##         1         2         3         4 
## 0.2539683 0.2486772 0.2486772 0.2486772

p5 <- n_hits_per_peaks %>% 
  ggplot(aes(x = total_frags_quartile, y = median_pos_patterns, group = total_frags_quartile)) +
  geom_boxplot(fill = "gray90", outliers = FALSE) +
  ggbeeswarm::geom_quasirandom(
    aes(fill = organ), shape = 21, color = "white", size = 4, alpha = 0.6, width = 0.2) +
  scale_fill_manual(values = cmap_organ) +
  no_legend() +
  ylab("Median # positive hits per peak") +
  xlab("Quantile (total number of fragments)") +
  ggtitle("median positive hits per peak vs. \n# fragments")

p6 <- n_hits_per_peaks %>% 
  ggplot(aes(x = n_peak_quartile, y = median_pos_patterns, group = n_peak_quartile)) +
  geom_boxplot(fill = "gray90", outliers = FALSE) +
  ggbeeswarm::geom_quasirandom(
    aes(fill = organ), shape = 21, color = "white", size = 4, alpha = 0.6, width = 0.2) +
  scale_fill_manual(values = cmap_organ) +
  no_legend() +
  ylab("Median # positive hits per peak") +
  xlab("Quantile (total number of peaks)") +
  ggtitle("median positive hits per peak vs. \n# peaks")

plot_grid(p1, p3, p5, p2, p4, p6, align = "hv", axis = "rltb")

5 Genomic localization

5.1 Nucleosome positioning

First we need to aggregate the counts per bin over motif variants and cell types, per category:

# total counts
motifs_keep <- hit_dyad_dist %>% 
  filter(category %in% c("base", "base_with_flank")) %>% 
  group_by(annotation_broad) %>% 
  summarize(sum_all_counts = sum(n)) %>%
  arrange(sum_all_counts) %>% 
  filter(sum_all_counts > 50000) %>%
  pull(annotation_broad)

motifs_keep %>% length

## [1] 40

medians_dist_dyad <- hit_dyad_dist %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  group_by(annotation_broad, bin_dist) %>% 
  summarize(sum_counts = sum(n)) %>% 
  # we need to actually calculate the median given the binned counts
  slice_max(order_by = sum_counts, n = 1) %>% 
  arrange(bin_dist)

## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

# z-score
hit_dyad_dist_summed <- hit_dyad_dist %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  group_by(annotation_broad, bin_dist) %>% 
  summarize(sum_counts = sum(n)) %>% 
  # zscore counts per motif (row-wise)
  mutate(zscore = scale(sum_counts)) %>% 
  mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)),
         bin_dist = as.numeric(bin_dist)) %>% 
  ungroup()

## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

p1 <- hit_dyad_dist_summed %>%
  ggplot(aes(x = bin_dist, y = annotation_broad)) +
  geom_tile(aes(fill = zscore)) +
  scale_fill_gradientn(colours = rdbu2,
                       # set midpoint to 0
                       # https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
                       rescaler = ~ scales::rescale_mid(.x, mid = 0), limits = c(-3, 4)) +
  geom_vline(xintercept = 75, linewidth = 1) +
  scale_x_continuous(breaks = seq(10, 250, by = 10)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_text(size = 5),
        axis.text.y = element_text(size = 9)) +
  xlab("binned distance from nucleosome dyad (bp)") +
  ylab("motif") +
  ggtitle("Motif instances distances to nucleosome dyad \n(aggregated within categories, across cell types)")

We can plot the heatmap w/ the actual counts on top so that we can manually inspect:

hit_dyad_dist_summed %>%
  ggplot(aes(x = bin_dist, y = annotation_broad)) +
  geom_tile(aes(fill = zscore)) +
  geom_text(aes(label = round(sum_counts, 2)), color = "black") +
  scale_fill_gradientn(colours = rdbu2,
                       # set midpoint to 0
                       # https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
                       rescaler = ~ scales::rescale_mid(.x, mid = 0)) +
  geom_vline(xintercept = 75, linewidth = 1) +
  scale_x_continuous(breaks = seq(10, 250, by = 10)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_text(size = 5),
        axis.text.y = element_text(size = 9)) +
  xlab("binned distance from nucleosome dyad (bp)") +
  ylab("motif") +
  ggtitle("Motif instances distances to nucleosome dyad \n(aggregated within categories, across cell types)")

5.2 Distance to summits

hit_summit_dist_summed <- hit_summit_dist %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  group_by(annotation_broad, bin_dist) %>% 
  summarize(sum_counts = sum(n)) %>% 
  filter(bin_dist <= 250) %>% 
  # zscore counts per motif (row-wise)
  mutate(zscore = scale(sum_counts)) %>% 
  mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)),
         bin_dist = as.numeric(bin_dist)) %>% 
  ungroup()

## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

hit_summit_dist_summed %>%
  ggplot(aes(x = bin_dist, y = annotation_broad)) +
  geom_tile(aes(fill = sum_counts)) +
  scale_fill_viridis_c("plasma") +
  scale_x_continuous(breaks = seq(0, 250, by = 10)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_text(size = 5),
        axis.text.y = element_text(size = 9)) +
  xlab("binned distance from peak summit (bp)") +
  ylab("motif") +
  ggtitle("Motif instances distances to peak summit \n(aggregated within categories, across cell types)")

p2 <- hit_summit_dist_summed %>%
  ggplot(aes(x = bin_dist, y = annotation_broad)) +
  geom_tile(aes(fill = zscore)) +
  scale_fill_gradientn(colours = rdbu2,
                       # set midpoint to 0
                       # https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
                       rescaler = ~ scales::rescale_mid(.x, mid = 0), limits = c(-3, 4)) +
  scale_x_continuous(breaks = seq(0, 250, by = 10)) +
  theme(panel.border = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_text(size = 5),
        axis.text.y = element_text(size = 9)) +
  xlab("binned distance from peak summit (bp)") +
  ylab("motif") +
  ggtitle("Motif instances distances to peak summit \n(aggregated within categories, across cell types)")

hit_summit_dist_summed <- hit_summit_dist %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  group_by(annotation_broad, bin_dist, pattern_class) %>% 
  summarize(sum_counts = sum(n))

## `summarise()` has grouped output by 'annotation_broad', 'bin_dist'. You can
## override using the `.groups` argument.

hit_summit_dist_summed %>%
  filter(annotation_broad != "unresolved") %>% 
  group_by(annotation_broad) %>%
  mutate(cum_sum_counts = cumsum(sum_counts)) %>%
  mutate(pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>% 
  ggplot(aes(x = bin_dist, y = log10(cum_sum_counts), group = annotation_broad)) +
  # geom_point(aes(fill = pattern_class), alpha = 0.4, color = "white", shape = 21) +
  geom_line(aes(color = pattern_class), alpha = 0.4) + 
  facet_wrap(~ pattern_class, ncol = 1) +
  scale_color_manual(values = c("pos_patterns" = "#BD202E", "neg_patterns" = "#1D75BC")) +
  scale_fill_manual(values = c("pos_patterns" = "#BD202E", "neg_patterns" = "#1D75BC")) +
  theme(panel.grid.minor.x = element_line(color = "gray90"),
        panel.grid.major.x = element_line(color = "gray90"),
        strip.background.x = element_blank()) +
  no_legend()

hit_summit_dist_coarse <- hit_summit_dist %>% 
  dplyr::select(Cluster, motif_name, annotation_broad, category, pattern_class, bin_dist, n_per_cluster_per_bin = n) %>% 
  filter(category %in% c("base", "base_with_flank") &
           annotation_broad %in% motifs_keep) %>% 
  mutate(bin_dist_coarse = case_when(
    bin_dist <= 50 ~ "0-50 bp",
    bin_dist <= 100 ~ "51-100 bp",
    bin_dist <= 150 ~ "101-150 bp",
    bin_dist <= 200 ~ "151-200 bp",
    bin_dist <= 250 ~ "201-250 bp",
    TRUE ~ "> 251 bp"
  )) %>% 
  group_by(annotation_broad, pattern_class, bin_dist_coarse) %>% 
  summarize(n_per_bin = sum(n_per_cluster_per_bin)) %>% 
  mutate(n_total = sum(n_per_bin),
         prop_ber_bin = n_per_bin / n_total)

## `summarise()` has grouped output by 'annotation_broad', 'pattern_class'. You
## can override using the `.groups` argument.

cmap_bins <- RColorBrewer::brewer.pal(6, "BuGn")
names(cmap_bins) <- rev(c("0-50 bp", "51-100 bp", "101-150 bp", "151-200 bp", "201-250 bp", "> 251 bp"))

motif_order_dist <- hit_summit_dist_coarse %>% filter(bin_dist_coarse == "0-50 bp") %>% arrange(desc(prop_ber_bin)) %>% pull(annotation_broad) %>% unique()

hit_summit_dist_coarse %>% 
  mutate(bin_dist_coarse = factor(bin_dist_coarse, levels = names(cmap_bins)),
         annotation_broad = factor(annotation_broad, levels = motif_order_dist)) %>% 
  ggplot(aes(x = annotation_broad, y = prop_ber_bin)) +
  geom_bar(stat = "identity", aes(fill = bin_dist_coarse)) +
  scale_fill_manual(values = cmap_bins) +
  rotate_x()

Get proportions of motifs per bin:

hit_summit_dist_coarse %>% 
  mutate(bin_dist_coarse = factor(bin_dist_coarse, levels = rev(names(cmap_bins)))) %>% 
  group_by(bin_dist_coarse) %>% 
  summarize(n_per_bin_across_motifs = sum(n_per_bin)) %>% 
  mutate(n_total = sum(n_per_bin_across_motifs),
         prop = n_per_bin_across_motifs / n_total,
         cum_n = cumsum(n_per_bin_across_motifs),
         cum_prop = cum_n / n_total) %>% 
  arrange(bin_dist_coarse)

## # A tibble: 6 × 6
##   bin_dist_coarse n_per_bin_across_motifs   n_total   prop     cum_n cum_prop
##   <fct>                             <int>     <int>  <dbl>     <int>    <dbl>
## 1 0-50 bp                        43674790 120645979 0.362   43674790    0.362
## 2 51-100 bp                      21467522 120645979 0.178   65142312    0.540
## 3 101-150 bp                     13494671 120645979 0.112   78636983    0.652
## 4 151-200 bp                      9438266 120645979 0.0782  88075249    0.730
## 5 201-250 bp                      7483429 120645979 0.0620  95558678    0.792
## 6 > 251 bp                       25087301 120645979 0.208  120645979    1

5.3 Feature type

region_type_summed <- hits_all_anno %>% 
  filter(annotation_broad %in% motifs_keep) %>% 
  dplyr::select(motif_name, annotation_broad, Distal, Exonic, Intronic, Promoter) %>% 
  gather(region_type, count, 3:ncol(.)) %>% 
  group_by(annotation_broad, region_type) %>% 
  summarize(count_total = sum(count, na.rm = TRUE)) %>% 
  mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)))

## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

p3 <- region_type_summed %>% 
  ggplot(aes(x = annotation_broad, y = count_total)) +
  geom_col(aes(fill = region_type), position = "fill", alpha = 0.7) +
  scale_fill_manual(values = cmap_peaktype2) +
  coord_flip() +
  theme(legend.position = "bottom")

plot_grid(p1, p2 , p3, nrow = 1, align = "h", axis = "tb", rel_widths = c(2, 2, 1))

6 Summary plots of discovered motifs

Load known motifs from Vierstra input set:

vierstra_motifs <- read_cwm_meme(file.path(kundaje_dir, "refs/Vierstra_motifs/all.dbs.meme"))

We want to describe motifs in a given subset. This plots:

The de novo CWM (trimmed)
The PWM for the best matching known motif in the Vierstra input database
Total # of hits across cell types
Number of motif variants within a category
Breakdown of hits by compartment
Breakdown of hits by organ
Genomic annotations
Binned distances to nucleosomes

6.1 Base motifs, representative per broad anno

base_broad <- motifs_compiled_unique %>% 
  filter(category %in% c("base", "base_with_flank"))

length(base_broad$annotation_broad %>% unique())

## [1] 51

plot_motif_summary(base_broad,
                   hits_all_anno,
                   hit_dyad_dist,
                   order_by = "distToTSS",
                   plot_known_motifs = TRUE,
                   # manually indicate which motifs to reverse complement
                   # to match known motif orientation
                   to_revcomp = c("GATA", "CTCF", "RFX", "MEF2", "NFKB/RELA", "NR", "ONECUT",
                                  "ZNF143", "NR:HNF4", "REST", "THRA/B", "HD:SIX/ZNF", "ZEB/SNAI", "YY1/2rep"))

## @ plotting 51 motifs

## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

6.2 Homocomposites, representative motif per broad anno

homocomp_broad <- motifs_compiled_unique %>%
  filter(pattern_class == "pos_patterns" & category == "homocomposite")

length(unique(homocomp_broad$annotation_broad))

## [1] 22

plot_motif_summary(homocomp_broad,
                   hits_all_anno,
                   hit_dyad_dist,
                   plot_known_motifs = TRUE,
                   rel_widths = c(3.5, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
                   to_revcomp = c("ETS_ETS", "IRF/STAT_IRF/STAT", "BZIP_BZIP", "HD:PITX/OTX_HD:PITX/OTX"))

## @ plotting 22 motifs

## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

6.3 Heterocomposites, representative motif per broad anno

Just showing the top 30 here, because there are many.

heterocomp_broad <- motifs_compiled_unique %>% 
  filter(category == "heterocomposite" & !grepl("unresolved", annotation))

heterocomp_broad %>% 
  group_by(annotation_broad) %>% 
  count() %>% 
  arrange(desc(n))

## # A tibble: 95 × 2
## # Groups:   annotation_broad [95]
##    annotation_broad     n
##    <chr>            <int>
##  1 BHLH_HD             11
##  2 ETS_SP/KLF           9
##  3 ETS_NR               8
##  4 FOX_NKX              8
##  5 ETS_SOX              7
##  6 BHLH_BZIP            6
##  7 BHLH_MEF2            6
##  8 BHLH_NFI             6
##  9 ETS_NFI              6
## 10 GATA_MEF2            5
## # ℹ 85 more rows

plot_motif_summary(heterocomp_broad,
                   hits_all_anno,
                   hit_dyad_dist,
                   plot_known_motifs = FALSE,
                   # known_motifs = jolma_motifs,
                   # known_motif_col = "match0_jolma",
                   # known_motif_name = "match0_jolma",
                   rel_widths = c(3, 0.5, 1, 1, 1, 1, 1, 1.5, 1.5, 1),
                   top_n = 30)

## @ plotting 30 motifs

## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

6.4 Negative/reducing patterns, representative per broad anno

neg_patterns <- motifs_compiled_unique %>% 
  filter(pattern_class == "neg_patterns") %>% 
  filter(!(category == "unresolved"))

# we can make use of the plotting function by setting the broad annotation to 
# the granular one
plot_motif_summary(neg_patterns %>% mutate(annotation_broad = motif_name),
                   hits_all_anno %>% mutate(annotation_broad = motif_name),
                   hit_dyad_dist %>% mutate(annotation_broad = motif_name),
                   plot_known_motifs = TRUE,
                   rel_widths = c(4, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
                   to_revcomp = c("456|ZEB/SNAI", "455|YY1/2repressive"))

## @ plotting 11 motifs

## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

6.5 Unresolved motifs

unresolved <- motifs_compiled_unique %>% 
  filter(category == "unresolved") %>% 
  arrange(desc(total_hits))

plot_motif_summary(unresolved %>% mutate(annotation_broad = motif_name),
                   hits_all_anno %>% mutate(annotation_broad = motif_name), 
                   hit_dyad_dist %>% mutate(annotation_broad = motif_name),
                   order_by = "distToTSS",
                   plot_known_motifs = FALSE,
                   rel_widths = c(3.5, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
                   top_n = 30)

## @ plotting 30 motifs

## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.

7 Footprinting motif instances

For a few motifs, let’s compute the ATAC footprint, focusing on the the cluster with the most hits for each of the motifs of interest:

# for each motif of interest, what is the cluster w/ the most hits?
hits_all_anno %>%
  group_by(annotation_broad) %>%
  slice_max(order_by = n, n = 1) %>%
  filter(annotation_broad %in%
           c("CTCF", "GATA", "BZIP", "NFY", "NFYrep", "YY1/2", "YY1/2rep", "HIC", "ZEB/SNAI", "BCL11A/IRF/SPI", "BCL11Arep")) %>%
  dplyr::select(pattern_class, motif_name, Cluster, n)

## Adding missing grouping variables: `annotation_broad`

## # A tibble: 11 × 5
## # Groups:   annotation_broad [11]
##    annotation_broad pattern_class motif_name           Cluster         n
##    <chr>            <chr>         <chr>                <chr>       <int>
##  1 BCL11A/IRF/SPI   pos_patterns  214|ETS:SPI1/BCL11A  Liver_c7    12993
##  2 BCL11Arep        neg_patterns  4|BCL11A/Brepressive Skin_c2    291187
##  3 BZIP             pos_patterns  132|BZIP:FOSL/JUND#1 Adrenal_c0  52506
##  4 CTCF             pos_patterns  148|CTCF#1           Liver_c1    29365
##  5 GATA             pos_patterns  260|GATA#1           Heart_c2   164132
##  6 HIC              neg_patterns  333|HIC              Heart_c3   222937
##  7 NFY              pos_patterns  367|NFY              Brain_c9    32681
##  8 NFYrep           neg_patterns  372|NFYrepressive    Heart_c4    19995
##  9 YY1/2            pos_patterns  454|YY1/2            Lung_c1      5345
## 10 YY1/2rep         neg_patterns  455|YY1/2repressive  Liver_c1    55895
## 11 ZEB/SNAI         neg_patterns  456|ZEB/SNAI         Skin_c1    620645

7.1 Load BPCells object

global_bp_obj <- readRDS(here("output/05-misc/01/global_bp_obj.rds"))

head(global_bp_obj$cell_metadata)

##                                     cb Cluster Cluster_chrombpnet   organ
## 1 T318_b16_Adr_PCW21#CL131_E05+G07+B04    AG_0         Adrenal_c0 Adrenal
## 2 T318_b16_Adr_PCW21#CL132_E06+C05+G06    AG_0         Adrenal_c0 Adrenal
## 3 T318_b16_Adr_PCW21#CL131_E06+H08+A07    AG_0         Adrenal_c0 Adrenal
## 4 T318_b16_Adr_PCW21#CL131_E02+D10+G10    AG_0         Adrenal_c0 Adrenal
## 5 T318_b16_Adr_PCW21#CL131_E05+H06+A03    AG_0         Adrenal_c0 Adrenal
## 6 T318_b16_Adr_PCW21#CL131_E11+D02+G02    AG_0         Adrenal_c0 Adrenal
##   organ_code nFrags              L1_annot               L2_annot       L3_annot
## 1         AG  11499 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 2         AG   4449 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 3         AG  13058 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 4         AG  11339 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 5         AG   6374 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 6         AG  11742 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
##   cluster_id compartment archive_L1_clusterID archive_L1_clusterName
## 1          0         epi                    0                 AG_epi
## 2          0         epi                    0                 AG_epi
## 3          0         epi                    0                 AG_epi
## 4          0         epi                    0                 AG_epi
## 5          0         epi                    0                 AG_epi
## 6          0         epi                    0                 AG_epi
##   archive_L2_clusterID archive_L2_clusterName archive_L3_clusterName
## 1              AG_epi1         adrenal cortex       adrenal cortex 1
## 2              AG_epi1         adrenal cortex       adrenal cortex 1
## 3              AG_epi1         adrenal cortex       adrenal cortex 1
## 4              AG_epi1         adrenal cortex       adrenal cortex 1
## 5              AG_epi1         adrenal cortex       adrenal cortex 1
## 6              AG_epi1         adrenal cortex       adrenal cortex 1
##   archive_L3_clusterID
## 1  AG_adrenal cortex 1
## 2  AG_adrenal cortex 1
## 3  AG_adrenal cortex 1
## 4  AG_adrenal cortex 1
## 5  AG_adrenal cortex 1
## 6  AG_adrenal cortex 1

7.2 Helper function

The motif footprint calculation counts insertions (i.e. fragment starts, ends) that fall within each position in the window. Then the fragment count at each position is normalized by (divided by) the mean count in the outer 10% of the window, taken as a measure of the local background accessibility.

footprint_fn <- function(cluster,
                         motif_broad,
                         split_by_quantile = TRUE,
                         sample_instances = TRUE,
                         sample_n = 10000,
                         flank = 250) {
  
  motif_safe <- gsub("\\/", ".", motif_broad)
  q_string <- ifelse(split_by_quantile, "quantiles", "noquantiles")
  s_string <- ifelse(sample_instances, glue("sample{sample_n}"), "nosample")
  out_path <- glue("{out}/{cluster}__{motif_safe}.footprint.{q_string}.{s_string}.flank{flank}.Rds")
  
  if (file.exists(out_path)) {
    
    message("@ found previous output at ", basename(out_path))
    fp_out <- readRDS(out_path)
    return(fp_out)
    
  }
  
  message("@ loading hits for cluster ", cluster)
  # load hits for the cluster of interest
  motif_instances <- rtracklayer::import.bed(glue("{chrombpnet_dir}/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{cluster}/{finemo_param}/{cluster}__hits_unified.{finemo_param}.reconciled.bed.gz"),
                                             extraCols = c("pattern_class" = "character"))
  
  # map in the broad annotation
  motif_instances$annotation_broad <- plyr::mapvalues(motif_instances$name,
                                                      from = motifs_compiled_unique$motif_name,
                                                      to = motifs_compiled_unique$annotation_broad,
                                                      warn_missing = FALSE)
  
  instances_motif <- motif_instances[motif_instances$annotation_broad == motif_broad]
  
  # subset the global object
  sub_meta <- global_bp_obj$cell_metadata %>%
    dplyr::filter(Cluster_chrombpnet == cluster)
  
  frags <- global_bp_obj$frags %>% select_cells(sub_meta$cb)
  
  # if there are different variants of the motif, subset to the most common one -
  # motif instances may differ because they are identified with slightly different
  # input motifs, and then reconciled afterwards to a unique compendium motif
  message(glue("@ found {length(motif_instances)} motif instances"))
  n_variants  <- instances_motif$name %>% unique %>% length
  most_common <- instances_motif %>% width() %>% table() %>% which.max() %>% names()
  instances_subset <- instances_motif[width(instances_motif) == most_common]
  message(glue("@ motif instances remaining: N={length(instances_subset)}"))
  
  # split motifs by motif score quantile
  if (split_by_quantile) {
    
    message("@ splitting hits by quantile")
    quantiles <- quantile(instances_subset$score)
    instances_subset$quantile <- cut(instances_subset$score, breaks = quantiles,
                                     include.lowest = TRUE, labels = 1:4) %>% 
      as.character()
    
    print(unique(instances_subset$quantile))
    
  } else { instances_subset$quantile <- 1 }
  
  n_quantiles <- length(unique(instances_subset$quantile))
  
  # sample motif instances
  if (sample_instances) {
    
    instances_per_q_sampled <- map(1:n_quantiles, function(q) {
      
      instances_per_q <- instances_subset[instances_subset$quantile == q]
      
      if (length(instances_per_q) < sample_n) {
        message(glue("@@ quantile {q}: <{sample_n} motif instances"))
        return(instances_per_q)
      }
      else {
        message(glue("@@ quantile {q}: sampling {sample_n} motif instances"))
        instances_per_q %>% sample(sample_n)
      }
      
    })
    
  } else {
    
    instances_per_q_sampled <- split(instances_subset, instances_subset$quantile)
    
  }
  
  print(length(instances_per_q_sampled))
  
  n_instances <- map_dbl(instances_per_q_sampled, length) %>% sum
  
  # compute footprints
  message("@ computing footprints...")
  footprints_data <- imap_dfr(instances_per_q_sampled, function(i, q) {
    
    if (split_by_quantile) message(glue("@@ quantile {q}"))
    fp <- BPCells::plot_tf_footprint(
      fragments = frags,
      motif_positions = i,
      cell_groups = sub_meta$Cluster_chrombpnet,
      flank = flank,
      smooth = 2,
      return_data = TRUE
    ) %>% 
      mutate(quantile = q)
    
    return(fp)
    
  }
  )
  
  fp_out <- list(
    "cluster"     = cluster,
    "motif"       = motif_broad,
    "footprint"   = footprints_data,
    "width"       = as.numeric(most_common),
    "n_instances" = n_instances)
  
  message("@ saving results to ", basename(out_path))
  
  saveRDS(fp_out, file = out_path)
  
  return(fp_out)
  
  message("@ done.")
  
}

plot_footprint_fn <- function(footprints_out, central_window = 500) {
  
  n_quantiles <- length(unique(footprints_out$footprint$quantile))
  
  message("@ plotting...")
  footprints_out$footprint$quantile <- as.character(footprints_out$footprint$quantile)
  
  if (max(footprints_out$footprint$quantile) > 1) blues <- RColorBrewer::brewer.pal(8, "Blues") %>%
    tail(4)
  else blues <- "black"
  
  flank <- central_window / 2
  
  # plot footprints 
  p1 <- footprints_out$footprint %>% 
    filter(position > -flank & position < flank) %>% 
    ggplot(aes(position, smoothed_enrichment, color = quantile)) +
    # highlight the motif position within the window
    annotate("rect",
             xmin = - footprints_out$width / 2, xmax = footprints_out$width / 2,
             ymin = -Inf, ymax = Inf,
             fill = "yellow", alpha = 0.3) +
    geom_line(alpha = 0.8) +
    scale_color_manual(values = blues) +
    labs(x = "Distance from motif center (bp)", y = "Enrichment") +
    ggtitle(glue("{footprints_out$motif} \n(N={footprints_out$n_instances} instances, {footprints_out$cluster})")) +
    square()
  
  if (n_quantiles == 1) p1 <- p1 + no_legend()
  
  p1
  
}

7.3 CTCF

We want to get footprints of a specific TF, so we filter to instances of e.g. CTCF here, which is a positive control that we know should footprint well in a high-quality ATACseq dataset.

fp_ctcf_out <- footprint_fn(cluster = "Liver_c1",
                            motif_broad = "CTCF",
                            split_by_quantile = TRUE,
                            sample_instances = FALSE)

## @ found previous output at Liver_c1__CTCF.footprint.quantiles.nosample.flank250.Rds

plot_footprint_fn(fp_ctcf_out)

## @ plotting...

# no splitting
fp_ctcf_out2 <- footprint_fn(cluster = "Liver_c1",
                             motif_broad = "CTCF",
                             split_by_quantile = FALSE,
                             sample_instances = FALSE)

## @ found previous output at Liver_c1__CTCF.footprint.noquantiles.nosample.flank250.Rds

plot_footprint_fn(fp_ctcf_out2)

## @ plotting...

7.4 NFY

fp_nfy_out1 <- footprint_fn(cluster = "Brain_c9",
                            motif_broad = "NFY",
                            split_by_quantile = TRUE,
                            sample_instances = FALSE)

## @ found previous output at Brain_c9__NFY.footprint.quantiles.nosample.flank250.Rds

plot_footprint_fn(fp_nfy_out1)

## @ plotting...

fp_nfy_out <- footprint_fn(
  cluster = "Brain_c9",
  motif_broad = "NFY",
  split_by_quantile = FALSE,
  sample_instances = FALSE
)

## @ found previous output at Brain_c9__NFY.footprint.noquantiles.nosample.flank250.Rds

plot_footprint_fn(fp_nfy_out)

## @ plotting...

7.5 NFY repressive

fp_nfyrep_out <- footprint_fn(
  cluster = "Heart_c4",
  motif_broad = "NFYrep",
  split_by_quantile = FALSE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Heart_c4

## @ found 2200593 motif instances

## @ motif instances remaining: N=19995

## [1] 1

## @ computing footprints...

## @ saving results to Heart_c4__NFYrep.footprint.noquantiles.nosample.flank500.Rds

plot_footprint_fn(fp_nfyrep_out)

## @ plotting...

fp_nfyrep_out2 <- footprint_fn(
  cluster = "Heart_c4",
  motif_broad = "NFYrep",
  split_by_quantile = TRUE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Heart_c4

## @ found 2200593 motif instances

## @ motif instances remaining: N=19995

## @ splitting hits by quantile

## [1] "3" "4" "2" "1"
## [1] 4

## @ computing footprints...

## @@ quantile 1

## @@ quantile 2

## @@ quantile 3

## @@ quantile 4

## @ saving results to Heart_c4__NFYrep.footprint.quantiles.nosample.flank500.Rds

plot_footprint_fn(fp_nfyrep_out2)

## @ plotting...

7.6 YY1/2

fp_yy1_out <- footprint_fn(
  cluster = "Lung_c1",
  motif_broad = "YY1/2",
  split_by_quantile = FALSE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Lung_c1

## @ found 2291775 motif instances

## @ motif instances remaining: N=2857

## [1] 1

## @ computing footprints...

## @ saving results to Lung_c1__YY1.2.footprint.noquantiles.nosample.flank500.Rds

plot_footprint_fn(fp_yy1_out)

## @ plotting...

fp_yy1_out2 <- footprint_fn(
  cluster = "Lung_c1",
  motif_broad = "YY1/2",
  split_by_quantile = TRUE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Lung_c1

## @ found 2291775 motif instances

## @ motif instances remaining: N=2857

## @ splitting hits by quantile

## [1] "1" "2" "3" "4"
## [1] 4

## @ computing footprints...

## @@ quantile 1

## @@ quantile 2

## @@ quantile 3

## @@ quantile 4

## @ saving results to Lung_c1__YY1.2.footprint.quantiles.nosample.flank500.Rds

plot_footprint_fn(fp_yy1_out2)

## @ plotting...

7.7 YY1/2 repressive

fp_yy1rep_out <- footprint_fn(
  cluster = "Liver_c1",
  motif_broad = "YY1/2rep",
  split_by_quantile = FALSE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Liver_c1

## @ found 1275637 motif instances

## @ motif instances remaining: N=55895

## [1] 1

## @ computing footprints...

## @ saving results to Liver_c1__YY1.2rep.footprint.noquantiles.nosample.flank500.Rds

plot_footprint_fn(fp_yy1rep_out)

## @ plotting...

plot_footprint_fn(fp_yy1rep_out)

## @ plotting...

fp_yy1rep_out2 <- footprint_fn(
  cluster = "Liver_c1",
  motif_broad = "YY1/2rep",
  split_by_quantile = TRUE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Liver_c1

## @ found 1275637 motif instances

## @ motif instances remaining: N=55895

## @ splitting hits by quantile

## [1] "4" "1" "2" "3"
## [1] 4

## @ computing footprints...

## @@ quantile 1

## @@ quantile 2

## @@ quantile 3

## @@ quantile 4

## @ saving results to Liver_c1__YY1.2rep.footprint.quantiles.nosample.flank500.Rds

plot_footprint_fn(fp_yy1rep_out2)

## @ plotting...

7.8 ZEB/SNAI

fp_zeb_out <- footprint_fn(
  cluster = "Skin_c1",
  motif_broad = "ZEB/SNAI",
  split_by_quantile = FALSE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Skin_c1

## @ found 2593062 motif instances

## @ motif instances remaining: N=510572

## [1] 1

## @ computing footprints...

## @ saving results to Skin_c1__ZEB.SNAI.footprint.noquantiles.nosample.flank500.Rds

plot_footprint_fn(fp_zeb_out)

## @ plotting...

# split quantiles
fp_zeb_out2 <- footprint_fn(
  cluster = "Skin_c1",
  motif_broad = "ZEB/SNAI",
  split_by_quantile = TRUE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Skin_c1

## @ found 2593062 motif instances

## @ motif instances remaining: N=510572

## @ splitting hits by quantile

## [1] "4" "3" "2" "1"
## [1] 4

## @ computing footprints...

## @@ quantile 1

## @@ quantile 2

## @@ quantile 3

## @@ quantile 4

## @ saving results to Skin_c1__ZEB.SNAI.footprint.quantiles.nosample.flank500.Rds

plot_footprint_fn(fp_zeb_out2)

## @ plotting...

7.9 HIC

fp_hic_out <- footprint_fn(
  cluster = "Heart_c3",
  motif_broad = "HIC",
  split_by_quantile = FALSE,
  sample_instances = FALSE
)

## @ found previous output at Heart_c3__HIC.footprint.noquantiles.nosample.flank250.Rds

plot_footprint_fn(fp_hic_out)

## @ plotting...

fp_hic_out2 <- footprint_fn(
  cluster = "Heart_c3",
  motif_broad = "HIC",
  split_by_quantile = TRUE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Heart_c3

## @ found 1892655 motif instances

## @ motif instances remaining: N=222937

## @ splitting hits by quantile

## [1] "1" "2" "4" "3"
## [1] 4

## @ computing footprints...

## @@ quantile 1

## @@ quantile 2

## @@ quantile 3

## @@ quantile 4

## @ saving results to Heart_c3__HIC.footprint.quantiles.nosample.flank500.Rds

plot_footprint_fn(fp_hic_out2)

## @ plotting...

7.10 BCL11A

fp_bcl11_out <- footprint_fn(
  cluster = "Liver_c7",
  motif_broad = "BCL11A/IRF/SPI",
  split_by_quantile = FALSE,
  sample_instances = FALSE)

## @ found previous output at Liver_c7__BCL11A.IRF.SPI.footprint.noquantiles.nosample.flank250.Rds

plot_footprint_fn(fp_bcl11_out)

## @ plotting...

fp_bcl11_out2 <- footprint_fn(
  cluster = "Liver_c7",
  motif_broad = "BCL11A/IRF/SPI",
  split_by_quantile = TRUE,
  sample_instances = FALSE,
  flank = 500
)

## @ loading hits for cluster Liver_c7

## @ found 958766 motif instances

## @ motif instances remaining: N=10508

## @ splitting hits by quantile

## [1] "1" "3" "2" "4"
## [1] 4

## @ computing footprints...

## @@ quantile 1

## @@ quantile 2

## @@ quantile 3

## @@ quantile 4

## @ saving results to Liver_c7__BCL11A.IRF.SPI.footprint.quantiles.nosample.flank500.Rds

plot_footprint_fn(fp_bcl11_out2)

## @ plotting...

7.11 BCL11A repressive

fp_bcl11rep_out <- footprint_fn(
  cluster = "Skin_c2",
  motif_broad = "BCL11Arep",
  split_by_quantile = FALSE,
  sample_instances = FALSE)

## @ found previous output at Skin_c2__BCL11Arep.footprint.noquantiles.nosample.flank250.Rds

plot_footprint_fn(fp_bcl11rep_out)

## @ plotting...

fp_bcl11rep_out2 <- footprint_fn(
  cluster = "Skin_c2",
  motif_broad = "BCL11Arep",
  split_by_quantile = TRUE,
  sample_instances = FALSE)

## @ found previous output at Skin_c2__BCL11Arep.footprint.quantiles.nosample.flank250.Rds

plot_footprint_fn(fp_bcl11rep_out2)

## @ plotting...

8 Session info

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggthemes_4.2.4         RColorBrewer_1.1-3     ggrastr_1.0.1         
##  [4] magrittr_2.0.3         GenomicFeatures_1.46.5 AnnotationDbi_1.56.2  
##  [7] Biobase_2.54.0         GenomicRanges_1.46.1   GenomeInfoDb_1.30.1   
## [10] IRanges_2.28.0         S4Vectors_0.32.4       BiocGenerics_0.40.0   
## [13] BPCells_0.2.0          ggseqlogo_0.1          plotly_4.10.4         
## [16] cowplot_1.1.3          ggrepel_0.9.3          stringr_1.5.0         
## [19] purrr_1.0.2            readr_2.1.4            tidyr_1.3.0           
## [22] dplyr_1.1.3            ggplot2_3.5.1          scales_1.3.0          
## [25] glue_1.8.0             here_1.0.1            
## 
## loaded via a namespace (and not attached):
##  [1] ggbeeswarm_0.7.2            colorspace_2.1-0           
##  [3] rjson_0.2.21                rprojroot_2.0.3            
##  [5] XVector_0.34.0              rstudioapi_0.15.0          
##  [7] farver_2.1.1                bit64_4.0.5                
##  [9] fansi_1.0.4                 xml2_1.3.5                 
## [11] cachem_1.0.8                knitr_1.43                 
## [13] jsonlite_1.8.7              Rsamtools_2.10.0           
## [15] dbplyr_2.3.3                png_0.1-8                  
## [17] compiler_4.1.2              httr_1.4.7                 
## [19] Matrix_1.6-3                fastmap_1.1.1              
## [21] lazyeval_0.2.2              cli_3.6.1                  
## [23] htmltools_0.5.6             prettyunits_1.1.1          
## [25] tools_4.1.2                 gtable_0.3.4               
## [27] GenomeInfoDbData_1.2.7      rappdirs_0.3.3             
## [29] Rcpp_1.0.11                 jquerylib_0.1.4            
## [31] vctrs_0.6.3                 Biostrings_2.62.0          
## [33] rtracklayer_1.54.0          xfun_0.40                  
## [35] lifecycle_1.0.3             restfulr_0.0.15            
## [37] gtools_3.9.4                XML_3.99-0.14              
## [39] zlibbioc_1.40.0             MASS_7.3-60                
## [41] vroom_1.6.3                 hms_1.1.3                  
## [43] MatrixGenerics_1.6.0        parallel_4.1.2             
## [45] SummarizedExperiment_1.24.0 yaml_2.3.7                 
## [47] curl_5.0.2                  gridExtra_2.3              
## [49] memoise_2.0.1               sass_0.4.7                 
## [51] biomaRt_2.50.3              stringi_1.7.12             
## [53] RSQLite_2.3.1               highr_0.10                 
## [55] BiocIO_1.4.0                universalmotif_1.12.4      
## [57] ArchR_1.0.2                 filelock_1.0.2             
## [59] BiocParallel_1.28.3         rlang_1.1.1                
## [61] pkgconfig_2.0.3             matrixStats_1.0.0          
## [63] bitops_1.0-7                evaluate_0.21              
## [65] lattice_0.20-45             GenomicAlignments_1.30.0   
## [67] htmlwidgets_1.6.2           labeling_0.4.3             
## [69] bit_4.0.5                   tidyselect_1.2.0           
## [71] plyr_1.8.8                  R6_2.5.1                   
## [73] generics_0.1.3              DelayedArray_0.20.0        
## [75] DBI_1.1.3                   pillar_1.9.0               
## [77] withr_2.5.0                 KEGGREST_1.34.0            
## [79] RCurl_1.98-1.12             tibble_3.2.1               
## [81] crayon_1.5.2                utf8_1.2.3                 
## [83] BiocFileCache_2.2.1         tzdb_0.4.0                 
## [85] rmarkdown_2.24              viridis_0.6.4              
## [87] progress_1.2.2              data.table_1.14.8          
## [89] blob_1.2.4                  digest_0.6.33              
## [91] munsell_0.5.0               beeswarm_0.4.0             
## [93] viridisLite_0.4.2           vipor_0.4.5                
## [95] bslib_0.5.1

Exploration of de novo learned motifs

02 October, 2025