# **MODIFY THIS CHUNK**
library(here)
kundaje_dir <- trimws(readr::read_lines(here("code/AK_PROJ_DIR.txt")))
doc_id <- "01"
out <- here("output/03-chrombpnet/03-syntax/", doc_id); dir.create(out, recursive = TRUE)
figout <- here("figures/03-chrombpnet/03-syntax", doc_id, "/"); dir.create(figout, recursive = TRUE)
chrombpnet_dir <- here("output/03-chrombpnet")In this notebook, we do global investigations of the de novo motifs:
library(glue)
library(scales)
library(ggplot2)
library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(stringr)
library(ggrepel)
library(cowplot)
library(plotly)
library(ggseqlogo)
library(BPCells)
script_path <- here("code/utils/")
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))
source(file.path(script_path, "chrombpnet_utils.R"))
ggplot2::theme_set(theme_BOR())Cluster metadata:
cluster_meta <- read_csv(here("output/05-misc/03/TableS2_cluster_meta_qc.csv"))## Rows: 203 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Cluster, organ, organ_code, compartment, L1_annot, L2_annot, L3_an...
## dbl (8): cluster_id, dend_order, ncell, median_numi, median_ngene, median_n...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
chrombpnet_models_keep <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_models_keep2.tsv"),
col_names = c("Cluster", "Folds_keep", "Cluster_ID"))## Rows: 189 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Cluster, Folds_keep, Cluster_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cluster_order <- cluster_meta %>% arrange(organ, L1_annot) %>% pull(Cluster_ChromBPNet)
organs <- unique(cluster_meta$organ) %>% sort()
organs[organs == "StomachEsophagus"] <- "Stomach"
head(chrombpnet_models_keep)## # A tibble: 6 × 3
## Cluster Folds_keep Cluster_ID
## <chr> <chr> <chr>
## 1 Adrenal_c0 fold_0,fold_1,fold_2,fold_3,fold_4 AG_0
## 2 Adrenal_c1 fold_0,fold_1,fold_2,fold_3,fold_4 AG_1
## 3 Adrenal_c2 fold_0,fold_1,fold_2,fold_3,fold_4 AG_2
## 4 Adrenal_c3 fold_0,fold_1,fold_2,fold_3,fold_4 AG_3
## 5 Brain_c0 fold_0,fold_1,fold_2,fold_3,fold_4 BR_0
## 6 Brain_c1 fold_0,fold_1,fold_2,fold_3,fold_4 BR_1
Load metrics:
chrombpnet_metrics <- read_tsv(here("output/03-chrombpnet/01-models/qc/chrombpnet_metrics.tsv")) %>%
dplyr::select(Cluster_ChromBPNet, total_frags) %>%
distinct()## Rows: 1015 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): organ, Cluster_ChromBPNet, L1_annot, Model, Fold
## dbl (13): number_of_cells, total_frags, counts_metrics.peaks.spearmanr, coun...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Load number of peaks:
npeaks <- read_table(here("output/03-chrombpnet/01-models/qc/npeaks.tsv"),
col_names = c("n_peaks", "peak_file")) %>%
filter(peak_file != "total")##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## n_peaks = col_double(),
## peak_file = col_character()
## )
nrow(npeaks) == 203## [1] TRUE
npeaks <- npeaks %>%
rowwise() %>%
mutate(peak_file = basename(peak_file)) %>%
separate(peak_file, into = c("Cluster_ChromBPNet", "drop"), sep = "__") %>%
dplyr::select(-drop)
cluster_meta <- cluster_meta %>%
left_join(npeaks, by = "Cluster_ChromBPNet") %>%
left_join(chrombpnet_metrics, by = "Cluster_ChromBPNet")Load merged modisco reports:
modisco_merged <- read_tsv(here("output/03-chrombpnet/02-compendium/modisco_compiled_anno/modisco_merged_reports.tsv")) %>%
# get short cluster ID
left_join(cluster_meta %>% dplyr::select(Cluster, Cluster_ChromBPNet),
by = c("component_celltype" = "Cluster_ChromBPNet"))## Rows: 6362 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): merged_pattern, component_celltype, pattern_class, pattern, compone...
## dbl (1): n_seqlets
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(modisco_merged)## # A tibble: 6 × 7
## merged_pattern component_celltype pattern_class pattern n_seqlets
## <chr> <chr> <chr> <chr> <dbl>
## 1 neg.Average_12__merged_pat… Spleen_c5 neg_patterns patter… 88
## 2 neg.Average_12__merged_pat… Spleen_c0 neg_patterns patter… 461
## 3 neg.Average_12__merged_pat… Skin_c5 neg_patterns patter… 302
## 4 neg.Average_12__merged_pat… Heart_c13 neg_patterns patter… 128
## 5 neg.Average_12__merged_pat… Skin_c7 neg_patterns patter… 708
## 6 neg.Average_12__merged_pat… Heart_c5 neg_patterns patter… 47
## # ℹ 2 more variables: component_pattern <chr>, Cluster <chr>
Load annotated patterns:
# manually annotated motif patterns
motifs_compiled_all <- read_tsv(here("code/03-chrombpnet/02-compendium/04d-ChromBPNet_de_novo_motifs.tsv")) %>%
mutate(motif_name = paste0(idx_uniq, "|", annotation)) %>%
dplyr::relocate(motif_name, .after = pattern) %>%
dplyr::relocate(idx_uniq, .after = idx) %>%
# rename categories
mutate(category = ifelse(annotation == "exclude", "exclude", category),
category = case_match(category,
"composite_homo" ~ "homocomposite",
"composite_hetero" ~ "heterocomposite",
.default = category),
category = factor(category, levels = names(cmap_category)),
pattern_class = dplyr::recode(pattern_class, pos = "pos_patterns", neg = "neg_patterns"))## Rows: 834 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (21): pattern_class, pattern, component_organs, component_celltypes, que...
## dbl (14): idx, total_n_seqlets, n_component_patterns, n_component_celltypes,...
## lgl (6): modisco_cwm_fwd, modisco_cwm_rev, match0_logo, match0_logo_jolma, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
motifs_compiled <- motifs_compiled_all %>%
# filter out exclude motifs
filter(category != "exclude") %>%
# extract the name of the known motif
separate(match0, into = c("TF0", "motif0"), sep = ": ")
# for cases with multiple patterns merged into one motif, take the one with the most
# number of seqlets.
motifs_compiled_unique <- motifs_compiled %>%
group_by(idx_uniq) %>%
slice_max(n = 1, order_by = tibble(total_n_seqlets, n_component_celltypes),
with_ties = FALSE)
dim(motifs_compiled_unique)## [1] 508 43
# get positive and negative patterns
pos_patterns <- motifs_compiled_unique %>% filter(pattern_class == "pos_patterns") %>% pull(motif_name)
length(pos_patterns)## [1] 493
neg_patterns <- motifs_compiled_unique %>% filter(pattern_class == "neg_patterns") %>% pull(motif_name)
length(neg_patterns)## [1] 15
# initial breakdown of motif categories, prior manual collapsing
table(motifs_compiled_all$category)##
## base base_with_flank homocomposite heterocomposite unresolved
## 147 128 135 263 43
## repeat partial exclude
## 7 4 107
# breakdown after manual collapsing & removing low-quality motifs
table(motifs_compiled_unique$category)##
## base base_with_flank homocomposite heterocomposite unresolved
## 86 51 107 222 38
## repeat partial exclude
## 2 2 0
dim(motifs_compiled_unique)## [1] 508 43
Thus, in total we have:
NOTE: There are 509 unique motifs, and the
labels go from 0 to 508 (which spans 509 values). The “missing label” is
for idx_uniq 466, which corresponded to unresolved motifs
which were filtered out.
Load reconciled hit calls:
finemo_param <- "counts_v0.23_a0.8_all"
# load hit counts per motif per cell type
hits <- map(chrombpnet_models_keep$Cluster,
~ data.table::fread(here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/{finemo_param}/hits_per_motif.tsv")),
data.table = FALSE) %>%
dplyr::rename(n = count) %>%
tibble::add_column(Cluster = .x, .before = 1)) %>%
setNames(chrombpnet_models_keep$Cluster)
# sanity check
n_cols <- map_dbl(hits, ncol)
if (any(n_cols < 12)) {
missing_cols <- names(n_cols[n_cols < 12])
stop("@ Missing columns in ", glue_collapse(missing_cols, ", "))
}
saveRDS(hits, file = glue("{out}/hits.Rds"))
# load summary stats on hits per peak per cell type
hits_per_peak <- map_dfr(chrombpnet_models_keep$Cluster,
~ data.table::fread(here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/{finemo_param}/hits_per_peak.tsv")),
data.table = FALSE) %>%
tibble::add_column(Cluster = .x, .before = 1))Calculate total number of hits per pattern per cell type:
hits_all <- bind_rows(hits)
if (!all(motifs_compiled_unique$motif_name %in% hits_all$motif_name) | !all(hits_all$motif_name %in% motifs_compiled$motif_name)) {
stop("@ Motif names in annotation don't match motif names in hits.")
}
hits_all_totals <- hits_all %>%
group_by(motif_name) %>%
summarize(total_hits = sum(n))
# organ w/ most hits
hits_top_organ <- hits_all %>%
left_join(cluster_meta %>% dplyr::select(Cluster = Cluster_ChromBPNet, organ)) %>%
group_by(motif_name, organ) %>%
summarize(hits_per_organ = sum(n)) %>%
group_by(motif_name) %>%
slice_max(n = 1, order_by = hits_per_organ, with_ties = FALSE) %>%
dplyr::select(motif_name, top_organ = organ)## Joining with `by = join_by(Cluster)`
## `summarise()` has grouped output by 'motif_name'. You can override using the
## `.groups` argument.
dim(hits_all_totals)## [1] 508 2
dim(hits_top_organ)## [1] 508 2
motifs_compiled_unique <- hits_all_totals %>%
left_join(motifs_compiled_unique, by = "motif_name") %>%
left_join(hits_top_organ, by = "motif_name") %>%
mutate(motif_name_safe = gsub("\\/|\\|", ".", motif_name)) %>%
mutate(modisco_cwm_fwd = here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/trimmed_logos/{pattern_class}.{pattern}.cwm.fwd.png")),
modisco_cwm_rev = here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/trimmed_logos/{pattern_class}.{pattern}.cwm.rev.png")))
# also subset to positive and neg patterns
motifs_compiled_unique_pos <- motifs_compiled_unique %>%
filter(pattern_class == "pos_patterns")
motifs_compiled_unique_neg <- motifs_compiled_unique %>%
filter(pattern_class == "neg_patterns")
dim(motifs_compiled_unique_pos)## [1] 493 46
hits_all_anno <- hits_all %>%
left_join(cluster_meta %>% dplyr::rename(Cluster_short = Cluster), by = c("Cluster" = "Cluster_ChromBPNet")) %>%
left_join(motifs_compiled_unique %>% dplyr::select(motif_name, pattern, annotation_broad, category))## Joining with `by = join_by(motif_name)`
head(hits_all_anno)## Cluster motif_name pattern_class n Distal
## 1 Adrenal_c0 392|NR:ESRRA/NR5A pos_patterns 158975 52205
## 2 Adrenal_c0 456|ZEB/SNAI neg_patterns 122904 31275
## 3 Adrenal_c0 132|BZIP:FOSL/JUND#1 pos_patterns 52506 17353
## 4 Adrenal_c0 436|SP/KLF pos_patterns 45828 6164
## 5 Adrenal_c0 260|GATA#1 pos_patterns 33086 11270
## 6 Adrenal_c0 507|unresolved,repressive,YY1/2-like neg_patterns 32037 8234
## Exonic Intronic Promoter mean_distToTSS median_distToTSS
## 1 9379 84616 12775 20180.92 8859.0
## 2 8002 48500 35127 11567.57 2944.0
## 3 2527 28584 4042 20839.47 9456.0
## 4 2238 9426 28000 3203.65 140.0
## 5 1370 18616 1830 22932.57 10593.5
## 6 2288 12955 8560 12456.09 3584.0
## mean_distToPeakSummit median_distToPeakSummit Cluster_short organ
## 1 129.77 67 AG_0 Adrenal
## 2 171.96 128 AG_0 Adrenal
## 3 114.59 50 AG_0 Adrenal
## 4 88.78 52 AG_0 Adrenal
## 5 128.59 74 AG_0 Adrenal
## 6 150.93 114 AG_0 Adrenal
## organ_code cluster_id compartment L1_annot
## 1 AG 0 epi AG_0_adrenal cortex 1
## 2 AG 0 epi AG_0_adrenal cortex 1
## 3 AG 0 epi AG_0_adrenal cortex 1
## 4 AG 0 epi AG_0_adrenal cortex 1
## 5 AG 0 epi AG_0_adrenal cortex 1
## 6 AG 0 epi AG_0_adrenal cortex 1
## L2_annot L3_annot dend_order ncell median_numi
## 1 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 2 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 3 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 4 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 5 Adrenal adrenal cortex adrenal cortex 7 823 9498
## 6 Adrenal adrenal cortex adrenal cortex 7 823 9498
## median_ngene median_nfrags median_tss median_frip
## 1 3280 6784 9.832 0.2906086
## 2 3280 6784 9.832 0.2906086
## 3 3280 6784 9.832 0.2906086
## 4 3280 6784 9.832 0.2906086
## 5 3280 6784 9.832 0.2906086
## 6 3280 6784 9.832 0.2906086
## note
## 1 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 2 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 3 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 4 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 5 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 6 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## organ_color compartment_color n_peaks total_frags
## 1 #876941 #11A579 121634 7407892
## 2 #876941 #11A579 121634 7407892
## 3 #876941 #11A579 121634 7407892
## 4 #876941 #11A579 121634 7407892
## 5 #876941 #11A579 121634 7407892
## 6 #876941 #11A579 121634 7407892
## pattern annotation_broad category
## 1 pos.Average_256__merged_pattern_0 NR base
## 2 neg.Average_12__merged_pattern_0 ZEB/SNAI base
## 3 pos.Average_287__merged_pattern_0 BZIP base
## 4 pos.Average_212__merged_pattern_0 SP/KLF base
## 5 pos.Average_248__merged_pattern_0 GATA base
## 6 neg.Average_15__merged_pattern_1 unresolved unresolved
# breakdown of motif categories per pos patterns
table(motifs_compiled_unique_pos$category)##
## base base_with_flank homocomposite heterocomposite unresolved
## 81 51 102 221 34
## repeat partial exclude
## 2 2 0
# save as TSV for re-use
motifs_compiled_unique %>% write_tsv(glue("{out}/motifs_compiled_unique.tsv"))
# short version
motifs_compiled_unique %>%
arrange(idx_uniq) %>%
dplyr::select(motif_name, total_hits, idx, idx_uniq, pattern_class, pattern,
annotation, annotation_broad, category,
TF0, motif0, qval0, top_organ, component_organs, component_celltypes) %>%
write_tsv(glue("{out}/motifs_compiled_unique_short.tsv"))
hits_all_anno %>% write_tsv(glue("{out}/hits_per_cluster_annotated.tsv"))Prep the hit counts matrices across cell types:
dim(hits_all)## [1] 33054 12
mat_hits <- hits_all %>%
filter(pattern_class == "pos_patterns") %>%
left_join(motifs_compiled_unique_pos, by = c("motif_name", "pattern_class")) %>%
filter(category %in% c("base", "base_with_flank", "homocomposite", "heterocomposite")) %>%
dplyr::select(Cluster, motif_name, n) %>%
spread(motif_name, n, fill = 0) %>%
tibble::column_to_rownames(var = "Cluster") %T>%
write_tsv(glue("{out}/hit_matrix.pos_patterns_no_unresolved.tsv"))
dim(mat_hits)## [1] 189 455
hits_all_broad <- hits_all %>%
filter(pattern_class == "pos_patterns") %>%
left_join(motifs_compiled_unique_pos, by = "motif_name") %>%
filter(category %in% c("base", "base_with_flank", "homocomposite", "heterocomposite")) %>%
group_by(annotation_broad, Cluster) %>%
summarize(n = sum(n))## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
mat_hits_broad <- hits_all_broad %>%
spread(annotation_broad, n, fill = 0) %>%
tibble::column_to_rownames(var = "Cluster") %T>%
write_tsv(glue("{out}/hit_matrix.broad_pos_patterns_no_unresolved.tsv"))
dim(mat_hits_broad)## [1] 189 169
mat_hits_unresolved <- hits_all %>%
filter(pattern_class == "pos_patterns") %>%
left_join(motifs_compiled_unique_pos, by = "motif_name") %>%
filter(category == "unresolved") %>%
dplyr::select(motif_name, n, Cluster) %>%
spread(motif_name, n, fill = 0) %>%
as.data.frame() %>%
tibble::column_to_rownames(var = "Cluster")
dim(mat_hits_unresolved)## [1] 189 34
Load hit counts for binned distances to nucleosomes:
# load motif hit counts per binned distance to nucleosome dyads
hit_dyad_dist <- map_dfr(chrombpnet_models_keep$Cluster,
~ data.table::fread(
here(glue("output/03-chrombpnet/02-compendium/nucleoatac/{.x}/{.x}.motif_dist_to_dyad.tsv")),
data.table = FALSE) %>%
tibble::add_column(Cluster = .x, .before = 1))
hit_dyad_dist <- left_join(hit_dyad_dist, motifs_compiled_unique, by = "motif_name")Load hit counts for binned distances to peak summits:
# load motif hit counts per binned distance to nucleosome dyads
hit_summit_dist <- map_dfr(chrombpnet_models_keep$Cluster,
~ data.table::fread(
here(glue("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{.x}/counts_v0.23_a0.8_all/motif_dist_to_summit.tsv")),
data.table = FALSE) %>%
tibble::add_column(Cluster = .x, .before = 1))
hit_summit_dist <- left_join(hit_summit_dist, motifs_compiled_unique, by = "motif_name")We also count the number of unique peaks in each cluster that contain at least one motif instance, separately for positive and negative patterns:
cd ../../../output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks
for class in ( pos_patterns neg_patterns ); do
for i in *; do
n_peaks=`zcat ${i}/counts_v0.23_a0.8_all/hits_unique.reconciled.annotated.tsv.gz | grep ${class} | cut -f 14 | sort | uniq | wc -l`
echo -e "${i}\t${n_peaks}" >> .unique_peaks_cluster.${class}.txt
done
doneLoad the counts:
peaks_w_pos_hits <- read_tsv(here("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/.unique_peaks_cluster.pos_patterns.txt"),
col_names = c("Cluster", "N_peaks_w_hits")) %>%
mutate(pattern_class = "pos_patterns")## Rows: 189 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Cluster
## dbl (1): N_peaks_w_hits
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
peaks_w_neg_hits <- read_tsv(here("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/.unique_peaks_cluster.neg_patterns.txt"),
col_names = c("Cluster", "N_peaks_w_hits")) %>%
mutate(pattern_class = "neg_patterns")## Rows: 189 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Cluster
## dbl (1): N_peaks_w_hits
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
peaks_w_hits <- bind_rows(peaks_w_pos_hits,
peaks_w_neg_hits) %>%
left_join(cluster_meta %>% dplyr::select(organ, Cluster = Cluster_ChromBPNet, n_peaks), by = "Cluster") %>%
mutate(Prop_peaks_w_hits = N_peaks_w_hits / n_peaks)Load HOCOMOCO:
hocomoco <- read_tsv(here("output/02-global_analysis/04/hocomoco_unique.tsv"))## Rows: 949 Columns: 662
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (45): name, original_motif.name, original_motif.tf, original_motif.moti...
## dbl (617): original_motif.subtype_info.datatype_counts.P, original_motif.sub...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
unique_classes <- unique(hocomoco$masterlist_info.tfclass_class)
cmap_class <- c(ArchR::paletteDiscrete(unique_classes, set = "stallion"), "N/A" = "gray90")## Length of unique values greater than palette, interpolating..
Load CWMs for the compiled motifs:
# use custom helper
cwm_list <- read_cwm_meme(here(glue("output/03-chrombpnet/02-compendium/modisco_compiled/modisco_compiled.memedb.txt")))## Could not find strand info, assuming +.
length(cwm_list)## [1] 834
cwm_list <- cwm_list[base::intersect(names(cwm_list), motifs_compiled_unique$pattern)]
length(cwm_list)## [1] 508
names(cwm_list) <- plyr::mapvalues(names(cwm_list), from = motifs_compiled_unique$pattern, to = motifs_compiled_unique$motif_name)
# label matrix dimensions
cwm_list <- map(cwm_list, function(cwm) {
rownames(cwm) <- c("A", "C", "G", "T")
colnames(cwm) <- seq(1:30)
return(cwm)
})
print(length(cwm_list))## [1] 508
saveRDS(cwm_list, file = glue("{out}/denovo_cwm_list.Rds"))save(chrombpnet_models_keep, cluster_meta, motifs_compiled_unique, motifs_compiled_all,
cwm_list, hit_dyad_dist, hit_summit_dist, hits_all_anno, hits_per_peak,
file = glue("{out}/intermediate_obj.Rda"))To reload for later analysis:
load(glue("{out}/intermediate_obj.Rda"))
hits <- readRDS(glue("{out}/hits.Rds"))
hits_all <- bind_rows(hits)fs::dir_create(file.path(out, "trimmed_cwm_logos"))
all(fs::file_exists(motifs_compiled_unique$modisco_cwm_fwd))
all(fs::file_exists(motifs_compiled_unique$modisco_cwm_rev))
for (i in 1:nrow(motifs_compiled_unique)) {
new_name_fw <- paste0(motifs_compiled_unique$motif_name_safe[i], "__cwm.fwd.png")
fs::file_copy(motifs_compiled_unique$modisco_cwm_fwd[i], file.path(out, "trimmed_cwm_logos", new_name_fw), overwrite = TRUE)
new_name_rev <- paste0(motifs_compiled_unique$motif_name_safe[i], "__cwm.rev.png")
fs::file_copy(motifs_compiled_unique$modisco_cwm_rev[i], file.path(out, "trimmed_cwm_logos", new_name_rev), overwrite = TRUE)
}motifs_compiled_unique %>%
mutate(
class = case_when(
pattern_class == "pos_patterns" ~ "positive",
TRUE ~ "negative"
),
cwm_logo_fwd = paste0(motif_name_safe, "__cwm.fwd.png"),
cwm_logo_rev = paste0(motif_name_safe, "__cwm.rev.png"),
) %>%
dplyr::select(
idx_uniq,
motif_name,
motif_name_safe,
class,
total_instances_across_celltypes = total_hits,
cwm_logo_fwd,
cwm_logo_rev,
query_consensus,
annotation,
annotation_broad,
category,
best_match_TF = TF0,
best_match_motif = motif0,
best_match_TOMTOM_qval = qval0,
best_match_TFs_in_family = match0_TFs_in_family,
best_match_TFs_in_Vierstra_archetype = match0_TFs_in_archetype,
total_seqlets_across_celltypes = total_n_seqlets,
merged_pattern = pattern,
n_constituent_motifs = n_component_patterns,
n_constituent_celltypes = n_component_celltypes,
constituent_organs = component_organs,
constituent_celltypes = component_celltypes
) %>%
arrange(idx_uniq) %>%
write_csv(glue("{out}/TABLE_motif_compendium.csv"))What kinds of patterns did we detect?
First, let’s look at all 834 motifs from the motif clustering workflow.
p1 <- motifs_compiled_all %>%
group_by(category) %>%
count() %>%
# reverse levels since we're flipping coordinates
mutate(category = factor(category, levels = rev(names(cmap_category)))) %>%
ggplot(aes(x = category, y = n)) +
geom_col(aes(fill = category)) +
geom_text(aes(label = n), hjust = -0.5, fontface = "bold", size = 4) +
coord_flip() +
scale_fill_manual(values = cmap_category) +
ggtitle("De novo deep learning derived motifs \n(after MoDISco agggregation step)") +
no_legend() +
theme(panel.grid.major.x = element_line(color = "gray90"),
panel.grid.minor.x = element_line(color = "gray90")) +
ylim(c(0, 280))
p2 <- motifs_compiled_all %>%
ggplot(aes(x = "breakdown")) +
geom_bar(aes(fill = category), position = "fill") +
scale_fill_manual(values = cmap_category)
plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(0.65, 0.35))Next, let’s look at the unique motifs after QC and collapsing motifs:
# stacked barplot
p1 <- motifs_compiled_unique %>%
ggplot(aes(x = "breakdown")) +
geom_bar(aes(fill = category), position = "fill") +
scale_fill_manual(values = cmap_category) +
xlab(NULL) +
theme(legend.position = "bottom") +
no_legend()
# barplot of counts per category
p2 <- motifs_compiled_unique %>%
group_by(category) %>%
count() %>%
# reverse levels since we're flipping coordinates
mutate(category = factor(category, levels = rev(names(cmap_category)))) %>%
ggplot(aes(x = category, y = n)) +
geom_col(aes(fill = category)) +
geom_text(aes(label = n), hjust = -0.5, fontface = "bold", size = 4) +
coord_flip() +
scale_fill_manual(values = cmap_category) +
ggtitle("De novo deep learning derived motifs \n(after manual collapsing & removing low quality motifs)") +
no_legend() +
theme(panel.grid.major.x = element_line(color = "gray90"),
panel.grid.minor.x = element_line(color = "gray90")) +
ylim(c(0, 280))
plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(1, 4, 3.5))p1 <- motifs_compiled_unique %>%
group_by(pattern_class) %>%
count() %>%
ggplot(aes(x = "1", y = n)) +
geom_col(aes(fill = pattern_class), alpha = 0.4) +
coord_flip() +
scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
ggtitle("De novo deep learning derived motifs") +
no_legend() +
theme(panel.grid.major.x = element_line(color = "gray90"),
panel.grid.minor.x = element_line(color = "gray90"))
p2 <- hits_all %>%
left_join(cluster_meta %>% dplyr::select(Cluster = Cluster_ChromBPNet, organ)) %>%
group_by(organ, pattern_class) %>%
summarize(n_hits = sum(n)) %>%
mutate(organ = factor(organ, levels = rev(unique(.$organ)))) %>%
ggplot(aes(x = organ, y = n_hits)) +
geom_col(aes(fill = pattern_class), alpha = 0.4, position = "fill") +
coord_flip() +
scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
ggtitle("Breakdown of total hits") +
no_legend() +
theme(panel.grid.major.x = element_line(color = "gray90"),
panel.grid.minor.x = element_line(color = "gray90"))## Joining with `by = join_by(Cluster)`
## `summarise()` has grouped output by 'organ'. You can override using the
## `.groups` argument.
plot_grid(p1, p2, ncol = 1, align = "v", axis = "rl", rel_heights = c(1, 4))Distribution of TOMTOM q-values for similarity to known motifs, and total # hits per motif:
p1 <- motifs_compiled_unique %>%
mutate(similarity = qval0) %>%
ggplot(aes(x = category, y = -log10(qval0))) +
geom_boxplot(aes(fill = category), outliers = FALSE) +
geom_jitter(shape = 21, width = 0.2, size = 0.5) +
scale_fill_manual(values = cmap_category) +
geom_hline(yintercept = -log10(0.05)) +
no_legend() +
xlab(NULL) +
rotate_x()
p2 <- motifs_compiled_unique %>%
ggplot(aes(x = category, y = log10(total_hits))) +
geom_boxplot(aes(fill = category), outlier.colour = NA) +
geom_jitter(color = "black", width = 0.2, size = 0.5, shape = 21) +
scale_fill_manual(values = cmap_category) +
rotate_x() +
no_legend() +
# ggtitle("Total # of hits per unique motif \neach point is a motif (n=525)") +
coord_cartesian(ylim = c(1, 7))
plot_grid(p1, p2, nrow = 1, align = "h", axis = "tb")Total number of hits per cell type:
hits_all %>%
group_by(Cluster, pattern_class) %>%
summarize(total_hits = sum(n)) %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>%
arrange(organ, L1_annot) %>%
mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>%
ggplot(aes(x = Cluster, y = total_hits)) +
facet_wrap(~ pattern_class, ncol = 1) +
geom_bar(stat = "identity", aes(fill = organ)) +
scale_fill_manual(values = cmap_organ) +
rotate_x() +
ylab("Total number of hits") +
theme(
strip.background = element_blank(),
axis.text.x = element_text(size = 5),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray90")) +
scale_y_continuous(labels = scales::comma) +
ggtitle("Total hits per cell type")## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.
By class (positive/negative patterns):
hits_all %>%
left_join(motifs_compiled_unique %>% dplyr::select(-pattern_class),
by = c("motif_name")) %>%
group_by(Cluster, pattern_class) %>%
summarize(n_per_class = sum(n)) %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>%
arrange(organ, L1_annot) %>%
mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>%
ggplot(aes(x = Cluster, y = n_per_class)) +
geom_bar(stat = "identity", aes(fill = pattern_class), alpha = 0.4, position = "fill") +
scale_fill_manual(values = c("pos_patterns" = "darkgreen", "neg_patterns" = "red")) +
rotate_x() +
ylab("Total number of hits") +
theme(
axis.text.x = element_text(size = 5),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray90")) +
scale_y_continuous(labels = scales::comma) +
ggtitle("Total hits, by pattern class")## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.
Let’s plot total hits and mean hits per peak, grouped by organ:
peaks_w_hits %>%
group_by(pattern_class, organ) %>%
summarize(median = median(Prop_peaks_w_hits)) %>%
spread(pattern_class, median)## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.
## # A tibble: 12 × 3
## organ neg_patterns pos_patterns
## <chr> <dbl> <dbl>
## 1 Adrenal 0.609 0.907
## 2 Brain 0.803 0.897
## 3 Eye 0.808 0.880
## 4 Heart 0.804 0.917
## 5 Liver 0.551 0.919
## 6 Lung 0.814 0.921
## 7 Muscle 0.870 0.934
## 8 Skin 0.891 0.945
## 9 Spleen 0.794 0.922
## 10 StomachEsophagus 0.731 0.904
## 11 Thymus 0.712 0.919
## 12 Thyroid 0.707 0.870
p0 <- peaks_w_hits %>%
mutate(pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>%
ggplot(aes(x = organ, y = Prop_peaks_w_hits)) +
geom_jitter(stat = "identity", aes(fill = organ), shape = 21, color = "white", size = 2, alpha = 0.6, width = 0.25) +
scale_fill_manual(values = cmap_organ) +
rotate_x() +
facet_wrap(~ pattern_class, ncol = 1) +
ylab("Proportion") +
theme(
strip.background = element_blank(),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray90")) +
scale_y_continuous(labels = scales::comma) +
ggtitle("Peaks with hits") +
no_legend()
p1 <- hits_all %>%
group_by(Cluster, pattern_class) %>%
summarize(total_hits = sum(n)) %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>%
arrange(organ, L1_annot) %>%
mutate(Cluster = factor(Cluster, levels = unique(.$Cluster)),
pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>%
ggplot(aes(x = organ, y = total_hits)) +
geom_jitter(stat = "identity", aes(fill = organ), shape = 21, color = "white", size = 2, alpha = 0.6, width = 0.25) +
scale_fill_manual(values = cmap_organ) +
rotate_x() +
facet_wrap(~ pattern_class, ncol = 1) +
ylab("Total number of hits") +
theme(
strip.background = element_blank(),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray90")) +
scale_y_continuous(labels = scales::comma) +
ggtitle("Total hits") +
no_legend()## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.
p2 <- hits_per_peak %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet")) %>%
arrange(organ, L1_annot) %>%
mutate(Cluster = factor(Cluster, levels = unique(.$Cluster))) %>%
gather(pattern_class, median_n_hits, median_pos_patterns, median_neg_patterns) %>%
mutate(pattern_class = factor(pattern_class, levels = c("median_pos_patterns", "median_neg_patterns"))) %>%
ggplot(aes(x = organ, y = median_n_hits)) +
ggbeeswarm::geom_quasirandom(
aes(fill = organ), shape = 21, color = "white", size = 2, alpha = 0.6) + #, width = 0.25, height = 0) +
scale_fill_manual(values = cmap_organ) +
facet_wrap(~ pattern_class, ncol = 1) +
rotate_x() +
ylab("Median number of hits per peak") +
theme(
strip.background = element_blank(),
panel.grid.major.y = element_line(color = "gray90"),
panel.grid.minor.y = element_line(color = "gray90")) +
scale_y_continuous(breaks = seq(0, 10, 1), labels = seq(0, 10), limits = c(0, 9)) +
no_legend() +
ggtitle("Median hits per peak")
plot_grid(p0, p1, p2, nrow = 1, align = "h", axis = "tb", rel_widths = c(1, 1.2, 1))Here we look at how total number of motif instances varies with total fragments and number of peaks:
# calculate total hist per cluster (incl. both pos/neg patterns)
total_hits_per_cluster <- hits_all %>%
group_by(Cluster) %>%
summarize(total_hits = sum(n)) %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet"))
# this is for the 189 models which passed QC
total_hits_per_cluster$Cluster %>% unique %>% length## [1] 189
summary(total_hits_per_cluster$total_hits)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73500 457034 664835 839544 1110354 2593062
cor_frags <- lm(total_hits_per_cluster$total_hits ~ total_hits_per_cluster$total_frags)
summary(cor_frags)##
## Call:
## lm(formula = total_hits_per_cluster$total_hits ~ total_hits_per_cluster$total_frags)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1266989 -240087 -88317 214846 1610838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.052e+05 3.600e+04 14.03 <2e-16 ***
## total_hits_per_cluster$total_frags 8.947e-03 6.326e-04 14.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 373300 on 187 degrees of freedom
## Multiple R-squared: 0.5169, Adjusted R-squared: 0.5143
## F-statistic: 200 on 1 and 187 DF, p-value: < 2.2e-16
cor_peaks <- lm(total_hits_per_cluster$total_hits ~ total_hits_per_cluster$n_peaks)
summary(cor_peaks)##
## Call:
## lm(formula = total_hits_per_cluster$total_hits ~ total_hits_per_cluster$n_peaks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -555859 -99658 6559 109954 652260
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.796e+05 3.305e+04 -5.435 1.69e-07 ***
## total_hits_per_cluster$n_peaks 8.802e+00 2.567e-01 34.294 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 198900 on 187 degrees of freedom
## Multiple R-squared: 0.8628, Adjusted R-squared: 0.8621
## F-statistic: 1176 on 1 and 187 DF, p-value: < 2.2e-16
p1 <- total_hits_per_cluster %>%
ggplot(aes(x = log10(total_frags), y = total_hits)) +
geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = label_number(scale = 1/1e6)) +
annotate("text", label = glue("cor = {round(sqrt(summary(cor_frags)$r.squared), 3)}, \nR^2 = {round(summary(cor_frags)$r.squared, 3)}"),
x = 7, y = 2e6, size = 5) +
xlab("log10(Total fragments)") + ylab("Total hits (x 1M)") +
square() +
ggtitle("# motif instances vs. \n# fragments") +
no_legend()
p2 <- total_hits_per_cluster %>%
ggplot(aes(x = n_peaks, y = total_hits)) +
geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = label_number(scale = 1/1e6)) +
scale_x_continuous(labels = scales::comma) +
annotate("text", label = glue("cor = {round(sqrt(summary(cor_peaks)$r.squared), 3)}, \nR^2 = {round(summary(cor_peaks)$r.squared, 3)}"),
x = 7e4, y = 2e6, size = 5) +
xlab("Total peaks") + ylab("Total hits (x 1M)") +
square() +
ggtitle("# motif instances vs. \n# peaks") +
no_legend()
# median number of instances for positive motifs per peaks
n_hits_per_peaks <- hits_per_peak %>%
left_join(cluster_meta, by = c("Cluster" = "Cluster_ChromBPNet"))
cor_frags <- lm(n_hits_per_peaks$median_pos_patterns ~ n_hits_per_peaks$total_frags)
summary(cor_frags)##
## Call:
## lm(formula = n_hits_per_peaks$median_pos_patterns ~ n_hits_per_peaks$total_frags)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0016 -0.9635 -0.0603 0.7646 3.3597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.942e+00 9.692e-02 40.67 < 2e-16 ***
## n_hits_per_peaks$total_frags 8.634e-09 1.703e-09 5.07 9.52e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.005 on 187 degrees of freedom
## Multiple R-squared: 0.1209, Adjusted R-squared: 0.1162
## F-statistic: 25.71 on 1 and 187 DF, p-value: 9.516e-07
cor_peaks <- lm(n_hits_per_peaks$median_pos_pattern ~ n_hits_per_peaks$n_peaks)
summary(cor_peaks)##
## Call:
## lm(formula = n_hits_per_peaks$median_pos_pattern ~ n_hits_per_peaks$n_peaks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.73318 -0.57254 -0.00856 0.49256 2.90880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.991e+00 1.449e-01 20.645 <2e-16 ***
## n_hits_per_peaks$n_peaks 1.100e-05 1.125e-06 9.779 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8717 on 187 degrees of freedom
## Multiple R-squared: 0.3383, Adjusted R-squared: 0.3348
## F-statistic: 95.63 on 1 and 187 DF, p-value: < 2.2e-16
p3 <- n_hits_per_peaks %>%
ggplot(aes(x = log10(total_frags), y = median_pos_patterns)) +
geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
no_legend() +
square() +
annotate("text", label = glue("cor = {round(sqrt(summary(cor_frags)$r.squared), 3)}, \nR^2 = {round(summary(cor_frags)$r.squared, 3)}"),
x = 7.5, y = 7, size = 5) +
xlab("log10(Total fragments)") + ylab("median # positive hits per peak") +
ggtitle("median positive hits per peak vs. \n# fragments")
p4 <- n_hits_per_peaks %>%
ggplot(aes(x = n_peaks, y = median_pos_patterns)) +
geom_point(aes(fill = organ), shape = 21, size = 3, alpha = 0.7, color = "white") +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(labels = scales::comma) +
no_legend() +
square() +
annotate("text", label = glue("cor = {round(sqrt(summary(cor_peaks)$r.squared), 3)}, \nR^2 = {round(summary(cor_peaks)$r.squared, 3)}"),
x = 70000, y = 7, size = 5) +
xlab("total number of peaks") + ylab("median # positive hits per peak") +
ggtitle("median positive hits per peak vs. \n# peaks")
# calculate quartiles
n_hits_per_peaks$n_peak_quartile <- ntile(n_hits_per_peaks$n_peaks, n=4)
table(n_hits_per_peaks$n_peak_quartile)/sum(table(n_hits_per_peaks$n_peak_quartile))##
## 1 2 3 4
## 0.2539683 0.2486772 0.2486772 0.2486772
n_hits_per_peaks$total_frags_quartile <- ntile(n_hits_per_peaks$total_frags, n=4)
table(n_hits_per_peaks$total_frags_quartile)/sum(table(n_hits_per_peaks$total_frags_quartile))##
## 1 2 3 4
## 0.2539683 0.2486772 0.2486772 0.2486772
p5 <- n_hits_per_peaks %>%
ggplot(aes(x = total_frags_quartile, y = median_pos_patterns, group = total_frags_quartile)) +
geom_boxplot(fill = "gray90", outliers = FALSE) +
ggbeeswarm::geom_quasirandom(
aes(fill = organ), shape = 21, color = "white", size = 4, alpha = 0.6, width = 0.2) +
scale_fill_manual(values = cmap_organ) +
no_legend() +
ylab("Median # positive hits per peak") +
xlab("Quantile (total number of fragments)") +
ggtitle("median positive hits per peak vs. \n# fragments")
p6 <- n_hits_per_peaks %>%
ggplot(aes(x = n_peak_quartile, y = median_pos_patterns, group = n_peak_quartile)) +
geom_boxplot(fill = "gray90", outliers = FALSE) +
ggbeeswarm::geom_quasirandom(
aes(fill = organ), shape = 21, color = "white", size = 4, alpha = 0.6, width = 0.2) +
scale_fill_manual(values = cmap_organ) +
no_legend() +
ylab("Median # positive hits per peak") +
xlab("Quantile (total number of peaks)") +
ggtitle("median positive hits per peak vs. \n# peaks")
plot_grid(p1, p3, p5, p2, p4, p6, align = "hv", axis = "rltb")First we need to aggregate the counts per bin over motif variants and cell types, per category:
# total counts
motifs_keep <- hit_dyad_dist %>%
filter(category %in% c("base", "base_with_flank")) %>%
group_by(annotation_broad) %>%
summarize(sum_all_counts = sum(n)) %>%
arrange(sum_all_counts) %>%
filter(sum_all_counts > 50000) %>%
pull(annotation_broad)
motifs_keep %>% length## [1] 40
medians_dist_dyad <- hit_dyad_dist %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
group_by(annotation_broad, bin_dist) %>%
summarize(sum_counts = sum(n)) %>%
# we need to actually calculate the median given the binned counts
slice_max(order_by = sum_counts, n = 1) %>%
arrange(bin_dist)## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
# z-score
hit_dyad_dist_summed <- hit_dyad_dist %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
group_by(annotation_broad, bin_dist) %>%
summarize(sum_counts = sum(n)) %>%
# zscore counts per motif (row-wise)
mutate(zscore = scale(sum_counts)) %>%
mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)),
bin_dist = as.numeric(bin_dist)) %>%
ungroup()## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
p1 <- hit_dyad_dist_summed %>%
ggplot(aes(x = bin_dist, y = annotation_broad)) +
geom_tile(aes(fill = zscore)) +
scale_fill_gradientn(colours = rdbu2,
# set midpoint to 0
# https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
rescaler = ~ scales::rescale_mid(.x, mid = 0), limits = c(-3, 4)) +
geom_vline(xintercept = 75, linewidth = 1) +
scale_x_continuous(breaks = seq(10, 250, by = 10)) +
theme(panel.border = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 9)) +
xlab("binned distance from nucleosome dyad (bp)") +
ylab("motif") +
ggtitle("Motif instances distances to nucleosome dyad \n(aggregated within categories, across cell types)")We can plot the heatmap w/ the actual counts on top so that we can manually inspect:
hit_dyad_dist_summed %>%
ggplot(aes(x = bin_dist, y = annotation_broad)) +
geom_tile(aes(fill = zscore)) +
geom_text(aes(label = round(sum_counts, 2)), color = "black") +
scale_fill_gradientn(colours = rdbu2,
# set midpoint to 0
# https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
rescaler = ~ scales::rescale_mid(.x, mid = 0)) +
geom_vline(xintercept = 75, linewidth = 1) +
scale_x_continuous(breaks = seq(10, 250, by = 10)) +
theme(panel.border = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 9)) +
xlab("binned distance from nucleosome dyad (bp)") +
ylab("motif") +
ggtitle("Motif instances distances to nucleosome dyad \n(aggregated within categories, across cell types)")hit_summit_dist_summed <- hit_summit_dist %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
group_by(annotation_broad, bin_dist) %>%
summarize(sum_counts = sum(n)) %>%
filter(bin_dist <= 250) %>%
# zscore counts per motif (row-wise)
mutate(zscore = scale(sum_counts)) %>%
mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)),
bin_dist = as.numeric(bin_dist)) %>%
ungroup()## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
hit_summit_dist_summed %>%
ggplot(aes(x = bin_dist, y = annotation_broad)) +
geom_tile(aes(fill = sum_counts)) +
scale_fill_viridis_c("plasma") +
scale_x_continuous(breaks = seq(0, 250, by = 10)) +
theme(panel.border = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 9)) +
xlab("binned distance from peak summit (bp)") +
ylab("motif") +
ggtitle("Motif instances distances to peak summit \n(aggregated within categories, across cell types)")p2 <- hit_summit_dist_summed %>%
ggplot(aes(x = bin_dist, y = annotation_broad)) +
geom_tile(aes(fill = zscore)) +
scale_fill_gradientn(colours = rdbu2,
# set midpoint to 0
# https://github.com/tidyverse/ggplot2/issues/3738#issuecomment-1336166757
rescaler = ~ scales::rescale_mid(.x, mid = 0), limits = c(-3, 4)) +
scale_x_continuous(breaks = seq(0, 250, by = 10)) +
theme(panel.border = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_text(size = 5),
axis.text.y = element_text(size = 9)) +
xlab("binned distance from peak summit (bp)") +
ylab("motif") +
ggtitle("Motif instances distances to peak summit \n(aggregated within categories, across cell types)")hit_summit_dist_summed <- hit_summit_dist %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
group_by(annotation_broad, bin_dist, pattern_class) %>%
summarize(sum_counts = sum(n))## `summarise()` has grouped output by 'annotation_broad', 'bin_dist'. You can
## override using the `.groups` argument.
hit_summit_dist_summed %>%
filter(annotation_broad != "unresolved") %>%
group_by(annotation_broad) %>%
mutate(cum_sum_counts = cumsum(sum_counts)) %>%
mutate(pattern_class = factor(pattern_class, levels = c("pos_patterns", "neg_patterns"))) %>%
ggplot(aes(x = bin_dist, y = log10(cum_sum_counts), group = annotation_broad)) +
# geom_point(aes(fill = pattern_class), alpha = 0.4, color = "white", shape = 21) +
geom_line(aes(color = pattern_class), alpha = 0.4) +
facet_wrap(~ pattern_class, ncol = 1) +
scale_color_manual(values = c("pos_patterns" = "#BD202E", "neg_patterns" = "#1D75BC")) +
scale_fill_manual(values = c("pos_patterns" = "#BD202E", "neg_patterns" = "#1D75BC")) +
theme(panel.grid.minor.x = element_line(color = "gray90"),
panel.grid.major.x = element_line(color = "gray90"),
strip.background.x = element_blank()) +
no_legend()hit_summit_dist_coarse <- hit_summit_dist %>%
dplyr::select(Cluster, motif_name, annotation_broad, category, pattern_class, bin_dist, n_per_cluster_per_bin = n) %>%
filter(category %in% c("base", "base_with_flank") &
annotation_broad %in% motifs_keep) %>%
mutate(bin_dist_coarse = case_when(
bin_dist <= 50 ~ "0-50 bp",
bin_dist <= 100 ~ "51-100 bp",
bin_dist <= 150 ~ "101-150 bp",
bin_dist <= 200 ~ "151-200 bp",
bin_dist <= 250 ~ "201-250 bp",
TRUE ~ "> 251 bp"
)) %>%
group_by(annotation_broad, pattern_class, bin_dist_coarse) %>%
summarize(n_per_bin = sum(n_per_cluster_per_bin)) %>%
mutate(n_total = sum(n_per_bin),
prop_ber_bin = n_per_bin / n_total)## `summarise()` has grouped output by 'annotation_broad', 'pattern_class'. You
## can override using the `.groups` argument.
cmap_bins <- RColorBrewer::brewer.pal(6, "BuGn")
names(cmap_bins) <- rev(c("0-50 bp", "51-100 bp", "101-150 bp", "151-200 bp", "201-250 bp", "> 251 bp"))
motif_order_dist <- hit_summit_dist_coarse %>% filter(bin_dist_coarse == "0-50 bp") %>% arrange(desc(prop_ber_bin)) %>% pull(annotation_broad) %>% unique()
hit_summit_dist_coarse %>%
mutate(bin_dist_coarse = factor(bin_dist_coarse, levels = names(cmap_bins)),
annotation_broad = factor(annotation_broad, levels = motif_order_dist)) %>%
ggplot(aes(x = annotation_broad, y = prop_ber_bin)) +
geom_bar(stat = "identity", aes(fill = bin_dist_coarse)) +
scale_fill_manual(values = cmap_bins) +
rotate_x()Get proportions of motifs per bin:
hit_summit_dist_coarse %>%
mutate(bin_dist_coarse = factor(bin_dist_coarse, levels = rev(names(cmap_bins)))) %>%
group_by(bin_dist_coarse) %>%
summarize(n_per_bin_across_motifs = sum(n_per_bin)) %>%
mutate(n_total = sum(n_per_bin_across_motifs),
prop = n_per_bin_across_motifs / n_total,
cum_n = cumsum(n_per_bin_across_motifs),
cum_prop = cum_n / n_total) %>%
arrange(bin_dist_coarse)## # A tibble: 6 × 6
## bin_dist_coarse n_per_bin_across_motifs n_total prop cum_n cum_prop
## <fct> <int> <int> <dbl> <int> <dbl>
## 1 0-50 bp 43674790 120645979 0.362 43674790 0.362
## 2 51-100 bp 21467522 120645979 0.178 65142312 0.540
## 3 101-150 bp 13494671 120645979 0.112 78636983 0.652
## 4 151-200 bp 9438266 120645979 0.0782 88075249 0.730
## 5 201-250 bp 7483429 120645979 0.0620 95558678 0.792
## 6 > 251 bp 25087301 120645979 0.208 120645979 1
region_type_summed <- hits_all_anno %>%
filter(annotation_broad %in% motifs_keep) %>%
dplyr::select(motif_name, annotation_broad, Distal, Exonic, Intronic, Promoter) %>%
gather(region_type, count, 3:ncol(.)) %>%
group_by(annotation_broad, region_type) %>%
summarize(count_total = sum(count, na.rm = TRUE)) %>%
mutate(annotation_broad = factor(annotation_broad, levels = unique(medians_dist_dyad$annotation_broad)))## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
p3 <- region_type_summed %>%
ggplot(aes(x = annotation_broad, y = count_total)) +
geom_col(aes(fill = region_type), position = "fill", alpha = 0.7) +
scale_fill_manual(values = cmap_peaktype2) +
coord_flip() +
theme(legend.position = "bottom")
plot_grid(p1, p2 , p3, nrow = 1, align = "h", axis = "tb", rel_widths = c(2, 2, 1))Load known motifs from Vierstra input set:
vierstra_motifs <- read_cwm_meme(file.path(kundaje_dir, "refs/Vierstra_motifs/all.dbs.meme"))We want to describe motifs in a given subset. This plots:
base_broad <- motifs_compiled_unique %>%
filter(category %in% c("base", "base_with_flank"))
length(base_broad$annotation_broad %>% unique())## [1] 51
plot_motif_summary(base_broad,
hits_all_anno,
hit_dyad_dist,
order_by = "distToTSS",
plot_known_motifs = TRUE,
# manually indicate which motifs to reverse complement
# to match known motif orientation
to_revcomp = c("GATA", "CTCF", "RFX", "MEF2", "NFKB/RELA", "NR", "ONECUT",
"ZNF143", "NR:HNF4", "REST", "THRA/B", "HD:SIX/ZNF", "ZEB/SNAI", "YY1/2rep"))## @ plotting 51 motifs
## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
homocomp_broad <- motifs_compiled_unique %>%
filter(pattern_class == "pos_patterns" & category == "homocomposite")
length(unique(homocomp_broad$annotation_broad))## [1] 22
plot_motif_summary(homocomp_broad,
hits_all_anno,
hit_dyad_dist,
plot_known_motifs = TRUE,
rel_widths = c(3.5, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
to_revcomp = c("ETS_ETS", "IRF/STAT_IRF/STAT", "BZIP_BZIP", "HD:PITX/OTX_HD:PITX/OTX"))## @ plotting 22 motifs
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
Just showing the top 30 here, because there are many.
heterocomp_broad <- motifs_compiled_unique %>%
filter(category == "heterocomposite" & !grepl("unresolved", annotation))
heterocomp_broad %>%
group_by(annotation_broad) %>%
count() %>%
arrange(desc(n))## # A tibble: 95 × 2
## # Groups: annotation_broad [95]
## annotation_broad n
## <chr> <int>
## 1 BHLH_HD 11
## 2 ETS_SP/KLF 9
## 3 ETS_NR 8
## 4 FOX_NKX 8
## 5 ETS_SOX 7
## 6 BHLH_BZIP 6
## 7 BHLH_MEF2 6
## 8 BHLH_NFI 6
## 9 ETS_NFI 6
## 10 GATA_MEF2 5
## # ℹ 85 more rows
plot_motif_summary(heterocomp_broad,
hits_all_anno,
hit_dyad_dist,
plot_known_motifs = FALSE,
# known_motifs = jolma_motifs,
# known_motif_col = "match0_jolma",
# known_motif_name = "match0_jolma",
rel_widths = c(3, 0.5, 1, 1, 1, 1, 1, 1.5, 1.5, 1),
top_n = 30)## @ plotting 30 motifs
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
neg_patterns <- motifs_compiled_unique %>%
filter(pattern_class == "neg_patterns") %>%
filter(!(category == "unresolved"))
# we can make use of the plotting function by setting the broad annotation to
# the granular one
plot_motif_summary(neg_patterns %>% mutate(annotation_broad = motif_name),
hits_all_anno %>% mutate(annotation_broad = motif_name),
hit_dyad_dist %>% mutate(annotation_broad = motif_name),
plot_known_motifs = TRUE,
rel_widths = c(4, 2, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
to_revcomp = c("456|ZEB/SNAI", "455|YY1/2repressive"))## @ plotting 11 motifs
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
unresolved <- motifs_compiled_unique %>%
filter(category == "unresolved") %>%
arrange(desc(total_hits))
plot_motif_summary(unresolved %>% mutate(annotation_broad = motif_name),
hits_all_anno %>% mutate(annotation_broad = motif_name),
hit_dyad_dist %>% mutate(annotation_broad = motif_name),
order_by = "distToTSS",
plot_known_motifs = FALSE,
rel_widths = c(3.5, 0.5, 1, 1, 1, 1, 1, 1, 1, 1),
top_n = 30)## @ plotting 30 motifs
## `summarise()` has grouped output by 'pattern_class'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'annotation_broad'. You can override using
## the `.groups` argument.
For a few motifs, let’s compute the ATAC footprint, focusing on the the cluster with the most hits for each of the motifs of interest:
# for each motif of interest, what is the cluster w/ the most hits?
hits_all_anno %>%
group_by(annotation_broad) %>%
slice_max(order_by = n, n = 1) %>%
filter(annotation_broad %in%
c("CTCF", "GATA", "BZIP", "NFY", "NFYrep", "YY1/2", "YY1/2rep", "HIC", "ZEB/SNAI", "BCL11A/IRF/SPI", "BCL11Arep")) %>%
dplyr::select(pattern_class, motif_name, Cluster, n)## Adding missing grouping variables: `annotation_broad`
## # A tibble: 11 × 5
## # Groups: annotation_broad [11]
## annotation_broad pattern_class motif_name Cluster n
## <chr> <chr> <chr> <chr> <int>
## 1 BCL11A/IRF/SPI pos_patterns 214|ETS:SPI1/BCL11A Liver_c7 12993
## 2 BCL11Arep neg_patterns 4|BCL11A/Brepressive Skin_c2 291187
## 3 BZIP pos_patterns 132|BZIP:FOSL/JUND#1 Adrenal_c0 52506
## 4 CTCF pos_patterns 148|CTCF#1 Liver_c1 29365
## 5 GATA pos_patterns 260|GATA#1 Heart_c2 164132
## 6 HIC neg_patterns 333|HIC Heart_c3 222937
## 7 NFY pos_patterns 367|NFY Brain_c9 32681
## 8 NFYrep neg_patterns 372|NFYrepressive Heart_c4 19995
## 9 YY1/2 pos_patterns 454|YY1/2 Lung_c1 5345
## 10 YY1/2rep neg_patterns 455|YY1/2repressive Liver_c1 55895
## 11 ZEB/SNAI neg_patterns 456|ZEB/SNAI Skin_c1 620645
global_bp_obj <- readRDS(here("output/05-misc/01/global_bp_obj.rds"))
head(global_bp_obj$cell_metadata)## cb Cluster Cluster_chrombpnet organ
## 1 T318_b16_Adr_PCW21#CL131_E05+G07+B04 AG_0 Adrenal_c0 Adrenal
## 2 T318_b16_Adr_PCW21#CL132_E06+C05+G06 AG_0 Adrenal_c0 Adrenal
## 3 T318_b16_Adr_PCW21#CL131_E06+H08+A07 AG_0 Adrenal_c0 Adrenal
## 4 T318_b16_Adr_PCW21#CL131_E02+D10+G10 AG_0 Adrenal_c0 Adrenal
## 5 T318_b16_Adr_PCW21#CL131_E05+H06+A03 AG_0 Adrenal_c0 Adrenal
## 6 T318_b16_Adr_PCW21#CL131_E11+D02+G02 AG_0 Adrenal_c0 Adrenal
## organ_code nFrags L1_annot L2_annot L3_annot
## 1 AG 11499 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 2 AG 4449 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 3 AG 13058 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 4 AG 11339 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 5 AG 6374 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## 6 AG 11742 AG_0_adrenal cortex 1 Adrenal adrenal cortex adrenal cortex
## cluster_id compartment archive_L1_clusterID archive_L1_clusterName
## 1 0 epi 0 AG_epi
## 2 0 epi 0 AG_epi
## 3 0 epi 0 AG_epi
## 4 0 epi 0 AG_epi
## 5 0 epi 0 AG_epi
## 6 0 epi 0 AG_epi
## archive_L2_clusterID archive_L2_clusterName archive_L3_clusterName
## 1 AG_epi1 adrenal cortex adrenal cortex 1
## 2 AG_epi1 adrenal cortex adrenal cortex 1
## 3 AG_epi1 adrenal cortex adrenal cortex 1
## 4 AG_epi1 adrenal cortex adrenal cortex 1
## 5 AG_epi1 adrenal cortex adrenal cortex 1
## 6 AG_epi1 adrenal cortex adrenal cortex 1
## archive_L3_clusterID
## 1 AG_adrenal cortex 1
## 2 AG_adrenal cortex 1
## 3 AG_adrenal cortex 1
## 4 AG_adrenal cortex 1
## 5 AG_adrenal cortex 1
## 6 AG_adrenal cortex 1
The motif footprint calculation counts insertions (i.e. fragment starts, ends) that fall within each position in the window. Then the fragment count at each position is normalized by (divided by) the mean count in the outer 10% of the window, taken as a measure of the local background accessibility.
footprint_fn <- function(cluster,
motif_broad,
split_by_quantile = TRUE,
sample_instances = TRUE,
sample_n = 10000,
flank = 250) {
motif_safe <- gsub("\\/", ".", motif_broad)
q_string <- ifelse(split_by_quantile, "quantiles", "noquantiles")
s_string <- ifelse(sample_instances, glue("sample{sample_n}"), "nosample")
out_path <- glue("{out}/{cluster}__{motif_safe}.footprint.{q_string}.{s_string}.flank{flank}.Rds")
if (file.exists(out_path)) {
message("@ found previous output at ", basename(out_path))
fp_out <- readRDS(out_path)
return(fp_out)
}
message("@ loading hits for cluster ", cluster)
# load hits for the cluster of interest
motif_instances <- rtracklayer::import.bed(glue("{chrombpnet_dir}/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/{cluster}/{finemo_param}/{cluster}__hits_unified.{finemo_param}.reconciled.bed.gz"),
extraCols = c("pattern_class" = "character"))
# map in the broad annotation
motif_instances$annotation_broad <- plyr::mapvalues(motif_instances$name,
from = motifs_compiled_unique$motif_name,
to = motifs_compiled_unique$annotation_broad,
warn_missing = FALSE)
instances_motif <- motif_instances[motif_instances$annotation_broad == motif_broad]
# subset the global object
sub_meta <- global_bp_obj$cell_metadata %>%
dplyr::filter(Cluster_chrombpnet == cluster)
frags <- global_bp_obj$frags %>% select_cells(sub_meta$cb)
# if there are different variants of the motif, subset to the most common one -
# motif instances may differ because they are identified with slightly different
# input motifs, and then reconciled afterwards to a unique compendium motif
message(glue("@ found {length(motif_instances)} motif instances"))
n_variants <- instances_motif$name %>% unique %>% length
most_common <- instances_motif %>% width() %>% table() %>% which.max() %>% names()
instances_subset <- instances_motif[width(instances_motif) == most_common]
message(glue("@ motif instances remaining: N={length(instances_subset)}"))
# split motifs by motif score quantile
if (split_by_quantile) {
message("@ splitting hits by quantile")
quantiles <- quantile(instances_subset$score)
instances_subset$quantile <- cut(instances_subset$score, breaks = quantiles,
include.lowest = TRUE, labels = 1:4) %>%
as.character()
print(unique(instances_subset$quantile))
} else { instances_subset$quantile <- 1 }
n_quantiles <- length(unique(instances_subset$quantile))
# sample motif instances
if (sample_instances) {
instances_per_q_sampled <- map(1:n_quantiles, function(q) {
instances_per_q <- instances_subset[instances_subset$quantile == q]
if (length(instances_per_q) < sample_n) {
message(glue("@@ quantile {q}: <{sample_n} motif instances"))
return(instances_per_q)
}
else {
message(glue("@@ quantile {q}: sampling {sample_n} motif instances"))
instances_per_q %>% sample(sample_n)
}
})
} else {
instances_per_q_sampled <- split(instances_subset, instances_subset$quantile)
}
print(length(instances_per_q_sampled))
n_instances <- map_dbl(instances_per_q_sampled, length) %>% sum
# compute footprints
message("@ computing footprints...")
footprints_data <- imap_dfr(instances_per_q_sampled, function(i, q) {
if (split_by_quantile) message(glue("@@ quantile {q}"))
fp <- BPCells::plot_tf_footprint(
fragments = frags,
motif_positions = i,
cell_groups = sub_meta$Cluster_chrombpnet,
flank = flank,
smooth = 2,
return_data = TRUE
) %>%
mutate(quantile = q)
return(fp)
}
)
fp_out <- list(
"cluster" = cluster,
"motif" = motif_broad,
"footprint" = footprints_data,
"width" = as.numeric(most_common),
"n_instances" = n_instances)
message("@ saving results to ", basename(out_path))
saveRDS(fp_out, file = out_path)
return(fp_out)
message("@ done.")
}
plot_footprint_fn <- function(footprints_out, central_window = 500) {
n_quantiles <- length(unique(footprints_out$footprint$quantile))
message("@ plotting...")
footprints_out$footprint$quantile <- as.character(footprints_out$footprint$quantile)
if (max(footprints_out$footprint$quantile) > 1) blues <- RColorBrewer::brewer.pal(8, "Blues") %>%
tail(4)
else blues <- "black"
flank <- central_window / 2
# plot footprints
p1 <- footprints_out$footprint %>%
filter(position > -flank & position < flank) %>%
ggplot(aes(position, smoothed_enrichment, color = quantile)) +
# highlight the motif position within the window
annotate("rect",
xmin = - footprints_out$width / 2, xmax = footprints_out$width / 2,
ymin = -Inf, ymax = Inf,
fill = "yellow", alpha = 0.3) +
geom_line(alpha = 0.8) +
scale_color_manual(values = blues) +
labs(x = "Distance from motif center (bp)", y = "Enrichment") +
ggtitle(glue("{footprints_out$motif} \n(N={footprints_out$n_instances} instances, {footprints_out$cluster})")) +
square()
if (n_quantiles == 1) p1 <- p1 + no_legend()
p1
}We want to get footprints of a specific TF, so we filter to instances of e.g. CTCF here, which is a positive control that we know should footprint well in a high-quality ATACseq dataset.
fp_ctcf_out <- footprint_fn(cluster = "Liver_c1",
motif_broad = "CTCF",
split_by_quantile = TRUE,
sample_instances = FALSE)## @ found previous output at Liver_c1__CTCF.footprint.quantiles.nosample.flank250.Rds
plot_footprint_fn(fp_ctcf_out)## @ plotting...
# no splitting
fp_ctcf_out2 <- footprint_fn(cluster = "Liver_c1",
motif_broad = "CTCF",
split_by_quantile = FALSE,
sample_instances = FALSE)## @ found previous output at Liver_c1__CTCF.footprint.noquantiles.nosample.flank250.Rds
plot_footprint_fn(fp_ctcf_out2)## @ plotting...
fp_nfy_out1 <- footprint_fn(cluster = "Brain_c9",
motif_broad = "NFY",
split_by_quantile = TRUE,
sample_instances = FALSE)## @ found previous output at Brain_c9__NFY.footprint.quantiles.nosample.flank250.Rds
plot_footprint_fn(fp_nfy_out1)## @ plotting...
fp_nfy_out <- footprint_fn(
cluster = "Brain_c9",
motif_broad = "NFY",
split_by_quantile = FALSE,
sample_instances = FALSE
)## @ found previous output at Brain_c9__NFY.footprint.noquantiles.nosample.flank250.Rds
plot_footprint_fn(fp_nfy_out)## @ plotting...
fp_nfyrep_out <- footprint_fn(
cluster = "Heart_c4",
motif_broad = "NFYrep",
split_by_quantile = FALSE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Heart_c4
## @ found 2200593 motif instances
## @ motif instances remaining: N=19995
## [1] 1
## @ computing footprints...
## @ saving results to Heart_c4__NFYrep.footprint.noquantiles.nosample.flank500.Rds
plot_footprint_fn(fp_nfyrep_out)## @ plotting...
fp_nfyrep_out2 <- footprint_fn(
cluster = "Heart_c4",
motif_broad = "NFYrep",
split_by_quantile = TRUE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Heart_c4
## @ found 2200593 motif instances
## @ motif instances remaining: N=19995
## @ splitting hits by quantile
## [1] "3" "4" "2" "1"
## [1] 4
## @ computing footprints...
## @@ quantile 1
## @@ quantile 2
## @@ quantile 3
## @@ quantile 4
## @ saving results to Heart_c4__NFYrep.footprint.quantiles.nosample.flank500.Rds
plot_footprint_fn(fp_nfyrep_out2)## @ plotting...
fp_yy1_out <- footprint_fn(
cluster = "Lung_c1",
motif_broad = "YY1/2",
split_by_quantile = FALSE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Lung_c1
## @ found 2291775 motif instances
## @ motif instances remaining: N=2857
## [1] 1
## @ computing footprints...
## @ saving results to Lung_c1__YY1.2.footprint.noquantiles.nosample.flank500.Rds
plot_footprint_fn(fp_yy1_out)## @ plotting...
fp_yy1_out2 <- footprint_fn(
cluster = "Lung_c1",
motif_broad = "YY1/2",
split_by_quantile = TRUE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Lung_c1
## @ found 2291775 motif instances
## @ motif instances remaining: N=2857
## @ splitting hits by quantile
## [1] "1" "2" "3" "4"
## [1] 4
## @ computing footprints...
## @@ quantile 1
## @@ quantile 2
## @@ quantile 3
## @@ quantile 4
## @ saving results to Lung_c1__YY1.2.footprint.quantiles.nosample.flank500.Rds
plot_footprint_fn(fp_yy1_out2)## @ plotting...
fp_yy1rep_out <- footprint_fn(
cluster = "Liver_c1",
motif_broad = "YY1/2rep",
split_by_quantile = FALSE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Liver_c1
## @ found 1275637 motif instances
## @ motif instances remaining: N=55895
## [1] 1
## @ computing footprints...
## @ saving results to Liver_c1__YY1.2rep.footprint.noquantiles.nosample.flank500.Rds
plot_footprint_fn(fp_yy1rep_out)## @ plotting...
plot_footprint_fn(fp_yy1rep_out)## @ plotting...
fp_yy1rep_out2 <- footprint_fn(
cluster = "Liver_c1",
motif_broad = "YY1/2rep",
split_by_quantile = TRUE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Liver_c1
## @ found 1275637 motif instances
## @ motif instances remaining: N=55895
## @ splitting hits by quantile
## [1] "4" "1" "2" "3"
## [1] 4
## @ computing footprints...
## @@ quantile 1
## @@ quantile 2
## @@ quantile 3
## @@ quantile 4
## @ saving results to Liver_c1__YY1.2rep.footprint.quantiles.nosample.flank500.Rds
plot_footprint_fn(fp_yy1rep_out2)## @ plotting...
fp_zeb_out <- footprint_fn(
cluster = "Skin_c1",
motif_broad = "ZEB/SNAI",
split_by_quantile = FALSE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Skin_c1
## @ found 2593062 motif instances
## @ motif instances remaining: N=510572
## [1] 1
## @ computing footprints...
## @ saving results to Skin_c1__ZEB.SNAI.footprint.noquantiles.nosample.flank500.Rds
plot_footprint_fn(fp_zeb_out)## @ plotting...
# split quantiles
fp_zeb_out2 <- footprint_fn(
cluster = "Skin_c1",
motif_broad = "ZEB/SNAI",
split_by_quantile = TRUE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Skin_c1
## @ found 2593062 motif instances
## @ motif instances remaining: N=510572
## @ splitting hits by quantile
## [1] "4" "3" "2" "1"
## [1] 4
## @ computing footprints...
## @@ quantile 1
## @@ quantile 2
## @@ quantile 3
## @@ quantile 4
## @ saving results to Skin_c1__ZEB.SNAI.footprint.quantiles.nosample.flank500.Rds
plot_footprint_fn(fp_zeb_out2)## @ plotting...
fp_hic_out <- footprint_fn(
cluster = "Heart_c3",
motif_broad = "HIC",
split_by_quantile = FALSE,
sample_instances = FALSE
)## @ found previous output at Heart_c3__HIC.footprint.noquantiles.nosample.flank250.Rds
plot_footprint_fn(fp_hic_out)## @ plotting...
fp_hic_out2 <- footprint_fn(
cluster = "Heart_c3",
motif_broad = "HIC",
split_by_quantile = TRUE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Heart_c3
## @ found 1892655 motif instances
## @ motif instances remaining: N=222937
## @ splitting hits by quantile
## [1] "1" "2" "4" "3"
## [1] 4
## @ computing footprints...
## @@ quantile 1
## @@ quantile 2
## @@ quantile 3
## @@ quantile 4
## @ saving results to Heart_c3__HIC.footprint.quantiles.nosample.flank500.Rds
plot_footprint_fn(fp_hic_out2)## @ plotting...
fp_bcl11_out <- footprint_fn(
cluster = "Liver_c7",
motif_broad = "BCL11A/IRF/SPI",
split_by_quantile = FALSE,
sample_instances = FALSE)## @ found previous output at Liver_c7__BCL11A.IRF.SPI.footprint.noquantiles.nosample.flank250.Rds
plot_footprint_fn(fp_bcl11_out)## @ plotting...
fp_bcl11_out2 <- footprint_fn(
cluster = "Liver_c7",
motif_broad = "BCL11A/IRF/SPI",
split_by_quantile = TRUE,
sample_instances = FALSE,
flank = 500
)## @ loading hits for cluster Liver_c7
## @ found 958766 motif instances
## @ motif instances remaining: N=10508
## @ splitting hits by quantile
## [1] "1" "3" "2" "4"
## [1] 4
## @ computing footprints...
## @@ quantile 1
## @@ quantile 2
## @@ quantile 3
## @@ quantile 4
## @ saving results to Liver_c7__BCL11A.IRF.SPI.footprint.quantiles.nosample.flank500.Rds
plot_footprint_fn(fp_bcl11_out2)## @ plotting...
fp_bcl11rep_out <- footprint_fn(
cluster = "Skin_c2",
motif_broad = "BCL11Arep",
split_by_quantile = FALSE,
sample_instances = FALSE)## @ found previous output at Skin_c2__BCL11Arep.footprint.noquantiles.nosample.flank250.Rds
plot_footprint_fn(fp_bcl11rep_out)## @ plotting...
fp_bcl11rep_out2 <- footprint_fn(
cluster = "Skin_c2",
motif_broad = "BCL11Arep",
split_by_quantile = TRUE,
sample_instances = FALSE)## @ found previous output at Skin_c2__BCL11Arep.footprint.quantiles.nosample.flank250.Rds
plot_footprint_fn(fp_bcl11rep_out2)## @ plotting...
sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggthemes_4.2.4 RColorBrewer_1.1-3 ggrastr_1.0.1
## [4] magrittr_2.0.3 GenomicFeatures_1.46.5 AnnotationDbi_1.56.2
## [7] Biobase_2.54.0 GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [10] IRanges_2.28.0 S4Vectors_0.32.4 BiocGenerics_0.40.0
## [13] BPCells_0.2.0 ggseqlogo_0.1 plotly_4.10.4
## [16] cowplot_1.1.3 ggrepel_0.9.3 stringr_1.5.0
## [19] purrr_1.0.2 readr_2.1.4 tidyr_1.3.0
## [22] dplyr_1.1.3 ggplot2_3.5.1 scales_1.3.0
## [25] glue_1.8.0 here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] ggbeeswarm_0.7.2 colorspace_2.1-0
## [3] rjson_0.2.21 rprojroot_2.0.3
## [5] XVector_0.34.0 rstudioapi_0.15.0
## [7] farver_2.1.1 bit64_4.0.5
## [9] fansi_1.0.4 xml2_1.3.5
## [11] cachem_1.0.8 knitr_1.43
## [13] jsonlite_1.8.7 Rsamtools_2.10.0
## [15] dbplyr_2.3.3 png_0.1-8
## [17] compiler_4.1.2 httr_1.4.7
## [19] Matrix_1.6-3 fastmap_1.1.1
## [21] lazyeval_0.2.2 cli_3.6.1
## [23] htmltools_0.5.6 prettyunits_1.1.1
## [25] tools_4.1.2 gtable_0.3.4
## [27] GenomeInfoDbData_1.2.7 rappdirs_0.3.3
## [29] Rcpp_1.0.11 jquerylib_0.1.4
## [31] vctrs_0.6.3 Biostrings_2.62.0
## [33] rtracklayer_1.54.0 xfun_0.40
## [35] lifecycle_1.0.3 restfulr_0.0.15
## [37] gtools_3.9.4 XML_3.99-0.14
## [39] zlibbioc_1.40.0 MASS_7.3-60
## [41] vroom_1.6.3 hms_1.1.3
## [43] MatrixGenerics_1.6.0 parallel_4.1.2
## [45] SummarizedExperiment_1.24.0 yaml_2.3.7
## [47] curl_5.0.2 gridExtra_2.3
## [49] memoise_2.0.1 sass_0.4.7
## [51] biomaRt_2.50.3 stringi_1.7.12
## [53] RSQLite_2.3.1 highr_0.10
## [55] BiocIO_1.4.0 universalmotif_1.12.4
## [57] ArchR_1.0.2 filelock_1.0.2
## [59] BiocParallel_1.28.3 rlang_1.1.1
## [61] pkgconfig_2.0.3 matrixStats_1.0.0
## [63] bitops_1.0-7 evaluate_0.21
## [65] lattice_0.20-45 GenomicAlignments_1.30.0
## [67] htmlwidgets_1.6.2 labeling_0.4.3
## [69] bit_4.0.5 tidyselect_1.2.0
## [71] plyr_1.8.8 R6_2.5.1
## [73] generics_0.1.3 DelayedArray_0.20.0
## [75] DBI_1.1.3 pillar_1.9.0
## [77] withr_2.5.0 KEGGREST_1.34.0
## [79] RCurl_1.98-1.12 tibble_3.2.1
## [81] crayon_1.5.2 utf8_1.2.3
## [83] BiocFileCache_2.2.1 tzdb_0.4.0
## [85] rmarkdown_2.24 viridis_0.6.4
## [87] progress_1.2.2 data.table_1.14.8
## [89] blob_1.2.4 digest_0.6.33
## [91] munsell_0.5.0 beeswarm_0.4.0
## [93] viridisLite_0.4.2 vipor_0.4.5
## [95] bslib_0.5.1