# **MODIFY THIS CHUNK**
project_id  <- "HDMA-public" # determines the name of the cache folder
doc_id      <- "04-enhancers/07" # determines name of the subfolder of `figures` where pngs/pdfs are saved
out         <- here::here("output/", doc_id); dir.create(out, recursive = TRUE)
figout      <- here::here("figures/", doc_id, "/")
cache       <- paste0("~/scratch/cache/", project_id, "/", doc_id, "/")
script_path <- here::here("code/utils/")

1 Overview

Analysis:

  • We got ENCODE cCREs
  • Calculate overlaps between our developmental peaks & each ENCODE list

From ENCODE SCREEN resource: ENCODE V4 cCRE descriptions: https://screen.wenglab.org/about

Classification of cCREs Many uses of cCREs are based on the regulatory role associated with their biochemical signatures. Thus, we putatively defined cCREs in one of the following annotation groups based on each element’s dominant biochemical signals across all available biosamples. Analogous to GENCODE’s catalog of genes, which are defined irrespective of their varying expression levels and alternative transcripts across different cell types, we provide a general, cell type-agnostic classification of cCREs based on the max-Zs as well as its proximity to the nearest annotated TSS:

  1. Promoter-like signatures (PLS): 1) fall within 200 bp (center to center) of an annotated GENCODE TSS or experimentally derived TSS and 2) have high chromatin accessibility (either DNase or ATAC) and H3K4me3 signals.

  2. Enhancer-like signatures (ELS) have high chromatin accessibility and H3K27ac signals. If they are within 200 bp of a TSS they must also have low H3K4me3 signal. The subset of the enhancers within 2 kb of a TSS are denoted as TSS proximal (pELS), while the remaining subset is denoted TSS distal (dELS).

  3. Chromatin accessibility + H3K4me3 (CA-H3K4me3) have high chromatin accessibility and H3K4me3 signals, low H3K27ac signals and do not fall within 200 bp of a TSS.

  4. Chromatin accessibility + CTCF (CA-CTCF) have high chromatin accessibility and CTCF signals, low H3K4me3 and H3K27ac signals.

  5. Chromatin accessibility + transcription factor (CA-TF) have high chromatin accessibility, low H3K4me3 H3K27ac, and CTCF signals, and overlap a transcription factor cluster.

  6. Chromatin accessibility (CA) have high chromatin accessibility, and low H3K4me3, H3K27ac, and CTCF signals.

  7. Transcription factor (TF) have low chromatin accessibility, H3K4me3, H3K27ac, and CTCF signals and overlap a transcription factor cluster.

2 Set up

# libraries
library(here)
library(dplyr)
library(tidyr)
library(ggplot2)
library(purrr)
library(glue)
library(readr)
library(tibble)
library(cowplot)
library(GenomicRanges)
library(rtracklayer)

# shared project scripts/functions
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))

ggplot2::theme_set(theme_BOR())

3 Load cluster metadata

cluster_meta <- read_tsv(here::here("output/02-global_analysis/02/cluster_metadata_dend_order.tsv"))
## Rows: 203 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): Cluster, organ, organ_code, L0_clusterName, L1_clusterName, L2_clu...
## dbl  (9): L1_clusterID, ncell, median_numi, median_ngene, median_pctmt, medi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

4 Load global peakset

# load peaks and convert to granges
peaks <- readRDS(here::here("output/04-enhancers/01/peaks_all_df_annots.rds"))
peaks_gr <- makeGRangesFromDataFrame(peaks, ignore.strand = TRUE, keep.extra.columns = TRUE)

rtracklayer::export.bed(peaks_gr, con = file.path(out, "atlas_peaks_all.bed"), format = "BED")

# how many peaks?
length(peaks_gr)
## [1] 1032273
# all same length
all(width(peaks_gr) == 501)
## [1] TRUE
cmap_anno <- cmap_peaktype

5 Load ENCODE cREs

Note: GRanges is 1-based while BED is 0-based:

$ head GRCh38-cCREs.bed
chr1    104896  105048  EH38D4327509    EH38E2776520    CTCF-only,CTCF-bound
chr1    138866  139134  EH38D4327520    EH38E2776521    pELS,CTCF-bound
chr1    181289  181639  EH38D4327525    EH38E2776524    DNase-H3K4me3,CTCF-bound
chr1    267925  268171  EH38D4327544    EH38E2776528    CTCF-only,CTCF-bound
chr1    586036  586264  EH38D4327554    EH38E2776532    CTCF-only,CTCF-bound
chr1    605330  605668  EH38D4327560    EH38E2776534    dELS
chr1    727121  727350  EH38D4327570    EH38E2776536    dELS
chr1    778570  778919  EH38D4327580    EH38E2776539    PLS,CTCF-bound
chr1    779026  779180  EH38D4327581    EH38E2776540    PLS,CTCF-bound
chr1    779222  779432  EH38D4327582    EH38E2776541    pELS,CTCF-bound
encode_cCREs_paths <- list.files(path = here::here("output/04-enhancers/07/ENCODE"), full.names = TRUE)
encode_cCREs <- map(encode_cCREs_paths,
                    ~ rtracklayer::import(.x, format = "BED",
                                          extraCols = c("Accession1" = "character", "Accession2" = "character", "Annotation" = "character")))
names(encode_cCREs) <- gsub("GRCh38-", "", gsub(".bed", "", basename(encode_cCREs_paths)))

map(encode_cCREs, length)
## $cCREs
## [1] 2348854
## 
## $`cCREs.CA-CTCF`
## [1] 126034
## 
## $`cCREs.CA-H3K4me3`
## [1] 79246
## 
## $`cCREs.CA-TF`
## [1] 26102
## 
## $cCREs.CA
## [1] 245985
## 
## $`cCREs.CTCF-bound`
## [1] 948642
## 
## $cCREs.dELS
## [1] 1469205
## 
## $cCREs.pELS
## [1] 249464
## 
## $cCREs.PLS
## [1] 47532
## 
## $cCREs.TF
## [1] 105286
map(encode_cCREs, ~ .x$Annotation %>% unique())
## $cCREs
## [1] "pELS"       "CA-CTCF"    "CA-TF"      "CA"         "dELS"      
## [6] "TF"         "CA-H3K4me3" "PLS"       
## 
## $`cCREs.CA-CTCF`
## [1] "CA-CTCF"
## 
## $`cCREs.CA-H3K4me3`
## [1] "CA-H3K4me3"
## 
## $`cCREs.CA-TF`
## [1] "CA-TF"
## 
## $cCREs.CA
## [1] "CA"
## 
## $`cCREs.CTCF-bound`
## [1] "pELS,CTCF-bound"       "CA-CTCF,CTCF-bound"    "TF,CTCF-bound"        
## [4] "CA-H3K4me3,CTCF-bound" "dELS,CTCF-bound"       "PLS,CTCF-bound"       
## 
## $cCREs.dELS
## [1] "dELS"
## 
## $cCREs.pELS
## [1] "pELS"
## 
## $cCREs.PLS
## [1] "PLS"
## 
## $cCREs.TF
## [1] "TF"
head(encode_cCREs$cCREs)
## GRanges object with 6 ranges and 3 metadata columns:
##       seqnames      ranges strand |   Accession1   Accession2  Annotation
##          <Rle>   <IRanges>  <Rle> |  <character>  <character> <character>
##   [1]     chr1 10034-10250      * | EH38D4327497 EH38E2776516        pELS
##   [2]     chr1 10386-10713      * | EH38D4327498 EH38E2776517        pELS
##   [3]     chr1 16098-16381      * | EH38D6144701 EH38E3951272     CA-CTCF
##   [4]     chr1 17344-17642      * | EH38D6144702 EH38E3951273       CA-TF
##   [5]     chr1 29321-29517      * | EH38D6144703 EH38E3951274          CA
##   [6]     chr1 66351-66509      * | EH38D4327503 EH38E3951275          CA
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths

How many in each set:

p <- data.frame(Set = names(encode_cCREs),
           N_elements = map_dbl(encode_cCREs, length)) %>% 
  mutate(Set = gsub("cCREs.", "", Set)) %>% 
  ggplot(aes(x = Set, y = N_elements)) +
  geom_col() +
  scale_y_continuous(label = scales::comma, limits = c(0, 2600000)) +
  ggtitle("# of ENCODE V4 cCREs") +
  geom_text(aes(label = N_elements), vjust = -1) +
  theme(axis.text.x = element_text(angle=90, hjust=1))
p

# ggsave(plot=p, filename=paste0(figout, "encode_cCREs_breakdown.pdf"), width=7, height=5) # if not knitting

6 Calculate overlaps:

Here, we calculate overlap between atlas peaks and ENCODE cCREs.

#' Update peaks_gr metadata columns based on overlaps with other region lists
#'
#' @param query GRanges, the query ranges in which to look for overlaps with \code{subject}.
#' @param subject GRanges, the subject ranges.
#' @param key character, used to name the new column in query
#' @param min_subject_overalp numeric, percentage of query region required for considering
#' two ranges overlapping.
get_overlaps <- function(query, subject, key, min_subject_overlap) {
  
  # get overlap in basepairs, given that query peaks are 501bp
  min_overlap_bp <- min_subject_overlap * 501
  
  # find overlaps between peaks and ENCODE cCREs
  olap_idx <- GenomicRanges::findOverlaps(query, subject = subject, minoverlap = min_overlap_bp)
  peaks_idx <- queryHits(olap_idx)
  annos_per_peaks_idx <- subject$Annotation[subjectHits(olap_idx)]
  
  print(length(olap_idx))
  
  # count the number of times each peak index appears (i.e., the number of overlaps per peak)
  overlap_counts <- table(peaks_idx) %>% enframe(name = "peak_idx", value = "n_overlaps")
  
  # create a dataframe to hold the overlap counts for each query, initialize with "0" (no overlaps)
  anno_df <- data.frame(
    peak_idx = as.character(seq_along(query)),
    Olaps = rep(0, length(query))) %>% 
    left_join(overlap_counts, by = "peak_idx") %>%
    mutate(Olaps = ifelse(!is.na(n_overlaps), n_overlaps, Olaps)) %>%
    dplyr::select(peak_idx, Olaps)
  
  if (!all(anno_df$peak_idx == as.character(seq_along(query)))) {
    stop("Peak indices are not equal.")
  }
  
  anno_df <- anno_df %>% dplyr::select(Olaps)
  anno_df$Olaps <- as.numeric(anno_df$Olaps)
  colnames(anno_df) <- paste0("Overlap_", key)
  
  return(anno_df)
  
}

peaks_anno_0.1 <- imap_dfc(encode_cCREs, ~ get_overlaps(query = peaks_gr, subject = .x, key = .y, min_subject_overlap = 0.1)) %>% mutate(Percent_overlap = 0.1)
## [1] 1180068
## [1] 32339
## [1] 21708
## [1] 8404
## [1] 50181
## [1] 551933
## [1] 821960
## [1] 180156
## [1] 40701
## [1] 24619
peaks_anno_0.5 <- imap_dfc(encode_cCREs, ~ get_overlaps(query = peaks_gr, subject = .x, key = .y, min_subject_overlap = 0.5)) %>% mutate(Percent_overlap = 0.5)
## [1] 526082
## [1] 13553
## [1] 9495
## [1] 5057
## [1] 26672
## [1] 219278
## [1] 378862
## [1] 62041
## [1] 23142
## [1] 7260
head(peaks_anno_0.1)
##   Overlap_cCREs Overlap_cCREs.CA-CTCF Overlap_cCREs.CA-H3K4me3
## 1             1                     0                        0
## 2             1                     0                        0
## 3             1                     0                        0
## 4             2                     0                        0
## 5             1                     0                        0
## 6             1                     0                        0
##   Overlap_cCREs.CA-TF Overlap_cCREs.CA Overlap_cCREs.CTCF-bound
## 1                   0                0                        1
## 2                   0                0                        0
## 3                   0                0                        1
## 4                   0                0                        0
## 5                   0                0                        0
## 6                   0                0                        1
##   Overlap_cCREs.dELS Overlap_cCREs.pELS Overlap_cCREs.PLS Overlap_cCREs.TF
## 1                  1                  0                 0                0
## 2                  0                  0                 1                0
## 3                  0                  1                 0                0
## 4                  0                  2                 0                0
## 5                  1                  0                 0                0
## 6                  1                  0                 0                0
##   Percent_overlap
## 1             0.1
## 2             0.1
## 3             0.1
## 4             0.1
## 5             0.1
## 6             0.1
nrow(peaks_anno_0.5)
## [1] 1032273

Also compute the reverse, i.e. which of the ENCODE elements overlap with atlas elements?

cCREs_anno_0.1 <- imap_dfr(encode_cCREs, ~ get_overlaps(query = .x, subject = peaks_gr, key = .y, min_subject_overlap = 0.1) %>% set_colnames("Overlap") %>% mutate(ENCODE_set = .y)) %>% mutate(Percent_overlap = 0.1)
## [1] 1180068
## [1] 32339
## [1] 21708
## [1] 8404
## [1] 50181
## [1] 551933
## [1] 821960
## [1] 180156
## [1] 40701
## [1] 24619
cCREs_anno_0.5 <- imap_dfr(encode_cCREs, ~ get_overlaps(query = .x, subject = peaks_gr, key = .y, min_subject_overlap = 0.5) %>% set_colnames("Overlap") %>% mutate(ENCODE_set = .y)) %>% mutate(Percent_overlap = 0.5)
## [1] 526082
## [1] 13553
## [1] 9495
## [1] 5057
## [1] 26672
## [1] 219278
## [1] 378862
## [1] 62041
## [1] 23142
## [1] 7260

7 Visualize

peaks_anno_long <- bind_rows(data.frame(peaks_anno_0.1),
                             data.frame(peaks_anno_0.5)) %>% 
  dplyr::select(Percent_overlap, matches("Overlap_")) %>% 
  # convert to long/tidy format
  gather(ENCODE_set, Status, 2:ncol(.)) %>% 
  # convert to binary
  mutate(Status = case_when(
    Status == 0 ~ "None",
    Status == 1 ~ "1",
    Status > 1   ~ ">1"))

# make a palette that encodes both the cCRE type and the status in terms of # overlaps
palette_encode_status <- base::expand.grid(unique(peaks_anno_long$ENCODE_set), unique(peaks_anno_long$Status)) %>% 
  mutate(Var1 = gsub("\\.", "-", gsub("cCREs.", "", gsub("Overlap_", "", Var1)))) %>% 
  rowwise() %>% 
  mutate(Color = case_when(
    Var2 == "None" ~ "gray90",
    Var2 == "1" ~ "gray50",
    Var2 == ">1" ~ 'black')) %>% 
  mutate(Key = paste0(Var1, "_", Var2)) %>% 
  dplyr::select(Key, Color) %>% 
  tibble::deframe()

palette_encode_status
##         cCREs_1       CA-CTCF_1    CA-H3K4me3_1         CA-TF_1            CA_1 
##        "gray50"        "gray50"        "gray50"        "gray50"        "gray50" 
##    CTCF-bound_1          dELS_1          pELS_1           PLS_1            TF_1 
##        "gray50"        "gray50"        "gray50"        "gray50"        "gray50" 
##        cCREs_>1      CA-CTCF_>1   CA-H3K4me3_>1        CA-TF_>1           CA_>1 
##         "black"         "black"         "black"         "black"         "black" 
##   CTCF-bound_>1         dELS_>1         pELS_>1          PLS_>1           TF_>1 
##         "black"         "black"         "black"         "black"         "black" 
##      cCREs_None    CA-CTCF_None CA-H3K4me3_None      CA-TF_None         CA_None 
##        "gray90"        "gray90"        "gray90"        "gray90"        "gray90" 
## CTCF-bound_None       dELS_None       pELS_None        PLS_None         TF_None 
##        "gray90"        "gray90"        "gray90"        "gray90"        "gray90"
p1 <- peaks_anno_long %>% 
  mutate(ENCODE_set = gsub("\\.", "-", gsub("cCREs.", "", gsub("Overlap_", "", ENCODE_set))),
         ENCODE_status = paste0(ENCODE_set, "_", Status)) %>% 
  ggplot(aes(x = ENCODE_set)) +
  geom_bar(aes(fill = ENCODE_status)) +
  scale_fill_manual(values = palette_encode_status) +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~ Percent_overlap) +
  xlab("ENCODE V4 cCRE set") + ylab("# of atlas peaks") +
  ggtitle("Atlas peaks which overlap with \nENCODE candidate cis-regulatory elements") +
  theme(strip.background = element_blank()) +
  rotate_x()

p2 <- bind_rows(cCREs_anno_0.1, cCREs_anno_0.5) %>% 
  mutate(Status = case_when( # convert to binary
    Overlap == 0 ~ "None",
    Overlap == 1 ~ "1",
    Overlap > 1   ~ ">1")) %>% 
  mutate(ENCODE_set = gsub("\\.", "-", gsub("cCREs.", "", gsub("Overlap_", "", ENCODE_set))),
         ENCODE_status = paste0(ENCODE_set, "_", Status)) %>% 
  ggplot(aes(x = ENCODE_set)) +
  geom_bar(aes(fill = ENCODE_status)) +
  scale_fill_manual(values = palette_encode_status) +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(~ Percent_overlap) +
  xlab("ENCODE V4 cCRE set") + ylab("# of ENCODE cCREs") +
  ggtitle("ENCODE candidate cis-regulatory \nelements which overlap with atlas peaks") +
  theme(strip.background = element_blank()) +
  rotate_x()

p <- plot_grid(p1, p2, nrow = 2, align = "v", axis = "rl")
p

# ggsave(plot=p, filename = paste0(figout, "barplot_peaks_encode_overlap.pdf"), width = 10, height = 9)

8 Explore overlapping/non-overlapping peaks

peaks_anno_0.1 <- peaks_anno_0.1 %>% 
  tibble::add_column(peak_name = mcols(peaks_gr)$peak_name, .before = 1)
colnames(peaks_anno_0.1) <- gsub("Overlap_", "Overlap_ENCODE_", colnames(peaks_anno_0.1))

head(peaks_anno_0.1)
##            peak_name Overlap_ENCODE_cCREs Overlap_ENCODE_cCREs.CA-CTCF
## 1 chr1_794823_795323                    1                            0
## 2 chr1_817072_817572                    1                            0
## 3 chr1_817868_818368                    1                            0
## 4 chr1_818822_819322                    2                            0
## 5 chr1_819827_820327                    1                            0
## 6 chr1_821644_822144                    1                            0
##   Overlap_ENCODE_cCREs.CA-H3K4me3 Overlap_ENCODE_cCREs.CA-TF
## 1                               0                          0
## 2                               0                          0
## 3                               0                          0
## 4                               0                          0
## 5                               0                          0
## 6                               0                          0
##   Overlap_ENCODE_cCREs.CA Overlap_ENCODE_cCREs.CTCF-bound
## 1                       0                               1
## 2                       0                               0
## 3                       0                               1
## 4                       0                               0
## 5                       0                               0
## 6                       0                               1
##   Overlap_ENCODE_cCREs.dELS Overlap_ENCODE_cCREs.pELS Overlap_ENCODE_cCREs.PLS
## 1                         1                         0                        0
## 2                         0                         0                        1
## 3                         0                         1                        0
## 4                         0                         2                        0
## 5                         1                         0                        0
## 6                         1                         0                        0
##   Overlap_ENCODE_cCREs.TF Percent_overlap
## 1                       0             0.1
## 2                       0             0.1
## 3                       0             0.1
## 4                       0             0.1
## 5                       0             0.1
## 6                       0             0.1
readr::write_tsv(peaks_anno_0.1, file.path(out, "peaks_annotated_ENCODE_cCREs.tsv"))
readr::write_tsv(cCREs_anno_0.1, file.path(out, "ENCODE_cCREs_overlap_peaks.tsv"))


# and to the granges object
# first, sanity check
all(peaks_anno_0.1$peak_name == mcols(peaks_gr)$peak_name)
## [1] TRUE
mcols(peaks_gr)$Overlap_ENCODE_cCREs <- peaks_anno_0.1$Overlap_ENCODE_cCREs

How do some of the metrics on the called peaks vary based on whether they overlap with an ENCDOE cCRE?

p1 <- mcols(peaks_gr) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = as.factor(Overlap_ENCODE_cCREs), y = score)) +
  geom_boxplot(outlier.shape = NA)  +
  coord_cartesian(ylim = c(0, 30)) + 
  ggtitle("Peak score")

p2 <- mcols(peaks_gr) %>% 
  as.data.frame() %>% 
  ggplot(aes(x = as.factor(Overlap_ENCODE_cCREs), y = Reproducibility)) +
  geom_boxplot(outlier.shape = NA)  +
  coord_cartesian(ylim = c(0, 5)) +
  ggtitle("Peak reproducibility")

p <- plot_grid(p1, p2, nrow = 1)
p

# ggsave(plot=p, filename = paste0(figout, "qc_per_overlap_status.pdf"), width = 10, height = 5)

What are the peaks that don’t overlap with any ENCODE peaks?

p <- peaks_gr[peaks_gr$Overlap_ENCODE_cCREs == 0, ] %>% 
  mcols() %>% 
  as.data.frame() %>% 
  ggplot(aes(x = "1")) +
  geom_bar(aes(fill = peakType), position = "stack") +
  scale_fill_manual(values = cmap_anno) +
  scale_y_continuous(labels = scales::comma)
p

# ggsave(plot=p, filename = paste0(figout, "peaks_no_olap_breakdown.pdf"), width = 4, height = 7)

What cell types to they come from?

peaks_df <- peaks_gr %>% 
  mcols() %>% 
  as.data.frame() %>% 
  separate(GroupReplicate, sep = "\\._\\.", into = c("Cluster", "Sample")) %>% 
  left_join(cluster_meta, by = "Cluster")

cluster_order_lex <- cluster_meta %>% arrange(organ, L1_clusterID) %>% pull(Cluster)

clust_order_desc_npeak <- table(peaks_df$Cluster) %>% as.data.frame() %>% dplyr::rename(Cluster="Var1", npeaks="Freq") %>% dplyr::arrange(desc(npeaks)) %>% pull(Cluster)
peaks_df <- peaks_df %>% mutate(Cluster = factor(Cluster, levels = clust_order_desc_npeak))
clust_organ_map <- peaks_df %>% dplyr::select(Cluster, organ) %>% unique
rownames(clust_organ_map) <- as.character(clust_organ_map$Cluster)

p1 <- peaks_df %>%
  ggplot(aes(x = Cluster)) +
  geom_bar(aes(fill = organ)) +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) +
  rotate_x() +
  ggtitle("Number of peaks in global peakset originating from each cluster") +
  ylab("# peaks") +
  hide_legend() +
  theme(axis.text.x = element_text(size = 6))

p2 <- peaks_df %>% 
  ggplot(aes(x = Cluster)) +
  geom_bar(aes(fill = organ)) +
  facet_wrap(~ Overlap_ENCODE_cCREs, nrow = 3) +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) +
  rotate_x() +
  ggtitle("Number of peaks in global peakset originating from each cluster by ENCODE overlap status") +
  ylab("# peaks") +
  theme(axis.text.x = element_text(size = 4))

a <- peaks_df %>% dplyr::group_by(Cluster) %>% dplyr::summarise(npeak=n()) 
b <- peaks_df %>% dplyr::filter(Overlap_ENCODE_cCREs==0) %>% dplyr::group_by(Cluster) %>% dplyr::summarise(npeak_0overlap=n())
ratio_df <- merge(a, b) %>% dplyr::mutate(new_ratio=npeak_0overlap/npeak)
ratio_df$Cluster <- factor(ratio_df$Cluster, levels=levels(peaks_df$Cluster))
ratio_df$organ <- clust_organ_map[as.character(ratio_df$Cluster),]$organ
avg_ratio <- nrow(peaks_df %>% dplyr::filter(Overlap_ENCODE_cCREs==0)) / nrow(peaks_df)

p3 <- ratio_df %>% 
  ggplot(aes(x = Cluster, y=new_ratio)) +
  geom_col(aes(fill = organ)) +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) +
  rotate_x() +
  geom_hline(yintercept=avg_ratio, linetype="dashed", size=1, color="gray") +
  ggtitle(paste0("Ratio of new peaks with 0 overlap with ENCODE\n(Avg Ratio=", round(avg_ratio,2), ")")) +
  ylab("ratio") +
  theme(axis.text.x = element_text(size = 6)) +
  theme(legend.position = "none")

p <- cowplot::plot_grid(p1, p2, p3, ncol = 1, rel_heights = c(0.3, 0.7, 0.3), align = "v", axis = "rl")
p

# ggsave(plot=p, filename = paste0(figout, "peak_breakdown_by_celltype.pdf"), width = 18, height = 12)

What organs do they come from?

peaks_df <- peaks_gr %>% 
  mcols() %>% 
  as.data.frame() %>% 
  separate(GroupReplicate, sep = "\\._\\.", into = c("Cluster", "Sample")) %>% 
  left_join(cluster_meta, by = "Cluster")

organ_order_desc_npeak <- table(peaks_df$organ) %>% as.data.frame() %>% dplyr::rename(organ="Var1", npeaks="Freq") %>% dplyr::arrange(desc(npeaks)) %>% pull(organ)
peaks_df <- peaks_df %>% mutate(organ = factor(organ, levels = organ_order_desc_npeak))
p1 <- peaks_df %>%
  ggplot(aes(x = organ)) +
  geom_bar(aes(fill = organ)) +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) +
  rotate_x() +
  ggtitle("Number of peaks in global peakset originating from each organ") +
  ylab("# peaks") +
  hide_legend() +
  theme(axis.text.x = element_text(size = 12))

p2 <- peaks_df %>% 
  ggplot(aes(x = organ)) +
  geom_bar(aes(fill = organ)) +
  facet_wrap(~ Overlap_ENCODE_cCREs, ncol = 1) +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) +
  rotate_x() +
  ggtitle("Number of peaks in global peakset originating from each organ by ENCODE overlap status") +
  ylab("# peaks") +
  theme(axis.text.x = element_text(size = 12)) +
  theme(legend.position = "none")

a <- peaks_df %>% dplyr::group_by(organ) %>% dplyr::summarise(npeak=n()) 
b <- peaks_df %>% dplyr::filter(Overlap_ENCODE_cCREs==0) %>% dplyr::group_by(organ) %>% dplyr::summarise(npeak_0overlap=n())
ratio_df <- merge(a, b) %>% dplyr::mutate(new_ratio=npeak_0overlap/npeak)
ratio_df$organ <- factor(ratio_df$organ, levels=levels(peaks_df$organ))
avg_ratio <- nrow(peaks_df %>% dplyr::filter(Overlap_ENCODE_cCREs==0)) / nrow(peaks_df)

p3 <- ratio_df %>% 
  ggplot(aes(x = organ, y=new_ratio)) +
  geom_col(aes(fill = organ)) +
  scale_fill_manual(values = cmap_organ) +
  scale_y_continuous(labels = scales::comma) +
  rotate_x() +
  geom_hline(yintercept=avg_ratio, linetype="dashed", size=1, color="gray") +
  ggtitle(paste0("Ratio of new peaks with 0 overlap with ENCODE\n(Avg Ratio=", round(avg_ratio,2), ")")) +
  ylab("ratio") +
  theme(axis.text.x = element_text(size = 12)) +
  theme(legend.position = "none")


p <- cowplot::plot_grid(p1, p2, p3, ncol = 1, rel_heights = c(0.3, 0.7), align = "v", axis = "rl")
p

# ggsave(plot=p, filename = paste0(figout, "peak_breakdown_by_organ.pdf"), width = 8, height = 20)

9 Session info

.libPaths()
## [1] "/oak/stanford/groups/wjg/bliu/software/R_lib"     
## [2] "/share/software/user/open/R/4.1.2/lib64/R/library"
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggthemes_4.2.4         RColorBrewer_1.1-3     ggrastr_1.0.1         
##  [4] magrittr_2.0.3         GenomicFeatures_1.46.5 AnnotationDbi_1.56.2  
##  [7] Biobase_2.54.0         rtracklayer_1.54.0     GenomicRanges_1.46.1  
## [10] GenomeInfoDb_1.30.1    IRanges_2.28.0         S4Vectors_0.32.4      
## [13] BiocGenerics_0.40.0    cowplot_1.1.1          tibble_3.2.1          
## [16] readr_2.1.4            glue_1.6.2             purrr_1.0.2           
## [19] ggplot2_3.5.0          tidyr_1.3.1            dplyr_1.1.4           
## [22] here_1.0.1            
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7                matrixStats_0.63.0         
##  [3] bit64_4.0.5                 filelock_1.0.2             
##  [5] progress_1.2.2              httr_1.4.6                 
##  [7] rprojroot_2.0.3             tools_4.1.2                
##  [9] bslib_0.4.2                 utf8_1.2.3                 
## [11] R6_2.5.1                    vipor_0.4.5                
## [13] DBI_1.1.3                   colorspace_2.1-0           
## [15] withr_3.0.0                 tidyselect_1.2.0           
## [17] prettyunits_1.1.1           bit_4.0.5                  
## [19] curl_5.0.0                  compiler_4.1.2             
## [21] cli_3.6.2                   xml2_1.3.4                 
## [23] DelayedArray_0.20.0         labeling_0.4.2             
## [25] sass_0.4.6                  scales_1.3.0               
## [27] rappdirs_0.3.3              stringr_1.5.0              
## [29] digest_0.6.31               Rsamtools_2.10.0           
## [31] rmarkdown_2.21              XVector_0.34.0             
## [33] pkgconfig_2.0.3             htmltools_0.5.5            
## [35] MatrixGenerics_1.6.0        highr_0.10                 
## [37] dbplyr_2.3.2                fastmap_1.1.1              
## [39] rlang_1.1.3                 RSQLite_2.3.1              
## [41] farver_2.1.1                jquerylib_0.1.4            
## [43] BiocIO_1.4.0                generics_0.1.3             
## [45] jsonlite_1.8.4              vroom_1.6.3                
## [47] BiocParallel_1.28.3         RCurl_1.98-1.12            
## [49] GenomeInfoDbData_1.2.7      Matrix_1.5-4               
## [51] Rcpp_1.0.10                 ggbeeswarm_0.7.2           
## [53] munsell_0.5.0               fansi_1.0.4                
## [55] lifecycle_1.0.3             stringi_1.7.12             
## [57] yaml_2.3.7                  SummarizedExperiment_1.24.0
## [59] zlibbioc_1.40.0             BiocFileCache_2.2.1        
## [61] blob_1.2.4                  parallel_4.1.2             
## [63] crayon_1.5.2                lattice_0.20-45            
## [65] Biostrings_2.62.0           hms_1.1.3                  
## [67] KEGGREST_1.34.0             knitr_1.42                 
## [69] pillar_1.9.0                rjson_0.2.21               
## [71] biomaRt_2.50.3              XML_3.99-0.14              
## [73] evaluate_0.21               png_0.1-8                  
## [75] vctrs_0.6.5                 tzdb_0.4.0                 
## [77] gtable_0.3.3                cachem_1.0.8               
## [79] xfun_0.39                   restfulr_0.0.15            
## [81] GenomicAlignments_1.30.0    beeswarm_0.4.0             
## [83] memoise_2.0.1