# **MODIFY THIS CHUNK**
project_id <- "HDMA-public" # determines the name of the cache folder
doc_id <- "04-enhancers/07" # determines name of the subfolder of `figures` where pngs/pdfs are saved
out <- here::here("output/", doc_id); dir.create(out, recursive = TRUE)
figout <- here::here("figures/", doc_id, "/")
cache <- paste0("~/scratch/cache/", project_id, "/", doc_id, "/")
script_path <- here::here("code/utils/")Analysis:
From ENCODE SCREEN resource: ENCODE V4 cCRE descriptions: https://screen.wenglab.org/about
Classification of cCREs Many uses of cCREs are based on the regulatory role associated with their biochemical signatures. Thus, we putatively defined cCREs in one of the following annotation groups based on each element’s dominant biochemical signals across all available biosamples. Analogous to GENCODE’s catalog of genes, which are defined irrespective of their varying expression levels and alternative transcripts across different cell types, we provide a general, cell type-agnostic classification of cCREs based on the max-Zs as well as its proximity to the nearest annotated TSS:
Promoter-like signatures (PLS): 1) fall within 200 bp (center to center) of an annotated GENCODE TSS or experimentally derived TSS and 2) have high chromatin accessibility (either DNase or ATAC) and H3K4me3 signals.
Enhancer-like signatures (ELS) have high chromatin accessibility and H3K27ac signals. If they are within 200 bp of a TSS they must also have low H3K4me3 signal. The subset of the enhancers within 2 kb of a TSS are denoted as TSS proximal (pELS), while the remaining subset is denoted TSS distal (dELS).
Chromatin accessibility + H3K4me3 (CA-H3K4me3) have high chromatin accessibility and H3K4me3 signals, low H3K27ac signals and do not fall within 200 bp of a TSS.
Chromatin accessibility + CTCF (CA-CTCF) have high chromatin accessibility and CTCF signals, low H3K4me3 and H3K27ac signals.
Chromatin accessibility + transcription factor (CA-TF) have high chromatin accessibility, low H3K4me3 H3K27ac, and CTCF signals, and overlap a transcription factor cluster.
Chromatin accessibility (CA) have high chromatin accessibility, and low H3K4me3, H3K27ac, and CTCF signals.
Transcription factor (TF) have low chromatin accessibility, H3K4me3, H3K27ac, and CTCF signals and overlap a transcription factor cluster.
# libraries
library(here)
library(dplyr)
library(tidyr)
library(ggplot2)
library(purrr)
library(glue)
library(readr)
library(tibble)
library(cowplot)
library(GenomicRanges)
library(rtracklayer)
# shared project scripts/functions
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))
ggplot2::theme_set(theme_BOR())cluster_meta <- read_tsv(here::here("output/02-global_analysis/02/cluster_metadata_dend_order.tsv"))## Rows: 203 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): Cluster, organ, organ_code, L0_clusterName, L1_clusterName, L2_clu...
## dbl (9): L1_clusterID, ncell, median_numi, median_ngene, median_pctmt, medi...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# load peaks and convert to granges
peaks <- readRDS(here::here("output/04-enhancers/01/peaks_all_df_annots.rds"))
peaks_gr <- makeGRangesFromDataFrame(peaks, ignore.strand = TRUE, keep.extra.columns = TRUE)
rtracklayer::export.bed(peaks_gr, con = file.path(out, "atlas_peaks_all.bed"), format = "BED")
# how many peaks?
length(peaks_gr)## [1] 1032273
# all same length
all(width(peaks_gr) == 501)## [1] TRUE
cmap_anno <- cmap_peaktypeNote: GRanges is 1-based while BED is 0-based:
$ head GRCh38-cCREs.bed
chr1 104896 105048 EH38D4327509 EH38E2776520 CTCF-only,CTCF-bound
chr1 138866 139134 EH38D4327520 EH38E2776521 pELS,CTCF-bound
chr1 181289 181639 EH38D4327525 EH38E2776524 DNase-H3K4me3,CTCF-bound
chr1 267925 268171 EH38D4327544 EH38E2776528 CTCF-only,CTCF-bound
chr1 586036 586264 EH38D4327554 EH38E2776532 CTCF-only,CTCF-bound
chr1 605330 605668 EH38D4327560 EH38E2776534 dELS
chr1 727121 727350 EH38D4327570 EH38E2776536 dELS
chr1 778570 778919 EH38D4327580 EH38E2776539 PLS,CTCF-bound
chr1 779026 779180 EH38D4327581 EH38E2776540 PLS,CTCF-bound
chr1 779222 779432 EH38D4327582 EH38E2776541 pELS,CTCF-bound
encode_cCREs_paths <- list.files(path = here::here("output/04-enhancers/07/ENCODE"), full.names = TRUE)
encode_cCREs <- map(encode_cCREs_paths,
~ rtracklayer::import(.x, format = "BED",
extraCols = c("Accession1" = "character", "Accession2" = "character", "Annotation" = "character")))
names(encode_cCREs) <- gsub("GRCh38-", "", gsub(".bed", "", basename(encode_cCREs_paths)))
map(encode_cCREs, length)## $cCREs
## [1] 2348854
##
## $`cCREs.CA-CTCF`
## [1] 126034
##
## $`cCREs.CA-H3K4me3`
## [1] 79246
##
## $`cCREs.CA-TF`
## [1] 26102
##
## $cCREs.CA
## [1] 245985
##
## $`cCREs.CTCF-bound`
## [1] 948642
##
## $cCREs.dELS
## [1] 1469205
##
## $cCREs.pELS
## [1] 249464
##
## $cCREs.PLS
## [1] 47532
##
## $cCREs.TF
## [1] 105286
map(encode_cCREs, ~ .x$Annotation %>% unique())## $cCREs
## [1] "pELS" "CA-CTCF" "CA-TF" "CA" "dELS"
## [6] "TF" "CA-H3K4me3" "PLS"
##
## $`cCREs.CA-CTCF`
## [1] "CA-CTCF"
##
## $`cCREs.CA-H3K4me3`
## [1] "CA-H3K4me3"
##
## $`cCREs.CA-TF`
## [1] "CA-TF"
##
## $cCREs.CA
## [1] "CA"
##
## $`cCREs.CTCF-bound`
## [1] "pELS,CTCF-bound" "CA-CTCF,CTCF-bound" "TF,CTCF-bound"
## [4] "CA-H3K4me3,CTCF-bound" "dELS,CTCF-bound" "PLS,CTCF-bound"
##
## $cCREs.dELS
## [1] "dELS"
##
## $cCREs.pELS
## [1] "pELS"
##
## $cCREs.PLS
## [1] "PLS"
##
## $cCREs.TF
## [1] "TF"
head(encode_cCREs$cCREs)## GRanges object with 6 ranges and 3 metadata columns:
## seqnames ranges strand | Accession1 Accession2 Annotation
## <Rle> <IRanges> <Rle> | <character> <character> <character>
## [1] chr1 10034-10250 * | EH38D4327497 EH38E2776516 pELS
## [2] chr1 10386-10713 * | EH38D4327498 EH38E2776517 pELS
## [3] chr1 16098-16381 * | EH38D6144701 EH38E3951272 CA-CTCF
## [4] chr1 17344-17642 * | EH38D6144702 EH38E3951273 CA-TF
## [5] chr1 29321-29517 * | EH38D6144703 EH38E3951274 CA
## [6] chr1 66351-66509 * | EH38D4327503 EH38E3951275 CA
## -------
## seqinfo: 24 sequences from an unspecified genome; no seqlengths
How many in each set:
p <- data.frame(Set = names(encode_cCREs),
N_elements = map_dbl(encode_cCREs, length)) %>%
mutate(Set = gsub("cCREs.", "", Set)) %>%
ggplot(aes(x = Set, y = N_elements)) +
geom_col() +
scale_y_continuous(label = scales::comma, limits = c(0, 2600000)) +
ggtitle("# of ENCODE V4 cCREs") +
geom_text(aes(label = N_elements), vjust = -1) +
theme(axis.text.x = element_text(angle=90, hjust=1))
p# ggsave(plot=p, filename=paste0(figout, "encode_cCREs_breakdown.pdf"), width=7, height=5) # if not knittingHere, we calculate overlap between atlas peaks and ENCODE cCREs.
#' Update peaks_gr metadata columns based on overlaps with other region lists
#'
#' @param query GRanges, the query ranges in which to look for overlaps with \code{subject}.
#' @param subject GRanges, the subject ranges.
#' @param key character, used to name the new column in query
#' @param min_subject_overalp numeric, percentage of query region required for considering
#' two ranges overlapping.
get_overlaps <- function(query, subject, key, min_subject_overlap) {
# get overlap in basepairs, given that query peaks are 501bp
min_overlap_bp <- min_subject_overlap * 501
# find overlaps between peaks and ENCODE cCREs
olap_idx <- GenomicRanges::findOverlaps(query, subject = subject, minoverlap = min_overlap_bp)
peaks_idx <- queryHits(olap_idx)
annos_per_peaks_idx <- subject$Annotation[subjectHits(olap_idx)]
print(length(olap_idx))
# count the number of times each peak index appears (i.e., the number of overlaps per peak)
overlap_counts <- table(peaks_idx) %>% enframe(name = "peak_idx", value = "n_overlaps")
# create a dataframe to hold the overlap counts for each query, initialize with "0" (no overlaps)
anno_df <- data.frame(
peak_idx = as.character(seq_along(query)),
Olaps = rep(0, length(query))) %>%
left_join(overlap_counts, by = "peak_idx") %>%
mutate(Olaps = ifelse(!is.na(n_overlaps), n_overlaps, Olaps)) %>%
dplyr::select(peak_idx, Olaps)
if (!all(anno_df$peak_idx == as.character(seq_along(query)))) {
stop("Peak indices are not equal.")
}
anno_df <- anno_df %>% dplyr::select(Olaps)
anno_df$Olaps <- as.numeric(anno_df$Olaps)
colnames(anno_df) <- paste0("Overlap_", key)
return(anno_df)
}
peaks_anno_0.1 <- imap_dfc(encode_cCREs, ~ get_overlaps(query = peaks_gr, subject = .x, key = .y, min_subject_overlap = 0.1)) %>% mutate(Percent_overlap = 0.1)## [1] 1180068
## [1] 32339
## [1] 21708
## [1] 8404
## [1] 50181
## [1] 551933
## [1] 821960
## [1] 180156
## [1] 40701
## [1] 24619
peaks_anno_0.5 <- imap_dfc(encode_cCREs, ~ get_overlaps(query = peaks_gr, subject = .x, key = .y, min_subject_overlap = 0.5)) %>% mutate(Percent_overlap = 0.5)## [1] 526082
## [1] 13553
## [1] 9495
## [1] 5057
## [1] 26672
## [1] 219278
## [1] 378862
## [1] 62041
## [1] 23142
## [1] 7260
head(peaks_anno_0.1)## Overlap_cCREs Overlap_cCREs.CA-CTCF Overlap_cCREs.CA-H3K4me3
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 2 0 0
## 5 1 0 0
## 6 1 0 0
## Overlap_cCREs.CA-TF Overlap_cCREs.CA Overlap_cCREs.CTCF-bound
## 1 0 0 1
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 1
## Overlap_cCREs.dELS Overlap_cCREs.pELS Overlap_cCREs.PLS Overlap_cCREs.TF
## 1 1 0 0 0
## 2 0 0 1 0
## 3 0 1 0 0
## 4 0 2 0 0
## 5 1 0 0 0
## 6 1 0 0 0
## Percent_overlap
## 1 0.1
## 2 0.1
## 3 0.1
## 4 0.1
## 5 0.1
## 6 0.1
nrow(peaks_anno_0.5)## [1] 1032273
Also compute the reverse, i.e. which of the ENCODE elements overlap with atlas elements?
cCREs_anno_0.1 <- imap_dfr(encode_cCREs, ~ get_overlaps(query = .x, subject = peaks_gr, key = .y, min_subject_overlap = 0.1) %>% set_colnames("Overlap") %>% mutate(ENCODE_set = .y)) %>% mutate(Percent_overlap = 0.1)## [1] 1180068
## [1] 32339
## [1] 21708
## [1] 8404
## [1] 50181
## [1] 551933
## [1] 821960
## [1] 180156
## [1] 40701
## [1] 24619
cCREs_anno_0.5 <- imap_dfr(encode_cCREs, ~ get_overlaps(query = .x, subject = peaks_gr, key = .y, min_subject_overlap = 0.5) %>% set_colnames("Overlap") %>% mutate(ENCODE_set = .y)) %>% mutate(Percent_overlap = 0.5)## [1] 526082
## [1] 13553
## [1] 9495
## [1] 5057
## [1] 26672
## [1] 219278
## [1] 378862
## [1] 62041
## [1] 23142
## [1] 7260
peaks_anno_long <- bind_rows(data.frame(peaks_anno_0.1),
data.frame(peaks_anno_0.5)) %>%
dplyr::select(Percent_overlap, matches("Overlap_")) %>%
# convert to long/tidy format
gather(ENCODE_set, Status, 2:ncol(.)) %>%
# convert to binary
mutate(Status = case_when(
Status == 0 ~ "None",
Status == 1 ~ "1",
Status > 1 ~ ">1"))
# make a palette that encodes both the cCRE type and the status in terms of # overlaps
palette_encode_status <- base::expand.grid(unique(peaks_anno_long$ENCODE_set), unique(peaks_anno_long$Status)) %>%
mutate(Var1 = gsub("\\.", "-", gsub("cCREs.", "", gsub("Overlap_", "", Var1)))) %>%
rowwise() %>%
mutate(Color = case_when(
Var2 == "None" ~ "gray90",
Var2 == "1" ~ "gray50",
Var2 == ">1" ~ 'black')) %>%
mutate(Key = paste0(Var1, "_", Var2)) %>%
dplyr::select(Key, Color) %>%
tibble::deframe()
palette_encode_status## cCREs_1 CA-CTCF_1 CA-H3K4me3_1 CA-TF_1 CA_1
## "gray50" "gray50" "gray50" "gray50" "gray50"
## CTCF-bound_1 dELS_1 pELS_1 PLS_1 TF_1
## "gray50" "gray50" "gray50" "gray50" "gray50"
## cCREs_>1 CA-CTCF_>1 CA-H3K4me3_>1 CA-TF_>1 CA_>1
## "black" "black" "black" "black" "black"
## CTCF-bound_>1 dELS_>1 pELS_>1 PLS_>1 TF_>1
## "black" "black" "black" "black" "black"
## cCREs_None CA-CTCF_None CA-H3K4me3_None CA-TF_None CA_None
## "gray90" "gray90" "gray90" "gray90" "gray90"
## CTCF-bound_None dELS_None pELS_None PLS_None TF_None
## "gray90" "gray90" "gray90" "gray90" "gray90"
p1 <- peaks_anno_long %>%
mutate(ENCODE_set = gsub("\\.", "-", gsub("cCREs.", "", gsub("Overlap_", "", ENCODE_set))),
ENCODE_status = paste0(ENCODE_set, "_", Status)) %>%
ggplot(aes(x = ENCODE_set)) +
geom_bar(aes(fill = ENCODE_status)) +
scale_fill_manual(values = palette_encode_status) +
scale_y_continuous(labels = scales::comma) +
facet_wrap(~ Percent_overlap) +
xlab("ENCODE V4 cCRE set") + ylab("# of atlas peaks") +
ggtitle("Atlas peaks which overlap with \nENCODE candidate cis-regulatory elements") +
theme(strip.background = element_blank()) +
rotate_x()
p2 <- bind_rows(cCREs_anno_0.1, cCREs_anno_0.5) %>%
mutate(Status = case_when( # convert to binary
Overlap == 0 ~ "None",
Overlap == 1 ~ "1",
Overlap > 1 ~ ">1")) %>%
mutate(ENCODE_set = gsub("\\.", "-", gsub("cCREs.", "", gsub("Overlap_", "", ENCODE_set))),
ENCODE_status = paste0(ENCODE_set, "_", Status)) %>%
ggplot(aes(x = ENCODE_set)) +
geom_bar(aes(fill = ENCODE_status)) +
scale_fill_manual(values = palette_encode_status) +
scale_y_continuous(labels = scales::comma) +
facet_wrap(~ Percent_overlap) +
xlab("ENCODE V4 cCRE set") + ylab("# of ENCODE cCREs") +
ggtitle("ENCODE candidate cis-regulatory \nelements which overlap with atlas peaks") +
theme(strip.background = element_blank()) +
rotate_x()
p <- plot_grid(p1, p2, nrow = 2, align = "v", axis = "rl")
p# ggsave(plot=p, filename = paste0(figout, "barplot_peaks_encode_overlap.pdf"), width = 10, height = 9)peaks_anno_0.1 <- peaks_anno_0.1 %>%
tibble::add_column(peak_name = mcols(peaks_gr)$peak_name, .before = 1)
colnames(peaks_anno_0.1) <- gsub("Overlap_", "Overlap_ENCODE_", colnames(peaks_anno_0.1))
head(peaks_anno_0.1)## peak_name Overlap_ENCODE_cCREs Overlap_ENCODE_cCREs.CA-CTCF
## 1 chr1_794823_795323 1 0
## 2 chr1_817072_817572 1 0
## 3 chr1_817868_818368 1 0
## 4 chr1_818822_819322 2 0
## 5 chr1_819827_820327 1 0
## 6 chr1_821644_822144 1 0
## Overlap_ENCODE_cCREs.CA-H3K4me3 Overlap_ENCODE_cCREs.CA-TF
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Overlap_ENCODE_cCREs.CA Overlap_ENCODE_cCREs.CTCF-bound
## 1 0 1
## 2 0 0
## 3 0 1
## 4 0 0
## 5 0 0
## 6 0 1
## Overlap_ENCODE_cCREs.dELS Overlap_ENCODE_cCREs.pELS Overlap_ENCODE_cCREs.PLS
## 1 1 0 0
## 2 0 0 1
## 3 0 1 0
## 4 0 2 0
## 5 1 0 0
## 6 1 0 0
## Overlap_ENCODE_cCREs.TF Percent_overlap
## 1 0 0.1
## 2 0 0.1
## 3 0 0.1
## 4 0 0.1
## 5 0 0.1
## 6 0 0.1
readr::write_tsv(peaks_anno_0.1, file.path(out, "peaks_annotated_ENCODE_cCREs.tsv"))
readr::write_tsv(cCREs_anno_0.1, file.path(out, "ENCODE_cCREs_overlap_peaks.tsv"))
# and to the granges object
# first, sanity check
all(peaks_anno_0.1$peak_name == mcols(peaks_gr)$peak_name)## [1] TRUE
mcols(peaks_gr)$Overlap_ENCODE_cCREs <- peaks_anno_0.1$Overlap_ENCODE_cCREsHow do some of the metrics on the called peaks vary based on whether they overlap with an ENCDOE cCRE?
p1 <- mcols(peaks_gr) %>%
as.data.frame() %>%
ggplot(aes(x = as.factor(Overlap_ENCODE_cCREs), y = score)) +
geom_boxplot(outlier.shape = NA) +
coord_cartesian(ylim = c(0, 30)) +
ggtitle("Peak score")
p2 <- mcols(peaks_gr) %>%
as.data.frame() %>%
ggplot(aes(x = as.factor(Overlap_ENCODE_cCREs), y = Reproducibility)) +
geom_boxplot(outlier.shape = NA) +
coord_cartesian(ylim = c(0, 5)) +
ggtitle("Peak reproducibility")
p <- plot_grid(p1, p2, nrow = 1)
p# ggsave(plot=p, filename = paste0(figout, "qc_per_overlap_status.pdf"), width = 10, height = 5)What are the peaks that don’t overlap with any ENCODE peaks?
p <- peaks_gr[peaks_gr$Overlap_ENCODE_cCREs == 0, ] %>%
mcols() %>%
as.data.frame() %>%
ggplot(aes(x = "1")) +
geom_bar(aes(fill = peakType), position = "stack") +
scale_fill_manual(values = cmap_anno) +
scale_y_continuous(labels = scales::comma)
p# ggsave(plot=p, filename = paste0(figout, "peaks_no_olap_breakdown.pdf"), width = 4, height = 7)What cell types to they come from?
peaks_df <- peaks_gr %>%
mcols() %>%
as.data.frame() %>%
separate(GroupReplicate, sep = "\\._\\.", into = c("Cluster", "Sample")) %>%
left_join(cluster_meta, by = "Cluster")
cluster_order_lex <- cluster_meta %>% arrange(organ, L1_clusterID) %>% pull(Cluster)
clust_order_desc_npeak <- table(peaks_df$Cluster) %>% as.data.frame() %>% dplyr::rename(Cluster="Var1", npeaks="Freq") %>% dplyr::arrange(desc(npeaks)) %>% pull(Cluster)
peaks_df <- peaks_df %>% mutate(Cluster = factor(Cluster, levels = clust_order_desc_npeak))
clust_organ_map <- peaks_df %>% dplyr::select(Cluster, organ) %>% unique
rownames(clust_organ_map) <- as.character(clust_organ_map$Cluster)
p1 <- peaks_df %>%
ggplot(aes(x = Cluster)) +
geom_bar(aes(fill = organ)) +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
rotate_x() +
ggtitle("Number of peaks in global peakset originating from each cluster") +
ylab("# peaks") +
hide_legend() +
theme(axis.text.x = element_text(size = 6))
p2 <- peaks_df %>%
ggplot(aes(x = Cluster)) +
geom_bar(aes(fill = organ)) +
facet_wrap(~ Overlap_ENCODE_cCREs, nrow = 3) +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
rotate_x() +
ggtitle("Number of peaks in global peakset originating from each cluster by ENCODE overlap status") +
ylab("# peaks") +
theme(axis.text.x = element_text(size = 4))
a <- peaks_df %>% dplyr::group_by(Cluster) %>% dplyr::summarise(npeak=n())
b <- peaks_df %>% dplyr::filter(Overlap_ENCODE_cCREs==0) %>% dplyr::group_by(Cluster) %>% dplyr::summarise(npeak_0overlap=n())
ratio_df <- merge(a, b) %>% dplyr::mutate(new_ratio=npeak_0overlap/npeak)
ratio_df$Cluster <- factor(ratio_df$Cluster, levels=levels(peaks_df$Cluster))
ratio_df$organ <- clust_organ_map[as.character(ratio_df$Cluster),]$organ
avg_ratio <- nrow(peaks_df %>% dplyr::filter(Overlap_ENCODE_cCREs==0)) / nrow(peaks_df)
p3 <- ratio_df %>%
ggplot(aes(x = Cluster, y=new_ratio)) +
geom_col(aes(fill = organ)) +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
rotate_x() +
geom_hline(yintercept=avg_ratio, linetype="dashed", size=1, color="gray") +
ggtitle(paste0("Ratio of new peaks with 0 overlap with ENCODE\n(Avg Ratio=", round(avg_ratio,2), ")")) +
ylab("ratio") +
theme(axis.text.x = element_text(size = 6)) +
theme(legend.position = "none")
p <- cowplot::plot_grid(p1, p2, p3, ncol = 1, rel_heights = c(0.3, 0.7, 0.3), align = "v", axis = "rl")
p# ggsave(plot=p, filename = paste0(figout, "peak_breakdown_by_celltype.pdf"), width = 18, height = 12)What organs do they come from?
peaks_df <- peaks_gr %>%
mcols() %>%
as.data.frame() %>%
separate(GroupReplicate, sep = "\\._\\.", into = c("Cluster", "Sample")) %>%
left_join(cluster_meta, by = "Cluster")
organ_order_desc_npeak <- table(peaks_df$organ) %>% as.data.frame() %>% dplyr::rename(organ="Var1", npeaks="Freq") %>% dplyr::arrange(desc(npeaks)) %>% pull(organ)
peaks_df <- peaks_df %>% mutate(organ = factor(organ, levels = organ_order_desc_npeak))p1 <- peaks_df %>%
ggplot(aes(x = organ)) +
geom_bar(aes(fill = organ)) +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
rotate_x() +
ggtitle("Number of peaks in global peakset originating from each organ") +
ylab("# peaks") +
hide_legend() +
theme(axis.text.x = element_text(size = 12))
p2 <- peaks_df %>%
ggplot(aes(x = organ)) +
geom_bar(aes(fill = organ)) +
facet_wrap(~ Overlap_ENCODE_cCREs, ncol = 1) +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
rotate_x() +
ggtitle("Number of peaks in global peakset originating from each organ by ENCODE overlap status") +
ylab("# peaks") +
theme(axis.text.x = element_text(size = 12)) +
theme(legend.position = "none")
a <- peaks_df %>% dplyr::group_by(organ) %>% dplyr::summarise(npeak=n())
b <- peaks_df %>% dplyr::filter(Overlap_ENCODE_cCREs==0) %>% dplyr::group_by(organ) %>% dplyr::summarise(npeak_0overlap=n())
ratio_df <- merge(a, b) %>% dplyr::mutate(new_ratio=npeak_0overlap/npeak)
ratio_df$organ <- factor(ratio_df$organ, levels=levels(peaks_df$organ))
avg_ratio <- nrow(peaks_df %>% dplyr::filter(Overlap_ENCODE_cCREs==0)) / nrow(peaks_df)
p3 <- ratio_df %>%
ggplot(aes(x = organ, y=new_ratio)) +
geom_col(aes(fill = organ)) +
scale_fill_manual(values = cmap_organ) +
scale_y_continuous(labels = scales::comma) +
rotate_x() +
geom_hline(yintercept=avg_ratio, linetype="dashed", size=1, color="gray") +
ggtitle(paste0("Ratio of new peaks with 0 overlap with ENCODE\n(Avg Ratio=", round(avg_ratio,2), ")")) +
ylab("ratio") +
theme(axis.text.x = element_text(size = 12)) +
theme(legend.position = "none")
p <- cowplot::plot_grid(p1, p2, p3, ncol = 1, rel_heights = c(0.3, 0.7), align = "v", axis = "rl")
p# ggsave(plot=p, filename = paste0(figout, "peak_breakdown_by_organ.pdf"), width = 8, height = 20).libPaths()## [1] "/oak/stanford/groups/wjg/bliu/software/R_lib"
## [2] "/share/software/user/open/R/4.1.2/lib64/R/library"
sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggthemes_4.2.4 RColorBrewer_1.1-3 ggrastr_1.0.1
## [4] magrittr_2.0.3 GenomicFeatures_1.46.5 AnnotationDbi_1.56.2
## [7] Biobase_2.54.0 rtracklayer_1.54.0 GenomicRanges_1.46.1
## [10] GenomeInfoDb_1.30.1 IRanges_2.28.0 S4Vectors_0.32.4
## [13] BiocGenerics_0.40.0 cowplot_1.1.1 tibble_3.2.1
## [16] readr_2.1.4 glue_1.6.2 purrr_1.0.2
## [19] ggplot2_3.5.0 tidyr_1.3.1 dplyr_1.1.4
## [22] here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 matrixStats_0.63.0
## [3] bit64_4.0.5 filelock_1.0.2
## [5] progress_1.2.2 httr_1.4.6
## [7] rprojroot_2.0.3 tools_4.1.2
## [9] bslib_0.4.2 utf8_1.2.3
## [11] R6_2.5.1 vipor_0.4.5
## [13] DBI_1.1.3 colorspace_2.1-0
## [15] withr_3.0.0 tidyselect_1.2.0
## [17] prettyunits_1.1.1 bit_4.0.5
## [19] curl_5.0.0 compiler_4.1.2
## [21] cli_3.6.2 xml2_1.3.4
## [23] DelayedArray_0.20.0 labeling_0.4.2
## [25] sass_0.4.6 scales_1.3.0
## [27] rappdirs_0.3.3 stringr_1.5.0
## [29] digest_0.6.31 Rsamtools_2.10.0
## [31] rmarkdown_2.21 XVector_0.34.0
## [33] pkgconfig_2.0.3 htmltools_0.5.5
## [35] MatrixGenerics_1.6.0 highr_0.10
## [37] dbplyr_2.3.2 fastmap_1.1.1
## [39] rlang_1.1.3 RSQLite_2.3.1
## [41] farver_2.1.1 jquerylib_0.1.4
## [43] BiocIO_1.4.0 generics_0.1.3
## [45] jsonlite_1.8.4 vroom_1.6.3
## [47] BiocParallel_1.28.3 RCurl_1.98-1.12
## [49] GenomeInfoDbData_1.2.7 Matrix_1.5-4
## [51] Rcpp_1.0.10 ggbeeswarm_0.7.2
## [53] munsell_0.5.0 fansi_1.0.4
## [55] lifecycle_1.0.3 stringi_1.7.12
## [57] yaml_2.3.7 SummarizedExperiment_1.24.0
## [59] zlibbioc_1.40.0 BiocFileCache_2.2.1
## [61] blob_1.2.4 parallel_4.1.2
## [63] crayon_1.5.2 lattice_0.20-45
## [65] Biostrings_2.62.0 hms_1.1.3
## [67] KEGGREST_1.34.0 knitr_1.42
## [69] pillar_1.9.0 rjson_0.2.21
## [71] biomaRt_2.50.3 XML_3.99-0.14
## [73] evaluate_0.21 png_0.1-8
## [75] vctrs_0.6.5 tzdb_0.4.0
## [77] gtable_0.3.3 cachem_1.0.8
## [79] xfun_0.39 restfulr_0.0.15
## [81] GenomicAlignments_1.30.0 beeswarm_0.4.0
## [83] memoise_2.0.1