# **MODIFY THIS CHUNK**
project_id  <- "HDMA-public" # determines the name of the cache folder
doc_id      <- "04-enhancers/06" # determines name of the subfolder of `figures` where pngs/pdfs are saved
out         <- here::here("output/", doc_id); dir.create(out, recursive = TRUE)
figout      <- here::here("figures/", doc_id, "/")
cache       <- paste0("~/scratch/cache/", project_id, "/", doc_id, "/")

1 Set up

# libraries
library(here)
library(ArchR)
library(tidyverse)
library(ggplot2)
library(glue)
library(cowplot)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ComplexHeatmap)
library(circlize)
library(ggpubr)
library(BPCells)

# shared project scripts/functions
script_path <- here::here("code/utils/")
source(file.path(script_path, "track_helpers_BL.R"))
source(file.path(script_path, "track_helpers_SJ.R"))
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "GO_wrappers.R"))
source(file.path(script_path, "trackplots.R"))

ggplot2::theme_set(theme_minimal())
addArchRGenome("hg38")

2 Get tissue metadata

tissue_meta <- read_tsv(here::here("code/02-global_analysis/01-organs_keep.tsv"))

## Rows: 12 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): organcode, organ, iteration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

3 Get all annotations

# add organ colors
col.data.frame <- data.frame(Color=cmap_organ) %>% dplyr::mutate(organ=names(cmap_organ))

organ.code.list <- tissue_meta$organcode
all.annots <- lapply(1:length(organ.code.list), function(i){
  # read cluster id annotation
  annot <- read.csv(sprintf(here::here("output/01-preprocessing/02/shared/meta/%s_meta_cluster.txt"), organ.code.list[i]), sep="\t") %>% as.data.frame
  rownames(annot) <- annot$cb
  annot$L0_clusterID <- paste0(organ.code.list[i],"_",annot$L1_clusterID)
  annot$L3_clusterID <- paste0(organ.code.list[i],"_",annot$L3_clusterName)
  return(annot)
})
all.annots <- dplyr::bind_rows(all.annots)
rownames(all.annots) <- all.annots$L0_clusterID

all.annots <- all.annots %>%
  mutate(Cluster = paste0(organ_code, "_",L1_clusterID)) %>%
  mutate(Cluster_labelled = paste0(organ_code, "_", L1_clusterID, "_", L3_clusterName)) %>%
  left_join(col.data.frame) %>%
  tibble::column_to_rownames(var = "Cluster")

## Joining with `by = join_by(organ)`

head(all.annots)

##        organ organ_code L1_clusterID L0_clusterName L1_clusterName L2_clusterID
## AG_0 Adrenal         AG            0            epi         AG_epi      AG_epi1
## AG_1 Adrenal         AG            1            epi         AG_epi      AG_epi2
## AG_2 Adrenal         AG            2            epi         AG_epi      AG_epi3
## AG_3 Adrenal         AG            3            epi         AG_epi      AG_epi4
## AG_4 Adrenal         AG            4            epi         AG_epi      AG_epi5
## AG_5 Adrenal         AG            5            end         AG_end       AG_end
##               L2_clusterName            L3_clusterName ncell median_numi
## AG_0          adrenal cortex          adrenal cortex 1   823      9498.0
## AG_1          adrenal cortex          adrenal cortex 2   576      6827.5
## AG_2          adrenal cortex          adrenal cortex 3   511     10567.0
## AG_3          adrenal cortex          adrenal cortex 4   483      6340.0
## AG_4 sympathoadrenal lineage sympathoadrenal lineage 1   155      5262.0
## AG_5             endothelial               endothelial   115      4117.0
##      median_ngene median_pctmt median_nfrags median_tss median_frip
## AG_0         3280  0.008694896        6784.0     9.8320   0.2906086
## AG_1         3114  0.000000000        4341.5    10.3445   0.2400227
## AG_2         3661  0.008910274        5692.0     9.6830   0.2679032
## AG_3         2487  0.000000000        5519.0     9.8760   0.3020710
## AG_4         2564  0.000000000        4425.0    10.1390   0.2104465
## AG_5         2188  0.006551792        4192.0    10.1250   0.2417165
##                                                                                            note
## AG_0                 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## AG_1                                                  AGTR1+ likely zona glomerulosa; all PCW17
## AG_2                                        NR5A1+ AKR1B1+ SULT2A1+ CYP11A1/CYP11B1+; all PCW18
## AG_3                                                                        NR5A1+; mosly PCW21
## AG_4 PHOX2A/B+ ISL1+ GATA3+; likely mix of chromaffin cells and sympathoblasts from all samples
## AG_5                                                               PECAM1+ TEK+; mix of samples
##      L0_clusterID                 L3_clusterID               Cluster_labelled
## AG_0         AG_0          AG_adrenal cortex 1          AG_0_adrenal cortex 1
## AG_1         AG_1          AG_adrenal cortex 2          AG_1_adrenal cortex 2
## AG_2         AG_2          AG_adrenal cortex 3          AG_2_adrenal cortex 3
## AG_3         AG_3          AG_adrenal cortex 4          AG_3_adrenal cortex 4
## AG_4         AG_4 AG_sympathoadrenal lineage 1 AG_4_sympathoadrenal lineage 1
## AG_5         AG_5               AG_endothelial               AG_5_endothelial
##        Color
## AG_0 #876941
## AG_1 #876941
## AG_2 #876941
## AG_3 #876941
## AG_4 #876941
## AG_5 #876941

4 Plot basic ABC stats

abc_dir <- here::here("output/04-enhancers/03/")
global <- readRDS(here::here("output/04-enhancers/01/peaks_all_df_annots.rds"))

for (organ in tissue_meta$organ){
#for (organ in tissue_meta$organ[10:12]){
  message(organ)
  if (organ=="StomachEsophagus"){
    dirs <- Sys.glob(paste0(abc_dir, "/results/Stomach*"))
  } else{
  dirs <- Sys.glob(paste0(abc_dir, "/results/", organ, "*"))
  }
  abc_preds <- lapply(dirs, function(n){
    message(n)
    output <- read.csv(file.path(n, "Predictions/EnhancerPredictionsFull_threshold0.013_self_promoter.tsv"), sep = "\t")
    return(output)
  })
  
  # merge all predictions from clusters of the same organ
  abc <- do.call(rbind, abc_preds)
  
  # filter for only predictions where taret gene is expressed by promoter activity
  abc <- abc[abc$TargetGeneIsExpressed=="True",]
  dim(abc)
  table(abc$class)
  table(abc$CellType)
  p6 <- ggplot(abc, aes(x=CellType, fill=CellType)) + geom_bar() + theme_BOR() + theme(axis.text.x =element_text(angle=90, hjust=1)) + ylab("number of ABC links pass filter")
  table(abc$TargetGeneIsExpressed)


  # plots
  p1 <- gghistogram(abc$ABC.Score) + xlab("ABC Score") + ggtitle(organ)
  p2 <- gghistogram(abc[abc$class!="promoter",]$ABC.Score) + xlab("ABC Score") + ggtitle(paste0(organ, " (no promoter)"))

  p3 <- gghistogram(abc[abc$class!="promoter",]$distance) + xlab("distance in bp") + ggtitle(paste0(organ, " (no promoter)"))
  p4 <- ggplot(abc, aes(x=distance, y=ABC.Score)) + rasterise(geom_point(), dpi = 150) + xlab("distance in bp") + ggtitle(organ)

  # filter for non-promoters only
  abc[abc$class!="promoter",] %>% group_by(chr, start, end, TargetGene) %>% dplyr::summarise(nclusts=n()) %>% arrange(desc(nclusts)) -> test
  p5 <- gghistogram(test$nclusts) + ggtitle(paste0(organ, " (no promoter)")) + xlab("# of clusters where peak-gene link was predicted by ABC")

  pdf(paste0(figout, "/abc_summary_", organ, ".pdf"), width=5, height=5)
  for (p in (paste0("p", 1:6))){
    eval(as.symbol(p)) %>% print
  }
  dev.off()

  saveRDS(abc, paste0(out, "/abc_raw_", organ, ".rds"))
  
  ## overlap with global peakset
  # filter to top target gene per peak with the highest ABC score
  abc_dedup <- abc %>% dplyr::group_by(chr, start, end, TargetGene) %>% dplyr::summarise(nclusts=n(), med_abc_score=median(ABC.Score)) %>% group_by(chr,start,end) %>% arrange(desc(med_abc_score)) %>% dplyr::slice(1)

  overlap <- findOverlaps(GRanges(global), GRanges(abc_dedup))

  organcode <- tissue_meta[tissue_meta$organ==organ,]$organcode

  global[paste0("abc_is_linked_to_gene_", organcode)] <- 0
  global[overlap@from, paste0("abc_is_linked_to_gene_", organcode)] <- 1

  global[paste0("abc_linked_gene_", organcode)] <- "None"
  global[overlap@from, paste0("abc_linked_gene_", organcode)] <- abc_dedup[overlap@to, "TargetGene"]

  global[paste0("med_abc_score_", organcode)] <- 0
  global[overlap@from, paste0("med_abc_score_", organcode)] <- abc_dedup[overlap@to, "med_abc_score"]

  # the percentage of peaks with identical nearest gene and linked abc gene
  test <- global[!is.na(global$nearestGene) & global[paste0("abc_linked_gene_", organcode)]!="None",]
  message(sprintf("%s: %.2f %% of peaks have identical nearest gene and linked abc gene", organ, sum(test$nearestGene == test[paste0("abc_linked_gene_", organcode)])/dim(test)[1] * 100))
}

saveRDS(global, paste0(out, "/abc_summary_df.rds"))

5 Exploring the global ABC dataframe

global <- readRDS(paste0(out, "/abc_summary_df.rds"))
df <- global[,colnames(global)[grep("abc_linked_gene|nearestGene",colnames(global))]]

identical_row <- apply(df, 1, function(row) all(row == row[1]))

sprintf("%d global peaks are linked to the same gene that's also the nearest gene in all organ ABC analysis",length(which(identical_row)))

## [1] "44922 global peaks are linked to the same gene that's also the nearest gene in all organ ABC analysis"

# plot the breakdown of the n_organs each peak has a linked ABC gene in
global[,grep("abc_is_linked_to_gene", colnames(global))] -> test
global$norg_abc <- rowSums(test)
p1 <- ggpubr::gghistogram(global$norg_abc) + xlab("number of organs each peak is linked to a gene in") + ggtitle("global")

p2 <- ggplot(global[sample(1:dim(global)[1],size=100000),], aes_string(x="norg_abc",group="norg_abc", y="score")) + geom_boxplot(outliers = F) + xlab("#organs each peak is linked to a gene in") + ylab("macs2 peak calling score") + theme_classic()


p3 <- ggplot(global, aes(x=norg_abc, fill=peakType)) + geom_bar(position="fill") + theme_BOR()

## 
## Attaching package: 'ggthemes'

## The following object is masked from 'package:cowplot':
## 
##     theme_map

p1

p2

p3

5.1 plot some examples of the ABC links predicted in all organs

global_bp_obj <- readRDS(here::here("output/05-misc/01/global_bp_obj.rds"))

subglobal <- global[which(identical_row), ]
subglobal$med_abc_score <- subglobal[,colnames(global)[grep("med_abc_score_",colnames(global))]] %>% as.data.frame %>% rowMeans
subglobal$abc_linked_gene <- subglobal$abc_linked_gene_AG

# convert to loops 
# first get the TSS coords from ABC reference file
tss <- read.table(here::here("output/04-enhancers/03/reference/hg38/CollapsedGeneBounds.hg38.bed"), col.names = c("seqnames", "start", "end", "symbol", "", "strand", "id", "type"))
tss$tss_start <- 0
tss[tss$strand=="-", "tss_start"] <- tss[tss$strand=="-", "end"]
tss[tss$strand=="+", "tss_start"] <- tss[tss$strand=="+", "start"]

subglobal <- merge(subglobal, tss[,c("symbol", "tss_start")], by.x="abc_linked_gene", by.y="symbol", all.x=T)

loops <- subglobal[,c("seqnames", "abc_linked_gene", "peak_name", "med_abc_score")]
loops$start <- pmin((subglobal$start + subglobal$end)%/%2, subglobal$tss_start)
loops$end <- pmax((subglobal$start + subglobal$end)%/%2, subglobal$tss_start)

# plot
region <- GRanges(subglobal[1,]) + 5e4
overlap <- findOverlaps(region, GRanges(subglobal))
peaks_highlight <- ranges(GRanges(subglobal)[overlap@to])
highlights <- gr_to_highlight(peaks_highlight)

p <- trackplot_helper_v2c(
  region=region,
  clusters=global_bp_obj$cell_metadata$organ, 
  fragments=global_bp_obj$frags, 
  cell_read_counts=global_bp_obj$cell_metadata$nFrags, 
  transcripts=global_bp_obj$transcripts, 
  loops=GRanges(loops)[findOverlaps(GRanges(loops), region)@from], loop_color="med_abc_score", 
  flank=1e5,
  loop_label = "ABC predicted links",
  highlights = highlights
)
p

## distribution of the abc loop span

# for cross-organ links
p <- ggpubr::gghistogram(ranges(GRanges(loops))@width) + xlab("distance between ABC predicted enhancer-gene pairs") + ggtitle("peaks linked to the same gene in all 12 organs")
p1 <- p + scale_x_log10()
p

p1

5.2 J plot, top genes with most # of linked peaks conserved across organs

top_genes <- subglobal$abc_linked_gene %>% table %>% as.data.frame %>% dplyr::rename(gene=".", npeak="Freq") %>% arrange(desc(npeak)) %>% dplyr::mutate(gene_rank=seq_along(gene))

p1 <- ggplot(top_genes, aes(x=gene_rank, y=npeak)) + geom_point() + theme_BOR() +
    theme(legend.position = "none") + 
    ggtitle("Top genes with most # of linked peaks conserved across organs")

cutoff <- 7
top_genes$color <- "gray"
top_genes[top_genes$npeak>cutoff, "color"] <- "red"
p2 <- ggplot(top_genes, aes(x=gene_rank, y=npeak, col=color)) + geom_point() + theme_BOR() +
    geom_hline(yintercept=cutoff,linetype = 'dashed', col = 'gray') + 
    scale_color_manual(values=c("gray", "red")) + 
    ggrepel::geom_label_repel(data=subset(top_genes, npeak>cutoff), aes(label=gene), max.overlaps = 10) + 
    theme(legend.position = "none") + 
    annotate("text", x = max(top_genes$gene_rank)*0.8, y = cutoff, label = paste0("total genes passing cutoff = ", dim(top_genes[top_genes$npeak>cutoff,])[1], "\n with n linked peaks > ", cutoff), vjust = -0.5) + 
    ggtitle("Top genes with most # of linked peaks conserved across organs")

p3 <- ggplot(top_genes, aes(x=gene_rank, y=npeak, col=color)) + geom_point() + theme_BOR() +
    geom_hline(yintercept=cutoff,linetype = 'dashed', col = 'gray') + 
    scale_color_manual(values=c("gray", "red")) + 
    ggrepel::geom_label_repel(data=subset(top_genes, npeak>cutoff), aes(label=gene), max.overlaps = 100) + 
    theme(legend.position = "none") + 
    annotate("text", x = max(top_genes$gene_rank)*0.8, y = cutoff, label = paste0("total genes passing cutoff = ", dim(top_genes[top_genes$npeak>cutoff,])[1], "\n with n linked peaks > ", cutoff), vjust = -0.5) + 
    ggtitle("Top genes with most # of linked peaks conserved across organs")


p1

p2

p3

write.table(top_genes, paste0(out, "/abc_linkinallorgan_top_genes.tsv"), sep="\t", quote=F)

5.3 GO enrichment for top genes

top_genes <- read.table(paste0(out, "/abc_linkinallorgan_top_genes.tsv"))

all_tested_genes <- top_genes$gene
goi <- top_genes[top_genes$npeak>7,]$gene

upGO <- calcTopGo(all_tested_genes, interestingGenes=goi, nodeSize=5, ontology="BP")

## Running GO enrichments with 567 genes in universe of 12513...

goRes <- upGO[order(as.numeric(upGO$pvalue), decreasing=FALSE),]

# pdf(paste0(figout, "/top567_linkinallorgan_genes_go_enrichment.pdf"), height=7, width=8)
if(nrow(goRes)>1){
  print(topGObarPlot(goRes, cmap = cmaps_BOR$comet, 
                     nterms=15, border_color="black", 
                     barwidth=0.85, title="top genes with conserved ABC links", barLimits=c(0, 20)))
}

# dev.off()

# get the actual genes linked to each GO term
geneList <- factor(as.integer(all_tested_genes %in% goi))
names(geneList) <- all_tested_genes
GOdata <- suppressMessages(new(
            "topGOdata",
            ontology = "BP",
            allGenes = geneList,
            geneSel = goi,
            annot = annFUN.org, mapping = "org.Hs.eg.db", ID = "symbol",
            nodeSize = 50
            ))

goi[goi %in% unlist(genesInTerm(GOdata, "GO:0000122"))]

##  [1] "BCOR"     "CBX4"     "ARX"      "NFIC"     "DAB2IP"   "HES1"    
##  [7] "IRF2BP2"  "IRF2BPL"  "KDM2B"    "MEIS2"    "PDGFB"    "BCL11A"  
## [13] "HDAC7"    "TBX2"     "ARID5B"   "ATXN1"    "CCND1"    "CREB3L1" 
## [19] "GTF2IRD1" "KLF2"     "MXI1"     "NFIB"     "PTCH1"    "ZBTB7A"  
## [25] "ZNF608"   "ARID5A"   "BACH2"    "CTBP2"    "FOXO3"    "GFI1"    
## [31] "LRRFIP1"  "MAF"      "MNT"      "NFATC3"   "SIN3A"    "SKI"     
## [37] "BCL6"     "CITED2"   "CXXC5"    "FOXP1"    "HMGA2"    "KLF11"   
## [43] "MAFK"     "MAGED1"   "NFATC2"   "NR2F1"    "NR4A2"    "OSR1"    
## [49] "POU4F1"   "PRDM1"    "RREB1"    "RXRA"     "SREBF1"   "TAL1"    
## [55] "TBL1X"    "VEGFA"    "ZFHX3"    "ZFP90"    "AKIRIN2"  "CCNE1"   
## [61] "CEBPA"    "CUX1"     "DACH1"    "DLX1"     "DNMT3A"   "EOMES"   
## [67] "ETS2"     "ETV6"     "HIVEP1"   "ID3"      "JDP2"     "KLF4"    
## [73] "MAGEF1"   "NCOR2"    "NFATC4"   "NFIL3"    "NKX6-1"   "NRARP"   
## [79] "PAX5"     "PHF19"    "SMAD3"    "SMAD7"    "SNAI1"    "SOX2"    
## [85] "TCF7L2"   "ZBTB16"   "ZFPM1"    "ZNF846"

goi[goi %in% unlist(genesInTerm(GOdata, "GO:0045944"))]

##  [1] "NFATC1"   "NFIC"     "DAB2IP"   "HES1"     "IRF2BPL"  "MEIS2"   
##  [7] "PDGFB"    "ELF4"     "KDM6B"    "OTX1"     "CREB3L1"  "HAND2"   
## [13] "KLF2"     "NFIB"     "RGCC"     "SOX4"     "ARHGEF2"  "CTBP2"   
## [19] "DDX3X"    "FOXO3"    "KLF6"     "MAF"      "NFATC3"   "NFKBIA"  
## [25] "SIN3A"    "SKI"      "SOX17"    "SSBP3"    "TNFRSF1A" "ZNF462"  
## [31] "ARRB1"    "ATOH8"    "CITED2"   "CSRNP1"   "FOXO1"    "HMGA2"   
## [37] "IER5"     "LRP5"     "LRP5L"    "MAFB"     "MEF2D"    "NFATC2"  
## [43] "NR2F1"    "NR4A2"    "OSR1"     "POU3F2"   "POU4F1"   "RREB1"   
## [49] "RXRA"     "SMAD6"    "SREBF1"   "TAL1"     "TBL1X"    "TEAD1"   
## [55] "VEGFA"    "WNT5A"    "ZFHX3"    "AGO2"     "AKIRIN2"  "AKNA"    
## [61] "BCL11B"   "CEBPA"    "CHD7"     "DLX1"     "EOMES"    "ETS2"    
## [67] "ETV6"     "FOS"      "FOXD1"    "FZD5"     "HIVEP1"   "HLF"     
## [73] "IRF1"     "KLF4"     "LMO4"     "MAFA"     "NFATC4"   "NFKB2"   
## [79] "NKX6-1"   "PAX5"     "PAX9"     "PBX1"     "PIK3R1"   "RPS6KA4" 
## [85] "SMAD3"    "SOX2"     "TCF4"     "TCF7L2"   "WNT7A"    "ZBTB16"  
## [91] "ZFAT"     "ZFPM1"    "ZNF629"

5.4 tracks for top genes

# highlights <- tibble(
#   xmin = peaks_highlight@start,
#   xmax = peaks_highlight@start + peaks_highlight@width - 1,
#   color = rep("red", length(peaks_highlight)),
#   ymin = -Inf, ymax = Inf, alpha = .2
# )

p <- trackplot_helper_v2c(
    gene="WNT11",
    clusters=global_bp_obj$cell_metadata$organ, 
    fragments=global_bp_obj$frags, 
    cell_read_counts=global_bp_obj$cell_metadata$nFrags, 
    transcripts=global_bp_obj$transcripts, 
    expression_matrix = global_bp_obj$rna,
    loops=GRanges(loops), loop_color="med_abc_score", 
    flank=3e4,
    loop_label = "ABC predicted links"
    #highlights = highlights
  )
p
  # ggplot2::ggsave(plot=p, paste0(figout, "/plot_global_organ_abc_linkinallorgan_WNT11.pdf"), dpi=300, width=9, height=8, units="in")


# split by organ
for (organcode in unique(global_bp_obj$cell_metadata$organ)){
  submeta <- global_bp_obj$cell_metadata %>% dplyr::filter(organ==organcode)
  p <- trackplot_helper_v2c(
    gene="WNT11",
    clusters=submeta$L3_clusterID, 
    fragments=global_bp_obj$frags %>% select_cells(submeta$cb), 
    cell_read_counts=submeta$nFrags, 
    transcripts=global_bp_obj$transcripts, 
    expression_matrix = global_bp_obj$rna[,submeta$cb],
    loops=GRanges(loops), loop_color="med_abc_score", 
    flank=3e4,
    loop_label = "ABC predicted links"
    #highlights = highlights
  )
  print(p)
  # ggplot2::ggsave(plot=p, paste0(figout, "/plot_global_organ_abc_linkinallorgan_WNT11_",organcode, ".pdf"), dpi=300, width=9, height=8, units="in")
}

5.5 tracks for peaks with no linked genes

we don’t expect a lot of ATAC signal around here

peak <- GRanges(global %>% dplyr::filter(norg_abc==0))[10]
region <- peak + 5e4
peaks_highlight <- ranges(peak)

highlights <- tibble(
  xmin = peaks_highlight@start,
  xmax = peaks_highlight@start + peaks_highlight@width - 1,
  color = rep("red", length(peaks_highlight)),
  ymin = -Inf, ymax = Inf, alpha = .2
)

p <- trackplot_helper_v2c(
  region=region,
  clusters=global_bp_obj$cell_metadata$organ, 
  fragments=global_bp_obj$frags, 
  cell_read_counts=global_bp_obj$cell_metadata$nFrags, 
  transcripts=global_bp_obj$transcripts, 
  loops=GRanges(loops)[findOverlaps(GRanges(loops), region)@from], loop_color="med_abc_score", 
  flank=1e5,
  loop_label = "ABC predicted links",
  highlights = highlights
)
p

6 Per organ enrichment analysis

For each organ, find the genes with the most number of ABC links and do GO term enrichment

read global bp object

global_bp_obj <- readRDS(here::here("output/05-misc/01/global_bp_obj.rds"))

6.1 J plot, top genes with most # of linked peaks

# cutoff for defining highly regulated genes
# fill this in after inspecting the J plots
cutoff_ls <- list(
  "Adrenal" = 100,
  "Brain" = 220,
  "Eye" = 250,
  "Heart" = 200,
  "Liver" = 200,
  "Lung" = 230,
  "Muscle" = 250,
  "Skin" = 220,
  "Spleen" = 150,
  "StomachEsophagus" = 260,
  "Thymus" = 200,
  "Thyroid" = 130
)

for (organ in tissue_meta$organ){
  message(organ)
  # no deduplication, so the same peak-gene link can appear in multiple cell types
  top_genes <- global_bp_obj$loops_abc[[organ]]$TargetGene %>% table %>% as.data.frame %>% dplyr::rename(gene=".", npeak="Freq") %>% arrange(desc(npeak)) %>% dplyr::mutate(gene_rank=seq_along(gene))
  
  
  cutoff <- cutoff_ls[[organ]]
  top_genes$color <- "gray"
  top_genes[top_genes$npeak>cutoff, "color"] <- "red"
  p2 <- ggplot(top_genes, aes(x=gene_rank, y=npeak, col=color)) + geom_point() + theme_BOR() +
      geom_hline(yintercept=cutoff,linetype = 'dashed', col = 'gray') +
      scale_color_manual(values=c("gray", "red")) +
      ggrepel::geom_label_repel(data=subset(top_genes, npeak>cutoff), aes(label=gene), max.overlaps = 30) +
      theme(legend.position = "none") +
      annotate("text", x = max(top_genes$gene_rank)*0.8, y = cutoff, label = paste0("total genes passing cutoff = ", dim(top_genes[top_genes$npeak>cutoff,])[1], "\n with n linked peaks > ", cutoff), vjust = -0.5) +
      ggtitle(paste0(organ, " highly regulated genes"))

  write.table(top_genes, paste0(out, "/abc_perorgan_top_genes_", organ, ".tsv"), sep="\t", quote=F)

  # GO term enrichment
  all_tested_genes <- top_genes$gene
  goi <- top_genes[top_genes$npeak>cutoff,]$gene
  
  # GO BP
  upGO <- calcTopGo(all_tested_genes, interestingGenes=goi, nodeSize=5, ontology="BP")
  goRes <- upGO[order(as.numeric(upGO$pvalue), decreasing=FALSE),]
  
  pdf(paste0(figout, "/abc_perorgan_top_genes_", organ, ".pdf"), width=10, height=10)
  print(p2)
  if(nrow(goRes)>1){
    print(topGObarPlot(goRes, cmap = cmaps_BOR$comet,
                       nterms=15, border_color="black",
                       barwidth=0.85, title=paste0(organ, " high regulated genes GO BP enrichment")))
  }
   # GO MF
  upGO <- calcTopGo(all_tested_genes, interestingGenes=goi, nodeSize=5, ontology="MF")
  goRes <- upGO[order(as.numeric(upGO$pvalue), decreasing=FALSE),]
  if(nrow(goRes)>1){
    print(topGObarPlot(goRes, cmap = cmaps_BOR$comet,
                       nterms=15, border_color="black",
                       barwidth=0.85, title=paste0(organ, " high regulated genes GO MF enrichment")))
  }
  dev.off()


  # get the actual genes linked to each GO term
  geneList <- factor(as.integer(all_tested_genes %in% goi))
  names(geneList) <- all_tested_genes
  GOdata <- suppressMessages(new(
              "topGOdata",
              ontology = "BP",
              allGenes = geneList,
              geneSel = goi,
              annot = annFUN.org, mapping = "org.Hs.eg.db", ID = "symbol",
              nodeSize = 50
              ))
  saveRDS(GOdata, paste0(out, "/abc_perorgan_top_genes_GOBP_", organ, ".rds"))
  
  GOdata <- suppressMessages(new(
              "topGOdata",
              ontology = "MF",
              allGenes = geneList,
              geneSel = goi,
              annot = annFUN.org, mapping = "org.Hs.eg.db", ID = "symbol",
              nodeSize = 50
              ))
  saveRDS(GOdata, paste0(out, "/abc_perorgan_top_genes_GOMF_", organ, ".rds"))
  
  goi[goi %in% unlist(genesInTerm(GOdata, "GO:0000122"))] # negative regulation of transcription by RNA polymerase II
  goi[goi %in% unlist(genesInTerm(GOdata, "GO:0045944"))] # positive regulation of transcription by RNA polymerase II
  

}

6.2 top 1% of genes with most # of linked peaks

cutoff_pct <- 0.01 # top x pct

for (organ in tissue_meta$organ){
  message(organ)
  # no deduplication, so the same peak-gene link can appear in multiple cell types
  top_genes <- global_bp_obj$loops_abc[[organ]]$TargetGene %>% table %>% as.data.frame %>% dplyr::rename(gene=".", npeak="Freq") %>% arrange(desc(npeak)) %>% dplyr::mutate(gene_rank=seq_along(gene))
  
  cutoff <- quantile(top_genes$npeak, 1-cutoff_pct)[[1]]
  
  top_genes$color <- "gray"
  top_genes[top_genes$npeak>cutoff, "color"] <- "red"
  p2 <- ggplot(top_genes, aes(x=gene_rank, y=npeak, col=color)) + geom_point() + theme_BOR() +
      geom_hline(yintercept=cutoff,linetype = 'dashed', col = 'gray') +
      scale_color_manual(values=c("gray", "red")) +
      ggrepel::geom_label_repel(data=subset(top_genes, npeak>cutoff), aes(label=gene), max.overlaps = 30) +
      theme(legend.position = "none") +
      annotate("text", x = max(top_genes$gene_rank)*0.8, y = cutoff, label = paste0("total genes passing cutoff = ", dim(top_genes[top_genes$npeak>cutoff,])[1], "\n with n linked peaks > ", cutoff), vjust = -0.5) +
      ggtitle(paste0(organ, " highly regulated genes"))

  write.table(top_genes, paste0(out, "/abc_perorgan_top_genes_1pct_", organ, ".tsv"), sep="\t", quote=F)

  # GO term enrichment
  all_tested_genes <- top_genes$gene
  goi <- top_genes[top_genes$npeak>cutoff,]$gene
  
  # GO BP
  upGO <- calcTopGo(all_tested_genes, interestingGenes=goi, nodeSize=5, ontology="BP")
  goRes <- upGO[order(as.numeric(upGO$pvalue), decreasing=FALSE),]
  
  pdf(paste0(figout, "/abc_perorgan_top_genes_1pct_", organ, ".pdf"), width=10, height=10)
  print(p2)
  if(nrow(goRes)>1){
    print(topGObarPlot(goRes, cmap = cmaps_BOR$comet,
                       nterms=15, border_color="black",
                       barwidth=0.85, title=paste0(organ, " high regulated genes GO BP enrichment")))
  }
   # GO MF
  upGO <- calcTopGo(all_tested_genes, interestingGenes=goi, nodeSize=5, ontology="MF")
  goRes <- upGO[order(as.numeric(upGO$pvalue), decreasing=FALSE),]
  if(nrow(goRes)>1){
    print(topGObarPlot(goRes, cmap = cmaps_BOR$comet,
                       nterms=15, border_color="black",
                       barwidth=0.85, title=paste0(organ, " high regulated genes GO MF enrichment")))
  }
  dev.off()


  # get the actual genes linked to each GO term
  geneList <- factor(as.integer(all_tested_genes %in% goi))
  names(geneList) <- all_tested_genes
  GOdata <- suppressMessages(new(
              "topGOdata",
              ontology = "BP",
              allGenes = geneList,
              geneSel = goi,
              annot = annFUN.org, mapping = "org.Hs.eg.db", ID = "symbol",
              nodeSize = 50
              ))
  saveRDS(GOdata, paste0(out, "/abc_perorgan_top_genes_1pct_GOBP_", organ, ".rds"))
  
  GOdata <- suppressMessages(new(
              "topGOdata",
              ontology = "MF",
              allGenes = geneList,
              geneSel = goi,
              annot = annFUN.org, mapping = "org.Hs.eg.db", ID = "symbol",
              nodeSize = 50
              ))
  saveRDS(GOdata, paste0(out, "/abc_perorgan_top_genes_1pct_GOMF_", organ, ".rds"))
  
  # goi[goi %in% unlist(genesInTerm(GOdata, "GO:0000122"))] # negative regulation of transcription by RNA polymerase II
  # goi[goi %in% unlist(genesInTerm(GOdata, "GO:0045944"))] # positive regulation of transcription by RNA polymerase II
  # 

}

7 Per meta cell cluster enrichment analysis

Manually define broad cell clusters by adding a column L4_clusterName in output/02-global_analysis/02/cluster_metadata_dend_order.tsv, e.g. fibroblasts, immune, neural crest derived

loops <- do.call(c, global_bp_obj$loops_abc %>% unname)

7.1 top 1% of genes with most # of linked peaks

clust_meta <- read_tsv(here::here("output/02-global_analysis/02/cluster_metadata_dend_order.tsv"))

## Rows: 203 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (13): Cluster, organ, organ_code, L0_clusterName, L1_clusterName, L2_clu...
## dbl  (9): L1_clusterID, ncell, median_numi, median_ngene, median_pctmt, medi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

clust_meta$CellType <- paste0(clust_meta$organ, "_c", clust_meta$L1_clusterID)
clust_meta$CellType[clust_meta$organ=="StomachEsophagus"] <- paste0("Stomach", "_c", clust_meta$L1_clusterID[clust_meta$organ=="StomachEsophagus"])

clust_meta

## # A tibble: 203 × 23
##    Cluster organ   organ_code L1_clusterID L0_clusterName L1_clusterName
##    <chr>   <chr>   <chr>             <dbl> <chr>          <chr>         
##  1 AG_0    Adrenal AG                    0 epi            AG_epi        
##  2 AG_1    Adrenal AG                    1 epi            AG_epi        
##  3 AG_2    Adrenal AG                    2 epi            AG_epi        
##  4 AG_3    Adrenal AG                    3 epi            AG_epi        
##  5 AG_4    Adrenal AG                    4 epi            AG_epi        
##  6 AG_5    Adrenal AG                    5 end            AG_end        
##  7 AG_6    Adrenal AG                    6 imm            AG_imm        
##  8 AG_7    Adrenal AG                    7 str            AG_str        
##  9 AG_8    Adrenal AG                    8 epi            AG_epi        
## 10 AG_9    Adrenal AG                    9 epi            AG_epi        
## # ℹ 193 more rows
## # ℹ 17 more variables: L2_clusterID <chr>, L2_clusterName <chr>,
## #   L3_clusterName <chr>, ncell <dbl>, median_numi <dbl>, median_ngene <dbl>,
## #   median_pctmt <dbl>, median_nfrags <dbl>, median_tss <dbl>,
## #   median_frip <dbl>, note <chr>, Cluster_labelled <chr>, Color <chr>,
## #   Color_compartment <chr>, Order <dbl>, L4_clusterName <chr>, CellType <chr>

cutoff_pct <- 0.01 # top x pct

for (metaclust in unique(clust_meta$L4_clusterName)){
  message(metaclust)
  if (file.exists(paste0(out, "/abc_permetaclust_top_genes_1pct_GOMF_", metaclust, ".rds"))){
    message("skipping")
    next
  }
  # no deduplication, so the same peak-gene link can appear in multiple cell types
  clusts <- clust_meta %>% dplyr::filter(L4_clusterName==metaclust)
  
  top_genes <- loops[loops$CellType %in% clusts$CellType]$TargetGene %>% table %>% as.data.frame %>% dplyr::rename(gene=".", npeak="Freq") %>% arrange(desc(npeak)) %>% dplyr::mutate(gene_rank=seq_along(gene))
  
  cutoff <- quantile(top_genes$npeak, 1-cutoff_pct)[[1]]
  
  top_genes$color <- "gray"
  top_genes[top_genes$npeak>cutoff, "color"] <- "red"
  p2 <- ggplot(top_genes, aes(x=gene_rank, y=npeak, col=color)) + geom_point() + theme_BOR() +
      geom_hline(yintercept=cutoff,linetype = 'dashed', col = 'gray') +
      scale_color_manual(values=c("gray", "red")) +
      ggrepel::geom_label_repel(data=subset(top_genes, npeak>cutoff), aes(label=gene), max.overlaps = 30) +
      theme(legend.position = "none") +
      annotate("text", x = max(top_genes$gene_rank)*0.8, y = cutoff, label = paste0("total genes passing cutoff = ", dim(top_genes[top_genes$npeak>cutoff,])[1], "\n with n linked peaks > ", cutoff), vjust = -0.5) +
      ggtitle(paste0(metaclust, " highly regulated genes"))

  write.table(top_genes, paste0(out, "/abc_permetaclust_top_genes_1pct_", metaclust, ".tsv"), sep="\t", quote=F)

  # GO term enrichment
  all_tested_genes <- top_genes$gene
  goi <- top_genes[top_genes$npeak>cutoff,]$gene
  
  # GO BP
  upGO <- calcTopGo(all_tested_genes, interestingGenes=goi, nodeSize=5, ontology="BP")
  goRes <- upGO[order(as.numeric(upGO$pvalue), decreasing=FALSE),]
 
  pdf(paste0(figout, "/abc_permetaclust_top_genes_1pct_", metaclust, ".pdf"), width=10, height=10)
  print(p2)
  if(nrow(goRes)>1){
    print(topGObarPlot(goRes, cmap = cmaps_BOR$comet,
                       nterms=15, border_color="black",
                       barwidth=0.85, title=paste0(metaclust, " high regulated genes GO BP enrichment")))
  }
   # GO MF
  upGO <- calcTopGo(all_tested_genes, interestingGenes=goi, nodeSize=5, ontology="MF")
  goRes <- upGO[order(as.numeric(upGO$pvalue), decreasing=FALSE),]
  if(nrow(goRes)>1){
    print(topGObarPlot(goRes, cmap = cmaps_BOR$comet,
                       nterms=15, border_color="black",
                       barwidth=0.85, title=paste0(metaclust, " high regulated genes GO MF enrichment")))
  }
  dev.off()


  # get the actual genes linked to each GO term
  geneList <- factor(as.integer(all_tested_genes %in% goi))
  names(geneList) <- all_tested_genes
  GOdata <- suppressMessages(new(
              "topGOdata",
              ontology = "BP",
              allGenes = geneList,
              geneSel = goi,
              annot = annFUN.org, mapping = "org.Hs.eg.db", ID = "symbol",
              nodeSize = 50
              ))
  saveRDS(GOdata, paste0(out, "/abc_permetaclust_top_genes_1pct_GOBP_", metaclust, ".rds"))
  
  GOdata <- suppressMessages(new(
              "topGOdata",
              ontology = "MF",
              allGenes = geneList,
              geneSel = goi,
              annot = annFUN.org, mapping = "org.Hs.eg.db", ID = "symbol",
              nodeSize = 50
              ))
  saveRDS(GOdata, paste0(out, "/abc_permetaclust_top_genes_1pct_GOMF_", metaclust, ".rds"))
  

}

## adrenal cortex

## skipping

## neurons

## skipping

## endothelial

## skipping

## immune

## skipping

## stromal

## skipping

## neural crest derived

## skipping

## dorsal radial glia

## skipping

## smooth muscle

## skipping

## oligodendrocyte precursor cells

## skipping

## retinal progenitor cell

## skipping

## fibroblasts

## skipping

## pigmented epithelium

## skipping

## myogenic progenitors

## skipping

## cornea

## skipping

## trabecular meshwork

## skipping

## skeletal muscle

## skipping

## cardiomyocytes

## skipping

## epicardial

## skipping

## pericytes

## skipping

## erythroblasts

## skipping

## hepatocytes

## skipping

## cholangiocyte

## skipping

## stellate

## skipping

## airway epithelial progenitor

## skipping

## lymphatics

## skipping

## endocrine

## skipping

## ciliated

## skipping

## mesothelial

## skipping

## keratinocytes

## skipping

## interfollicular cells

## skipping

## epidermal progenitor

## skipping

## chief and neck cells

## skipping

## interstitial cells of Cajal

## skipping

## parietal cells

## skipping

## thymic epithelial cells

## skipping

## thyroid follicular

## skipping

## parathyroid

## skipping

7.2 summary of all GO MF enrichments

allGO <- list()
for (metaclust in unique(clust_meta$L4_clusterName)){
  message(metaclust)
  
  GOdata <- readRDS(paste0(out, "/abc_permetaclust_top_genes_1pct_GOMF_", metaclust, ".rds"))
  GOresult <- suppressMessages(runTest(GOdata, algorithm="weight01", stat="fisher"))
  GOtab <- GenTable(GOdata, pvalue=GOresult, topNodes=20, numChar=1000)
  GOtab$Log2FE <- log2(GOtab$Significant / GOtab$Expected)
  GOtab$pvalue <- as.numeric(GOtab$pvalue)
  GOtab$negLog10p <- -log10(GOtab$pvalue)
  GOtab$metaclust <- metaclust
  allGO[[metaclust]] <- GOtab
}

## adrenal cortex

## neurons

## endothelial

## immune

## stromal

## neural crest derived

## dorsal radial glia

## smooth muscle

## oligodendrocyte precursor cells

## retinal progenitor cell

## fibroblasts

## pigmented epithelium

## myogenic progenitors

## cornea

## trabecular meshwork

## skeletal muscle

## cardiomyocytes

## epicardial

## pericytes

## erythroblasts

## hepatocytes

## cholangiocyte

## stellate

## airway epithelial progenitor

## lymphatics

## endocrine

## ciliated

## mesothelial

## keratinocytes

## interfollicular cells

## epidermal progenitor

## chief and neck cells

## interstitial cells of Cajal

## parietal cells

## thymic epithelial cells

## thyroid follicular

## parathyroid

topx <-lapply(allGO %>% names, function(n){head(allGO[[n]], 5)})

tmp <- do.call(rbind, topx)

# clustering the GO terms 
mat_raw <- reshape2::dcast(tmp, metaclust ~ Term, value.var = "Log2FE")
mat <- mat_raw[,2:ncol(mat_raw)]
mat[is.na(mat)] <- min(mat[!is.na(mat)]) - 1
dend <- hclust(dist(mat %>% t), method = "mcquitty")
dend_clust <- hclust(dist(mat), method = "mcquitty")

tmp$Term <- factor(tmp$Term, levels=colnames(mat)[rev(dend$order)])
tmp$metaclust <- factor(tmp$metaclust, levels=mat_raw$metaclust[dend_clust$order])

# plot dotplot of enrichments
ggplot(tmp, aes(x=metaclust, y=Term, color=Log2FE, size=negLog10p)) + geom_point(stroke=0) +
  theme_classic() + theme(axis.text.x = element_text(angle=90, hjust=1)) + 
  # scale_size(range = c(0, 8)) +  # Adjust dot size range 
  scale_color_gradientn(colors = cmaps_BOR$comet, limits=c(0,5.5)) +
  labs(x = "Meta cluster", y = "GO term", size = "-log10(p)", color = "log2FE")

# ggsave(paste0(figout, "/dotplot_abc_go_term_metaclust_alltop5.pdf"), height=10, width=10)

manually select terms to display

terms <- c( 
  "heme binding", "antioxidant activity", "signaling receptor binding",
  "DNA-binding transcription factor binding","transcription corepressor activity", "DNA-binding transcription repressor activity, RNA polymerase II-specific",
  "actin filament binding", "actin binding",
  "transcription factor binding", 
  "DNA-binding transcription factor activity, RNA polymerase II-specific",
  "DNA-binding transcription activator activity, RNA polymerase II-specific","RNA polymerase II cis-regulatory region sequence-specific DNA binding")

ftmp <- tmp %>% dplyr::filter(Term %in% terms)
ftmp$Term <- factor(ftmp$Term, levels=terms)

# plot dotplot of enrichments
p1 <- ggplot(ftmp, aes(x=metaclust, y=Term, color=Log2FE, size=negLog10p)) + geom_point(stroke=0) +
  theme_classic() + theme(axis.text.x = element_text(angle=90, hjust=1)) + 
  # scale_size(range = c(0, 8)) +  # Adjust dot size range 
  scale_color_gradientn(colors = cmaps_BOR$comet, limits=c(0,5.5)) +
  labs(x = "Meta cluster", y = "GO term", size = "-log10(p)", color = "log2FE") + theme(legend.position = "bottom") 
 
summary <- clust_meta %>% dplyr::group_by(L4_clusterName) %>% dplyr::summarise(ncell=sum(ncell))
summary$L4_clusterName <- factor(summary$L4_clusterName, levels=mat_raw$metaclust[dend_clust$order])

p2 <- ggplot(summary, aes(x=L4_clusterName, y=log10(ncell))) + geom_col() + theme_classic() + theme(axis.text.x=element_blank(), axis.title.x=element_blank())

clust_meta$L4_clusterName <- factor(clust_meta$L4_clusterName, levels=mat_raw$metaclust[dend_clust$order])
p3 <- ggplot(clust_meta, aes(x=L4_clusterName, y=ncell, fill=organ)) + geom_col(stat="identity", position="fill") + theme_classic()  + scale_fill_manual(values=cmap_organ) + ylab("pct cells") + theme(axis.text.x = element_text(angle=90, hjust=1)) + theme(legend.position = "none") + theme(axis.text.x=element_blank(), axis.title.x=element_blank())

p <- plot_grid(p2, p3, p1, ncol = 1, rel_heights = c(1, 1, 10), align="v", axis="lr")
p

# ggsave(plot=p, filename=paste0(figout, "/dotplot_abc_go_term_metaclust_summary.pdf"), width=12, height=6)
# ggsave(plot=p, filename=paste0(figout, "/dotplot_abc_go_term_metaclust_summary.png"), width=12, height=6)

7.3 summary of all GO BP enrichments

allGO <- list()
for (metaclust in unique(clust_meta$L4_clusterName)){
  message(metaclust)
  
  GOdata <- readRDS(paste0(out, "/abc_permetaclust_top_genes_1pct_GOBP_", metaclust, ".rds"))
  GOresult <- suppressMessages(runTest(GOdata, algorithm="weight01", stat="fisher"))
  GOtab <- GenTable(GOdata, pvalue=GOresult, topNodes=20, numChar=1000)
  GOtab$Log2FE <- log2(GOtab$Significant / GOtab$Expected)
  GOtab$pvalue <- as.numeric(GOtab$pvalue)
  GOtab$negLog10p <- -log10(GOtab$pvalue)
  GOtab$metaclust <- metaclust
  allGO[[metaclust]] <- GOtab
}

## adrenal cortex

## neurons

## endothelial

## immune

## stromal

## neural crest derived

## dorsal radial glia

## smooth muscle

## oligodendrocyte precursor cells

## retinal progenitor cell

## fibroblasts

## pigmented epithelium

## myogenic progenitors

## cornea

## trabecular meshwork

## skeletal muscle

## cardiomyocytes

## epicardial

## pericytes

## erythroblasts

## hepatocytes

## cholangiocyte

## stellate

## airway epithelial progenitor

## lymphatics

## endocrine

## ciliated

## mesothelial

## keratinocytes

## interfollicular cells

## epidermal progenitor

## chief and neck cells

## interstitial cells of Cajal

## parietal cells

## thymic epithelial cells

## thyroid follicular

## parathyroid

topx <-lapply(allGO %>% names, function(n){head(allGO[[n]], 5)})

tmp <- do.call(rbind, topx)

# clustering the GO terms 
mat_raw <- reshape2::dcast(tmp, metaclust ~ Term, value.var = "Log2FE")
mat <- mat_raw[,2:ncol(mat_raw)]
mat[is.na(mat)] <- min(mat[!is.na(mat)]) - 1
dend <- hclust(dist(mat %>% t), method = "mcquitty")
dend_clust <- hclust(dist(mat), method = "mcquitty")

tmp$Term <- factor(tmp$Term, levels=colnames(mat)[rev(dend$order)])
tmp$metaclust <- factor(tmp$metaclust, levels=mat_raw$metaclust[dend_clust$order])

# plot dotplot of enrichments
ggplot(tmp, aes(x=metaclust, y=Term, color=Log2FE, size=negLog10p)) + geom_point(stroke=0) +
  theme_classic() + theme(axis.text.x = element_text(angle=90, hjust=1)) + 
  # scale_size(range = c(0, 8)) +  # Adjust dot size range 
  scale_color_gradientn(colors = cmaps_BOR$comet, limits=c(0,5.5)) +
  labs(x = "Meta cluster", y = "GO term", size = "-log10(p)", color = "log2FE")

# ggsave(paste0(figout, "/dotplot_abc_go_term_metaclust_alltop5.pdf"), height=10, width=10)