# **MODIFY THIS CHUNK**
project_id <- "HDMA-public" # determines the name of the cache folder
doc_id <- "04-enhancers/09" # determines name of the subfolder of `figures` where pngs/pdfs are saved
out <- here::here("output/", doc_id); dir.create(out, recursive = TRUE)
figout <- here::here("figures/", doc_id, "/")
cache <- paste0("~/scratch/cache/", project_id, "/", doc_id, "/")
script_path <- here::here("code/utils/")We received the VISTA-validated enhancer set from collaborators. Here we explore the data to guide our handling of the dataset for training/testing validation.
# libraries
library(here)
library(dplyr)
library(tidyr)
library(ggplot2)
library(purrr)
library(glue)
library(readr)
library(tibble)
library(cowplot)
library(GenomicRanges)
library(rtracklayer)
library(ArchR)
library(ComplexHeatmap)
library(BSgenome.Hsapiens.UCSC.hg38)
library(Seurat)
library(BPCells)
library(ggseqlogo)
# shared project scripts/functions
source(file.path(script_path, "plotting_config.R"))
source(file.path(script_path, "hdma_palettes.R"))
source(file.path(script_path, "sj_scRNAseq_helpers.R"))
source(file.path(script_path, "trackplots.R"))
source(file.path(script_path, "track_helpers_BL.R"))
source(file.path(script_path, "track_helpers_SJ.R"))
ggplot2::theme_set(theme_BOR())vista <- read_tsv(here::here("data/external/2024-01-24_vista_@mkosicki/2024-01-24_vista.clean.tsv"))## Rows: 3389 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): vista_id, backbone, stage, tissue, tissue_positive, assembly, coor...
## dbl (1): denominator
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#tibble::glimpse(vista)
unique(vista$assembly)## [1] "hg38" "mm10"
sum(is.na(vista$vista_id))## [1] 0
length(unique(vista$vista_id))## [1] 3272
length(unique(vista$coord))## [1] 3272
These also contain enhancers whose vista_id start with “mm” and use a “mm10” assembly, but mapped to an hg38 ortholog
vista <- vista[!is.na(vista$coordinate_hg38),]
vista## # A tibble: 3,266 × 13
## vista_id backbone stage denominator tissue tissue_positive assembly coord
## <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 hs3 hZR e11.5 4 neg <NA> hg38 chr16:…
## 2 hs4 hZR e11.5 10 hb;mb;nt 10;10;6 hg38 chr16:…
## 3 hs9 hZR e11.5 16 neg <NA> hg38 chr16:…
## 4 hs10 hZR e11.5 7 neg <NA> hg38 chr16:…
## 5 hs13 hZR e11.5 16 neg <NA> hg38 chr16:…
## 6 hs16 hZR e11.5 7 cn;other 6;6 hg38 chr16:…
## 7 hs17 hZR e11.5 7 neg <NA> hg38 chr16:…
## 8 hs20 hZR e11.5 18 hb;mb 4;8 hg38 chr16:…
## 9 hs62 hZR e11.5 8 neg <NA> hg38 chr16:…
## 10 hs63 hZR e11.5 20 neg <NA> hg38 chr16:…
## # ℹ 3,256 more rows
## # ℹ 5 more variables: strand <chr>, coordinate_hg38 <chr>,
## # coordinate_mm10 <chr>, seq <chr>, external_note <chr>
med_width <- median(width(GRanges(vista$coordinate_hg38)))
ggpubr::gghistogram(width(GRanges(vista$coordinate_hg38))) + xlab("vista enhancer width in bp") + geom_vline(xintercept=med_width, linetype="dashed", color="red") +
annotate("text", x=med_width, hjust=-0.1, y=800, label=paste0("median: ", med_width, "bp"), color="red")also tried increasing minoverlap to 200bp for findOverlaps and still get around 85% of vista enhancers in global peakset
global_peaks <- readRDS(here::here("output/04-enhancers/01/peaks_all_df.rds"))
overlap <- findOverlaps(GRanges(global_peaks), GRanges(vista$coordinate_hg38) %>% unique)
sprintf("From the vista enhancer dataset (hg38 only), %d of %d (%.2f%%) enhancers overlap with the hdma global peakset", overlap@to %>% unique %>% length, vista$vista_id %>% unique %>% length, overlap@to %>% unique %>% length/(vista$vista_id %>% unique %>% length) *100)## [1] "From the vista enhancer dataset (hg38 only), 2731 of 3149 (86.73%) enhancers overlap with the hdma global peakset"
sprintf("From the hdma global peakset, %d of %d (%.2f%%) peaks overlap with vista enhancers", overlap@from %>% unique %>% length, dim(global_peaks)[1], overlap@from %>% unique %>% length/dim(global_peaks)[1]*100)## [1] "From the hdma global peakset, 7664 of 1032273 (0.74%) peaks overlap with vista enhancers"
plotdf <- data.frame(vista_idx=seq_along(GRanges(vista$coordinate_hg38) %>% unique), n_hdma_peak_overlap=0)
tmp <- overlap@to %>% table %>% as.data.frame(stringsAsFactors=F) %>% dplyr::rename(n_hdma_peak_overlap="Freq", vista_idx=".")
plotdf[tmp$vista_idx, "n_hdma_peak_overlap"] <- tmp$n_hdma_peak_overlap
med <- median(tmp$n_hdma_peak_overlap)
ggplot(plotdf, aes(x=n_hdma_peak_overlap)) + geom_bar() + theme_BOR() + geom_vline(xintercept = med, linetype="dashed", color="red") +
annotate("text", x=med, y=600, hjust=-0.5, label=paste0("median: ", med), color="red") + ylab("n_vista_enhancers")vista_long <- read.csv(here::here("output/04-enhancers/08/vista_long_with_atlas_tissues.tsv"), sep="\t")
vista_long$vista_id %>% unique %>% length## [1] 3272
pos <- vista_long[vista_long$HDMA_tissue %in% c("Brain", "Eye", "Heart", "Skin", "Liver"),]
pos_coord <- vista[vista$vista_id %in% pos$vista_id, ]$coordinate_hg38 %>% GRanges
overlap <- findOverlaps(GRanges(global_peaks), pos_coord %>% unique)
sprintf("From the vista enhancer dataset (hg38 only) positive for Brain/Eye/Heart/Skin/Liver, %d of %d (%.2f%%) enhancers overlap with the hdma global peakset", overlap@to %>% unique %>% length, pos$vista_id %>% unique %>% length, overlap@to %>% unique %>% length/(pos$vista_id %>% unique %>% length) *100)## [1] "From the vista enhancer dataset (hg38 only) positive for Brain/Eye/Heart/Skin/Liver, 797 of 865 (92.14%) enhancers overlap with the hdma global peakset"
sprintf("From the hdma global peakset, %d of %d (%.2f%%) peaks overlap with vista enhancers", overlap@from %>% unique %>% length, dim(global_peaks)[1], overlap@from %>% unique %>% length/dim(global_peaks)[1]*100)## [1] "From the hdma global peakset, 2551 of 1032273 (0.25%) peaks overlap with vista enhancers"
sprintf("%d of %d vista enhancers are positive for Brain/Eye/Heart/Skin/Liver", pos_coord %>% unique %>% length, vista_long$vista_id %>% unique %>% length)## [1] "851 of 3272 vista enhancers are positive for Brain/Eye/Heart/Skin/Liver"
library(ArchR)
library(BSgenome.Hsapiens.UCSC.hg38)
addArchRGenome("hg38")## Setting default genome to Hg38.
geneAnnot <- getArchRGenome(geneAnnotation = T, genomeAnnotation = F) %>% as.list## Using GeneAnnotation set by addArchRGenome(Hg38)!
peaks <- GRanges(vista$coordinate_hg38) %>% unique
peaks_anno <- ArchR:::.fastAnnoPeaks(peaks, BSgenome = BSgenome.Hsapiens.UCSC.hg38, geneAnnotation = geneAnnot)## Annotating Peaks : Nearest Gene
## Annotating Peaks : Gene
## Annotating Peaks : TSS
## Annotating Peaks : GC
ggplot(peaks_anno %>% as.data.frame, aes(x=1, fill=peakType)) + geom_bar() + theme_BOR()ggplot(peaks_anno %>% as.data.frame, aes(x=1, fill=peakType)) + geom_bar(position="fill") + theme_BOR()
# Heatmap of VISTA enhancer frag counts
proj <- loadArchRProject(here::here("output/01-preprocessing/03/allSamples_decontx"))
proj <- addFeatureMatrix(
input = proj,
features = GRanges(vista$coordinate_hg38) %>% unique,
matrixName = "VistaPeakMatrix"
)
proj <- saveArchRProject(proj)tissue_meta <- read_tsv(here::here("code/02-global_analysis/01-organs_keep.tsv"))## Rows: 12 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): organcode, organ, iteration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# add organ colors
col.data.frame <- data.frame(Color=cmap_organ) %>% dplyr::mutate(organ=names(cmap_organ))
organ.code.list <- tissue_meta$organcode
all.annots <- lapply(1:length(organ.code.list), function(i){
# read cluster id annotation
annot <- read.csv(sprintf(here::here("output/01-preprocessing/02/shared/meta/%s_meta_cluster.txt"), organ.code.list[i]), sep="\t") %>% as.data.frame
rownames(annot) <- annot$cb
annot$L0_clusterID <- paste0(organ.code.list[i],"_",annot$L1_clusterID)
annot$L3_clusterID <- paste0(organ.code.list[i],"_",annot$L3_clusterName)
return(annot)
})
all.annots <- dplyr::bind_rows(all.annots)
rownames(all.annots) <- all.annots$L0_clusterID
all.annots <- all.annots %>%
mutate(Cluster = paste0(organ_code, "_",L1_clusterID)) %>%
mutate(Cluster_labelled = paste0(organ_code, "_", L1_clusterID, "_", L3_clusterName)) %>%
left_join(col.data.frame) %>%
tibble::column_to_rownames(var = "Cluster")## Joining with `by = join_by(organ)`
head(all.annots)## organ organ_code L1_clusterID L0_clusterName L1_clusterName L2_clusterID
## AG_0 Adrenal AG 0 epi AG_epi AG_epi1
## AG_1 Adrenal AG 1 epi AG_epi AG_epi2
## AG_2 Adrenal AG 2 epi AG_epi AG_epi3
## AG_3 Adrenal AG 3 epi AG_epi AG_epi4
## AG_4 Adrenal AG 4 epi AG_epi AG_epi5
## AG_5 Adrenal AG 5 end AG_end AG_end
## L2_clusterName L3_clusterName ncell median_numi
## AG_0 adrenal cortex adrenal cortex 1 823 9498.0
## AG_1 adrenal cortex adrenal cortex 2 576 6827.5
## AG_2 adrenal cortex adrenal cortex 3 511 10567.0
## AG_3 adrenal cortex adrenal cortex 4 483 6340.0
## AG_4 sympathoadrenal lineage sympathoadrenal lineage 1 155 5262.0
## AG_5 endothelial endothelial 115 4117.0
## median_ngene median_pctmt median_nfrags median_tss median_frip
## AG_0 3280 0.008694896 6784.0 9.8320 0.2906086
## AG_1 3114 0.000000000 4341.5 10.3445 0.2400227
## AG_2 3661 0.008910274 5692.0 9.6830 0.2679032
## AG_3 2487 0.000000000 5519.0 9.8760 0.3020710
## AG_4 2564 0.000000000 4425.0 10.1390 0.2104465
## AG_5 2188 0.006551792 4192.0 10.1250 0.2417165
## note
## AG_0 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## AG_1 AGTR1+ likely zona glomerulosa; all PCW17
## AG_2 NR5A1+ AKR1B1+ SULT2A1+ CYP11A1/CYP11B1+; all PCW18
## AG_3 NR5A1+; mosly PCW21
## AG_4 PHOX2A/B+ ISL1+ GATA3+; likely mix of chromaffin cells and sympathoblasts from all samples
## AG_5 PECAM1+ TEK+; mix of samples
## L0_clusterID L3_clusterID Cluster_labelled
## AG_0 AG_0 AG_adrenal cortex 1 AG_0_adrenal cortex 1
## AG_1 AG_1 AG_adrenal cortex 2 AG_1_adrenal cortex 2
## AG_2 AG_2 AG_adrenal cortex 3 AG_2_adrenal cortex 3
## AG_3 AG_3 AG_adrenal cortex 4 AG_3_adrenal cortex 4
## AG_4 AG_4 AG_sympathoadrenal lineage 1 AG_4_sympathoadrenal lineage 1
## AG_5 AG_5 AG_endothelial AG_5_endothelial
## Color
## AG_0 #876941
## AG_1 #876941
## AG_2 #876941
## AG_3 #876941
## AG_4 #876941
## AG_5 #876941
# add average cluster age
meta <- read_tsv(Sys.glob(here::here("output/01-preprocessing/02/shared/meta/*_meta.txt"))) %>% as.data.frame(stringsAsFactors=F)## Rows: 817740 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): cb, L1_clusterName, L2_clusterID, L2_clusterName, L3_clusterName
## dbl (1): L1_clusterID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
smeta <- read_tsv(Sys.glob(here::here("output/01-preprocessing/02/shared/meta/*_meta_sample.txt"))) %>% as.data.frame(stringsAsFactors=F)## Rows: 76 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): sample, organ, organ_code, sex
## dbl (9): pcw, pcd, ncell, median_ngene, median_numi, median_pctmt, median_nf...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cmeta <- read_tsv(Sys.glob(here::here("output/01-preprocessing/02/shared/meta/*_meta_cluster.txt"))) %>% as.data.frame(stringsAsFactors=F)## Rows: 203 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (8): organ, organ_code, L0_clusterName, L1_clusterName, L2_clusterID, L2...
## dbl (8): L1_clusterID, ncell, median_numi, median_ngene, median_pctmt, media...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rownames(smeta) <- smeta$sample
meta$sample <- lapply(meta$cb, function(n){strsplit(n, split="#")[[1]][1]}) %>% unlist
meta$PCW <- smeta[meta$sample, "pcw"]
meta$PCD <- smeta[meta$sample, "pcd"]
meta_group <- meta %>% group_by(L2_clusterID) %>% dplyr::summarise(avg_pcw=mean(PCW), avg_pcd=mean(PCD)) %>% as.data.frame(stringsAsFactors=F)
all.annots <- merge(all.annots, meta_group, by="L2_clusterID", all.x=T) %>% merge(cmeta, all.x=T)
head(all.annots)## L2_clusterID organ organ_code L1_clusterID L0_clusterName L1_clusterName
## 1 AG_end Adrenal AG 5 end AG_end
## 2 AG_epi1 Adrenal AG 0 epi AG_epi
## 3 AG_epi2 Adrenal AG 1 epi AG_epi
## 4 AG_epi3 Adrenal AG 2 epi AG_epi
## 5 AG_epi4 Adrenal AG 3 epi AG_epi
## 6 AG_epi5 Adrenal AG 4 epi AG_epi
## L2_clusterName L3_clusterName ncell median_numi
## 1 endothelial endothelial 115 4117.0
## 2 adrenal cortex adrenal cortex 1 823 9498.0
## 3 adrenal cortex adrenal cortex 2 576 6827.5
## 4 adrenal cortex adrenal cortex 3 511 10567.0
## 5 adrenal cortex adrenal cortex 4 483 6340.0
## 6 sympathoadrenal lineage sympathoadrenal lineage 1 155 5262.0
## median_ngene median_pctmt median_nfrags median_tss median_frip
## 1 2188 0.006551792 4192.0 10.1250 0.2417165
## 2 3280 0.008694896 6784.0 9.8320 0.2906086
## 3 3114 0.000000000 4341.5 10.3445 0.2400227
## 4 3661 0.008910274 5692.0 9.6830 0.2679032
## 5 2487 0.000000000 5519.0 9.8760 0.3020710
## 6 2564 0.000000000 4425.0 10.1390 0.2104465
## note
## 1 PECAM1+ TEK+; mix of samples
## 2 NR5A1+ CYP11A1/CYP11B1+; mostly PCW21; groups with 2 and 3 in cluster tree
## 3 AGTR1+ likely zona glomerulosa; all PCW17
## 4 NR5A1+ AKR1B1+ SULT2A1+ CYP11A1/CYP11B1+; all PCW18
## 5 NR5A1+; mosly PCW21
## 6 PHOX2A/B+ ISL1+ GATA3+; likely mix of chromaffin cells and sympathoblasts from all samples
## L0_clusterID L3_clusterID Cluster_labelled
## 1 AG_5 AG_endothelial AG_5_endothelial
## 2 AG_0 AG_adrenal cortex 1 AG_0_adrenal cortex 1
## 3 AG_1 AG_adrenal cortex 2 AG_1_adrenal cortex 2
## 4 AG_2 AG_adrenal cortex 3 AG_2_adrenal cortex 3
## 5 AG_3 AG_adrenal cortex 4 AG_3_adrenal cortex 4
## 6 AG_4 AG_sympathoadrenal lineage 1 AG_4_sympathoadrenal lineage 1
## Color avg_pcw avg_pcd
## 1 #876941 17.24348 123.7130
## 2 #876941 21.00000 151.0000
## 3 #876941 16.99132 121.9444
## 4 #876941 18.00000 126.0000
## 5 #876941 21.00000 151.0000
## 6 #876941 16.30323 116.9290
per cluster
addArchRThreads(4)
proj$RNA_Clusters_Labelled <- all.annots[proj$RNA_Clusters, "Cluster_labelled"]
vista_peak_matrix <- getMatrixFromProject(proj, useMatrix = "VistaPeakMatrix")
saveRDS(vista_peak_matrix, paste0(out, "/vista_peak_matrix_cell.rds"))
tmp <- vista_peak_matrix@assays@data$VistaPeakMatrix %>% t %>% as.matrix %>% as.data.frame %>% group_by(vista_peak_matrix$RNA_Clusters_Labelled) %>% dplyr::summarise(across(everything(), list(median=median, sum=sum, mean=mean)))
saveRDS(tmp, paste0(out, "/vista_peak_matrix_clust.rds"))add peak annotations
vista_peak_matrix <- readRDS(paste0(out, "/vista_peak_matrix_cell.rds"))
peaks <- vista_peak_matrix@rowRanges
# add enh name to peaks
vista_gr <- GRanges(vista$coordinate_hg38)
mcols(vista_gr) <- vista
vista_gr <- vista_gr %>% unique
overlap <- findOverlaps(peaks, vista_gr, type="equal")
peaks$vista_id <- NA
peaks[overlap@from]$vista_id <- vista_gr[overlap@to]$vista_id
# add vista organ positive info
for (organ in c("Brain", "Heart", "Eye", "Liver", "Skin")){
pos_peaks <- vista[vista$vista_id %in% vista_long[vista_long$HDMA_tissue==organ, "vista_id"],]$coordinate_hg38 %>% GRanges
mcols(peaks)[paste0("vista_pos_", organ)] <- rep("neg", length(peaks))
mcols(peaks)[peaks$vista_id %in% vista_long[vista_long$HDMA_tissue==organ, "vista_id"], paste0("vista_pos_", organ)] <- "pos"
}
vista_peak_matrix@rowRanges <- peaks
saveRDS(vista_peak_matrix, paste0(out, "/vista_peak_matrix_cell.rds"))## cannot get peak matrix at once because it's too large, submit as job
#system("sbatch -p wjg,sfgf,biochem --mem-per-cpu=250g --time=48:00:00 --job-name=peakmtx --wrap \"Rscript 09-get_peak_matrix.R\"")
peak_matrix <- readRDS(paste0(out, "/peak_matrix_cell_overlapvista.rds")) # output from 09-get_peak_matrix.R
vista_peak_matrix <- readRDS(paste0(out, "/vista_peak_matrix_cell.rds"))
# overlap vista peaks with global peaks, min 75%=375bp overlap
# if overlapping multiple global peaks, use ATAC signal from the peak with the highest total ATAC signal
overlaps <- findOverlaps(peak_matrix@rowRanges, vista_peak_matrix@rowRanges, minoverlap=375)
vista_withoverlap <- overlaps@to %>% unique
maxmat <- list()
hdma_peaks <- c()
peak_matrix_data <- peak_matrix@assays@data$PeakMatrix
for (i in vista_withoverlap){
if (mod(i, 50)==0){
message(i)
}
cand_id <- overlaps@from[which(overlaps@to==i)]
cand <- peak_matrix_data[cand_id,,drop=F]
maxid <- rev(order(rowSums(cand)))[1]
maxmat[[i]] <- cand[maxid,] %>% unname
hdma_peaks <- c(hdma_peaks, cand_id[maxid])
}
stopifnot(sum(lengths(maxmat)!=0)==length(hdma_peaks))
maxmat <- maxmat[lengths(maxmat)!=0]
maxmat <- do.call(rbind, maxmat)
hdma_peaks <- peak_matrix@rowRanges[hdma_peaks]
vista_peak_matrix_max <- vista_peak_matrix
vista_peak_matrix_max <- vista_peak_matrix_max[vista_withoverlap,]
vista_peak_matrix_max@assays@data$VistaPeakMatrix <- Matrix(maxmat, sparse=T)
mcols(hdma_peaks) <- cbind(mcols(hdma_peaks), mcols(rowRanges(vista_peak_matrix_max))[,2:7])
rowRanges(vista_peak_matrix_max) <- hdma_peaks
saveRDS(vista_peak_matrix_max, paste0(out, "/peak_matrix_cell_overlapvista_maxpeak.rds"))
tmp <- vista_peak_matrix_max@assays@data$VistaPeakMatrix %>% t %>% as.matrix %>% as.data.frame %>% group_by(vista_peak_matrix_max$RNA_Clusters_Labelled) %>% dplyr::summarise(across(everything(), list(median=median, sum=sum, mean=mean)))
saveRDS(tmp, paste0(out, "/peak_matrix_cell_overlapvista_maxpeak_clust.rds"))Read overlap data
vista_peak_matrix <- readRDS(paste0(out, "/peak_matrix_cell_overlapvista_maxpeak.rds"))
tmp <- readRDS(paste0(out, "/peak_matrix_cell_overlapvista_maxpeak_clust.rds"))
# annotations
rownames(all.annots) <- all.annots$Cluster_labelled
all.annots.sorted <- all.annots[tmp[[grep("vista_peak", colnames(tmp), value=T)]], ]Filter to vista enhancers positive in at least 1 hdma tissue
df <- tmp[,grepl("sum", colnames(tmp))]
peaks <- vista_peak_matrix@rowRanges
idx_pf <- which((peaks$vista_pos_Brain=="pos") | (peaks$vista_pos_Heart=="pos") | (peaks$vista_pos_Eye=="pos") |
(peaks$vista_pos_Liver=="pos") | (peaks$vista_pos_Skin=="pos"))
# different normalizations
df <- df[, idx_pf]
peaks <- peaks[idx_pf,]
df_lognorm_sub <- (df/rowSums(df)) * 1e6 %>% log1p
df_lognorm_0to1_sub <- as.data.frame(lapply(df_lognorm_sub, scales::rescale))
ha <- columnAnnotation(organ=all.annots.sorted$organ, compartment=all.annots.sorted$L0_clusterName,
col=list(compartment=cmap_compartment,
organ=cmap_organ))
rowannot_ls <- list()
col_ls <- list()
for (organ in c("Heart", "Brain", "Eye", "Liver", "Skin")){
rowannot_ls[paste0("vista_pos_", organ)] <- mcols(peaks)[paste0("vista_pos_", organ)]
col_ls[[paste0("vista_pos_", organ)]] <- c(neg="gray", pos=cmap_organ[[organ]])
}
# order the rows and cols
ha2 <- HeatmapAnnotation(df=do.call(cbind, rowannot_ls %>% unname), col = col_ls, which="row")
# Calculate breaks based on quantiles to handle outliers
cmap <- cmaps_BOR$whitePurple
dfnorm <- df_lognorm_0to1_sub
plot_title <- "Normalized ATAC \n signal (0-1) \nLn(CPM+1)"
file_name <- "heatmap_vista_hdma_maxpeakoverlap.pdf"
break_points <- quantile(dfnorm %>% as.matrix, probs = seq(0.1, 0.9, length.out = length(cmap)))
color_mapping_1 <- circlize::colorRamp2(break_points, cmap)
break_points <- quantile(dfnorm %>% as.matrix, probs = seq(0.01, 0.99, length.out = length(cmap)))
color_mapping_2 <- circlize::colorRamp2(break_points, cmap)
color_mapping_3 <- circlize::colorRamp2(seq(0, 0.3, length.out=length(cmap)), cmap)
color_mapping_4 <- circlize::colorRamp2(seq(0, 0.5, length.out=length(cmap)), cmap)
color_mapping_5 <- circlize::colorRamp2(seq(0, 0.8, length.out=length(cmap)), cmap)
### cluster within groups ----------------------------------
# add peak group info
peaks$n_org_pos <- (data.frame(mcols(peaks)[,c("vista_pos_Heart", "vista_pos_Brain", "vista_pos_Eye", "vista_pos_Liver", "vista_pos_Skin")]) == "pos") %>% rowSums
peaks$group <- "negative"
mcols(peaks)[peaks$vista_pos_Heart=="pos" & peaks$n_org_pos==1, "group"] <- "heart positive only"
mcols(peaks)[peaks$vista_pos_Brain=="pos" & peaks$n_org_pos==1, "group"] <- "brain positive only"
mcols(peaks)[peaks$vista_pos_Eye=="pos" & peaks$n_org_pos==1, "group"] <- "eye positive only"
mcols(peaks)[peaks$vista_pos_Liver=="pos" & peaks$n_org_pos==1, "group"] <- "liver positive only"
mcols(peaks)[peaks$vista_pos_Skin=="pos" & peaks$n_org_pos==1, "group"] <- "skin positive only"
mcols(peaks)[peaks$n_org_pos > 1, "group"] <- "multi-organ positive"
peaks$group <- factor(peaks$group, levels=c("liver positive only", "skin positive only", "brain positive only", "eye positive only", "multi-organ positive","heart positive only" ))
ht_list <- list()
cmap_list <- c(color_mapping_1, color_mapping_2, color_mapping_3, color_mapping_4, color_mapping_5)
for (i in seq_along(cmap_list)){
ht1 <- Heatmap(
dfnorm %>% t,
name=plot_title,
top_annotation = ha,
right_annotation = ha2,
row_split = peaks$group,
cluster_row_slices = T,
cluster_rows = T,
cluster_columns=F,
clustering_distance_columns = "pearson",
clustering_distance_rows = "pearson",
clustering_method_rows = "complete",
clustering_method_columns = "complete",
column_names_gp = gpar(fontsize = 2, col=all.annots.sorted$Color),
column_labels = all.annots.sorted$Cluster_labelled,
column_title = paste0(dim(dfnorm)[1], " HDMA clusters"),
row_names_gp = gpar(fontsize=0.5),
row_labels = peaks$vista_id,
row_title = paste0(dim(dfnorm)[2], " VISTA enhancers"),
col = cmap_list[[i]]
)
ht1 <- draw(ht1)
ht_list[[i]] <- ht1
}
saveRDS(ht_list, paste0(out, "/", file_name, ".rds"))file_name <- "heatmap_vista_hdma_maxpeakoverlap.pdf"
ht_list <- readRDS(paste0(out, "/", file_name, ".rds"))
for (h in ht_list){
plot(h)
}df <- tmp[,grepl("sum", colnames(tmp))]
peaks <- vista_peak_matrix@rowRanges
idx_pf <- which((peaks$vista_pos_Brain=="pos") | (peaks$vista_pos_Heart=="pos") | (peaks$vista_pos_Eye=="pos") |
(peaks$vista_pos_Liver=="pos") | (peaks$vista_pos_Skin=="pos"))
# different normalizations
df <- df[, idx_pf]
peaks <- peaks[idx_pf,]
df_lognorm_sub <- (df/rowSums(df)) * 1e6 %>% log1p
df_lognorm_z_sub <- scale(df_lognorm_sub) %>% as.data.frame
# read saved row order
ord <- readRDS(glue("{out}/heatmap_vista_hdma_maxpeakoverlap.pdf.rds"))
ord <- ord[[5]] %>% row_order
peak_order <- ord %>% unlist %>% unname
df_lognorm_z_sub <- df_lognorm_z_sub[,peak_order]
peaks <- peaks[peak_order]
ha <- columnAnnotation(organ=all.annots.sorted$organ, compartment=all.annots.sorted$L0_clusterName,
col=list(compartment=cmap_compartment,
organ=cmap_organ))
rowannot_ls <- list()
col_ls <- list()
for (organ in c("Heart", "Brain", "Eye", "Liver", "Skin")){
rowannot_ls[paste0("vista_pos_", organ)] <- mcols(peaks)[paste0("vista_pos_", organ)]
col_ls[[paste0("vista_pos_", organ)]] <- c(neg="gray", pos=cmap_organ[[organ]])
}
# order the rows and cols
ha2 <- HeatmapAnnotation(df=do.call(cbind, rowannot_ls %>% unname), col = col_ls, which="row")# Calculate breaks based on quantiles to handle outliers
cmap <- cmap_chromvar
dfnorm <- df_lognorm_z_sub
plot_title <- "Normalized ATAC \n signal (Z-Score) \nLn(CPM+1)"
file_name <- "heatmap_vista_hdma_maxpeakoverlap_zscore.pdf"
break_points <- quantile(dfnorm %>% as.matrix, probs = seq(0.1, 0.9, length.out = length(cmap)))
color_mapping_1 <- circlize::colorRamp2(break_points, cmap)
break_points <- quantile(dfnorm %>% as.matrix, probs = seq(0.01, 0.99, length.out = length(cmap)))
color_mapping_2 <- circlize::colorRamp2(break_points, cmap)
color_mapping_3 <- circlize::colorRamp2(seq(-1, 1, length.out=length(cmap)), cmap)
color_mapping_4 <- circlize::colorRamp2(seq(-2, 2, length.out=length(cmap)), cmap)
color_mapping_5 <- circlize::colorRamp2(seq(0, 0.8, length.out=length(cmap)), cmap)
### cluster within groups ----------------------------------
# add peak group info
peaks$n_org_pos <- (data.frame(mcols(peaks)[,c("vista_pos_Heart", "vista_pos_Brain", "vista_pos_Eye", "vista_pos_Liver", "vista_pos_Skin")]) == "pos") %>% rowSums
peaks$group <- "negative"
mcols(peaks)[peaks$vista_pos_Heart=="pos" & peaks$n_org_pos==1, "group"] <- "heart positive only"
mcols(peaks)[peaks$vista_pos_Brain=="pos" & peaks$n_org_pos==1, "group"] <- "brain positive only"
mcols(peaks)[peaks$vista_pos_Eye=="pos" & peaks$n_org_pos==1, "group"] <- "eye positive only"
mcols(peaks)[peaks$vista_pos_Liver=="pos" & peaks$n_org_pos==1, "group"] <- "liver positive only"
mcols(peaks)[peaks$vista_pos_Skin=="pos" & peaks$n_org_pos==1, "group"] <- "skin positive only"
mcols(peaks)[peaks$n_org_pos > 1, "group"] <- "multi-organ positive"
peaks$group <- factor(peaks$group, levels=c("liver positive only", "skin positive only", "brain positive only", "eye positive only", "multi-organ positive","heart positive only" ))
ht_list <- list()
cmap_list <- c(color_mapping_1, color_mapping_2, color_mapping_3, color_mapping_4, color_mapping_5)
for (i in seq_along(cmap_list)){
ht1 <- Heatmap(
dfnorm %>% t,
name=plot_title,
top_annotation = ha,
right_annotation = ha2,
cluster_rows = F,
split=peaks$group,
cluster_columns=F,
column_names_gp = gpar(fontsize = 2, col=all.annots.sorted$Color),
column_labels = all.annots.sorted$Cluster_labelled,
column_title = paste0(dim(dfnorm)[1], " HDMA clusters"),
row_names_gp = gpar(fontsize=0.5),
row_labels = peaks$vista_id,
row_title = paste0(dim(dfnorm)[2], " VISTA enhancers"),
col = cmap_list[[i]]
)
ht1 <- draw(ht1)
ht_list[[i]] <- ht1
}
saveRDS(ht_list, paste0(out, "/", file_name, ".rds"))file_name <- "heatmap_vista_hdma_maxpeakoverlap_zscore.pdf"
ht_list <- readRDS(paste0(out, "/", file_name, ".rds"))
# for (h in ht_list){
# plot(h)
# }
plot(ht_list[[4]])“If an enhancer is labelled VISTA heart positive, how likely is it going to have higher HDMA ATAC signal than those not VISTA heart positive”
P value = (how many times is the shuffled enrichment > actual enrichment)
organ <- "Eye"
df <- df_lognorm_z_sub
# df <- df_lognorm_sub
mask_pos <- mcols(peaks)[[paste0("vista_pos_",organ)]] == "pos"
n_pos <- sum(mask_pos)
mask_hdma_clust <- all.annots.sorted$organ == organ
avg_signal_neg <- df[mask_hdma_clust,!mask_pos] %>% unlist %>% mean
avg_signal_pos <- df[mask_hdma_clust,mask_pos] %>% unlist %>% mean
enrichment <- avg_signal_pos - avg_signal_neg # actual enrichment, subtract in log space = division in nonlog space
# now create shuffles
enrichment_shuffles <- list()
avg_signal_pos_shuffles <- list()
n_shuffles <- 10000
set.seed(16)
for (i in 1:n_shuffles){
idx_pos_label <- sample(1:length(peaks), size = n_pos)
mask_pos <- (1:length(peaks)) %in% idx_pos_label
neg <- df[mask_hdma_clust,!mask_pos] %>% unlist %>% mean
pos <- df[mask_hdma_clust,mask_pos] %>% unlist %>% mean
avg_signal_pos_shuffles[[i]] <- pos
enrichment_shuffles[[i]] <- pos - neg
}
avg_signal_pos_shuffles <- avg_signal_pos_shuffles %>% unlist
enrichment_shuffles <- enrichment_shuffles %>% unlist
sum(enrichment_shuffles > enrichment) / n_shuffles
sum(avg_signal_pos_shuffles > avg_signal_pos) / n_shuffles
exp(enrichment)# also try wilcoxon rank sum test
for (organ in c("Brain", "Heart", "Eye", "Liver", "Skin")){
print("#####################")
print(organ)
df <- df_lognorm_z_sub
# df <- df_lognorm_sub
mask_pos <- mcols(peaks)[[paste0("vista_pos_",organ)]] == "pos"
n_pos <- sum(mask_pos)
mask_hdma_clust <- all.annots.sorted$organ == organ
y <- df[mask_hdma_clust,!mask_pos] %>% unlist
x <- df[mask_hdma_clust,mask_pos] %>% unlist
res <- wilcox.test(x, y,
alternative = c("greater"),
conf.int = T, conf.level = 0.95)
print(res)
print(res$p.value)
print(res$statistic/(length(x)*length(y)))
}plot the matching RNA heatmap to the ATAC heatmap, use expression of the nearest gene
# get peaks
peaks <- vista_peak_matrix@rowRanges
idx_pf <- which((peaks$vista_pos_Brain=="pos") | (peaks$vista_pos_Heart=="pos") | (peaks$vista_pos_Eye=="pos") |
(peaks$vista_pos_Liver=="pos") | (peaks$vista_pos_Skin=="pos"))
peaks <- peaks[idx_pf,]
# get list of all nearest genes first
addArchRGenome("hg38")## Setting default genome to Hg38.
geneAnnot <- getArchRGenome(geneAnnotation = T, genomeAnnotation = F) %>% as.list## Using GeneAnnotation set by addArchRGenome(Hg38)!
peaks_anno <- ArchR:::.fastAnnoPeaks(peaks, BSgenome = BSgenome.Hsapiens.UCSC.hg38, geneAnnotation = geneAnnot)## Annotating Peaks : Nearest Gene
## Annotating Peaks : Gene
## Annotating Peaks : TSS
## Annotating Peaks : GC
gene.list <- unique(peaks_anno$nearestGene)
dir.create(paste0(out, "/gex/"), recursive = TRUE, showWarnings = F)get RNA data
for (i in 1:nrow(tissue_meta)){
organ <- tissue_meta$organ[i]
organcode <- tissue_meta$organcode[i]
message(sprintf("@ current organ: %s", organ))
out_files <- c(file.path(out, sprintf("gex/all_gex_rna.mean.%s.rds", organcode)),
file.path(out, sprintf("gex/all_gex_rna.detection.%s.rds", organcode)),
file.path(out, sprintf("gex/all_gex_decontx.mean.%s.rds", organcode)),
file.path(out, sprintf("gex/all_gex_decontx.detection.%s.rds", organcode)))
if (all(file.exists(out_files))) {
message("@ done, skipping.")
} else {
# read cluster id annotation
annot <- read.csv(sprintf(here::here("output/01-preprocessing/02/shared/meta/%s_meta.txt"), organcode), sep="\t") %>% as.data.frame
rownames(annot) <- annot$cb
# Raw RNA counts -------------------------------------------------------------
# get gene expression (RNA raw counts, scaled to 10000 per cell then log-transformed)
message("fetching gene expression from RNA assay")
rna_proj <- readRDS(
sprintf(here::here("output/01-preprocessing/02/%s/rna_preprocess_output/RNA_obj_clustered_final.rds"),
organ))
rna_proj <- NormalizeData(rna_proj, assay="RNA")
DefaultAssay(rna_proj) <- "RNA"
gex.df <- FetchData(rna_proj, vars = gene.list[gene.list %in% rownames(rna_proj)], slot="data")
# add cluster
gex.df$cluster <- paste0(organcode, "_",annot[rownames(gex.df),"L1_clusterID"]) # e.g. AG_0, AG_10
gex.df.mean <- gex.df %>% group_by(cluster) %>% dplyr::summarize(across(everything(), mean)) %>% as.data.frame
saveRDS(gex.df.mean, file.path(out, sprintf("gex/all_gex_rna.mean.%s.rds", organcode)))
# detection rate -------------------------------------------------------------
rna.detection <- calc_pct1(gex.df, "cluster")
saveRDS(rna.detection, file.path(out, sprintf("gex/all_gex_rna.detection.%s.rds", organcode)))
dim(gex.df.mean)
dim(rna.detection)
# DecontX --------------------------------------------------------------------
message("@ fetching gene expression from decontX assay")
# get decontX normalized data but get genes in gene list only
DefaultAssay(rna_proj) <- "decontX"
gex.df <- FetchData(rna_proj, vars = gene.list[gene.list %in% rownames(rna_proj)], slot="data")
# add cluster
gex.df$cluster <- paste0(organcode, "_",annot[rownames(gex.df),"L1_clusterID"]) # e.g. AG_0, AG_10
gex.df.mean <- gex.df %>% group_by(cluster) %>% dplyr::summarize(across(everything(), mean)) %>% as.data.frame
saveRDS(gex.df.mean, file.path(out, sprintf("gex/all_gex_decontx.mean.%s.rds", organcode)))
# detection rate -------------------------------------------------------------
decontx_detection <- calc_pct1(gex.df, "cluster")
saveRDS(decontx_detection, file.path(out, sprintf("gex/all_gex_decontx.detection.%s.rds", organcode)))
dim(gex.df.mean)
dim(decontx_detection)
rm(rna_proj)
message("@ done.")
}
}# write out gene expression averaged per cluster, binding all rows together
# dplyr::bind_rows(will fill NAs for missing columns between dataframes being bound togetther)
all_gex_rna.mean <- map(
tissue_meta$organcode,
~ readRDS(file.path(out, glue("gex/all_gex_rna.mean.{.x}.rds")))) %>%
bind_rows()
all_gex_rna.detection <- map(
tissue_meta$organcode,
~ readRDS(file.path(out, glue("gex/all_gex_rna.detection.{.x}.rds")))) %>%
bind_rows()
all_gex_decontx.mean <- map(
tissue_meta$organcode,
~ readRDS(file.path(out, glue("gex/all_gex_decontx.mean.{.x}.rds")))) %>%
bind_rows()
all_gex_decontx.detection <- map(
tissue_meta$organcode,
~ readRDS(file.path(out, glue("gex/all_gex_decontx.detection.{.x}.rds")))) %>%
bind_rows()
dim(all_gex_rna.mean)
dim(all_gex_rna.detection)
dim(all_gex_decontx.mean)
dim(all_gex_decontx.detection)
saveRDS(dplyr::bind_rows(all_gex_rna.mean), file.path(out, "gex/all_gex_rna.mean.rds"))
saveRDS(dplyr::bind_rows(all_gex_decontx.mean), file.path(out, "gex/all_gex_decontx.mean.rds"))
saveRDS(dplyr::bind_rows(all_gex_rna.detection), file.path(out, "gex/all_gex_rna.detectionrate.rds"))
saveRDS(dplyr::bind_rows(all_gex_decontx.detection), file.path(out, "gex/all_gex_decontx.detectionrate.rds"))read gex data
all_gex_rna.mean <- readRDS(file.path(out, "gex/all_gex_rna.mean.rds"))
all_gex_decontx.mean <- readRDS(file.path(out, "gex/all_gex_decontx.mean.rds"))# read saved row order
ht_list <- readRDS(glue("{out}/heatmap_vista_hdma_maxpeakoverlap.pdf.rds"))
ht <- ht_list[[5]]
ord <- lapply(seq_along(row_order(ht)), function(i){data.frame(idx=row_order(ht)[[i]]) %>% mutate(group=names(row_order(ht))[i])}) %>% do.call(rbind, .)
ord$group <- factor(ord$group, levels=c("liver positive only", "skin positive only", "brain positive only", "eye positive only", "multi-organ positive","heart positive only" ))
peak_order <- ord$idx
genes_sorted <- peaks_anno[peak_order]$nearestGene %>% unname
clust_sorted <- all.annots.sorted$L0_clusterID
# filter for peaks whose nearest gene has expression data
rna <- all_gex_rna.mean
#rna <- all_gex_decontx.mean
rownames(rna) <- rna$cluster
idx <- which(genes_sorted %in% colnames(rna))
mat <- rna[clust_sorted, genes_sorted[idx]]
mat[is.na(mat)] <- 0
mat <- as.data.frame(lapply(mat, scales::rescale)) # scale 0 to 1 for each gene across all clusters
# plotting config
cmap <- cmaps_BOR$whiteBlue
plot_title <- "Normalized RNA \n signal (0-1) \nLn(TPM+1)"
break_points <- quantile(mat %>% as.matrix, probs = seq(0.1, 0.9, length.out = length(cmap)), na.rm=T)
color_mapping_1 <- circlize::colorRamp2(break_points, cmap)
break_points <- quantile(mat %>% as.matrix, probs = seq(0.01, 0.99, length.out = length(cmap)), na.rm=T)
color_mapping_2 <- circlize::colorRamp2(break_points, cmap)
color_mapping_3 <- circlize::colorRamp2(seq(0, 1, length.out=length(cmap)), cmap)
ha3 <- HeatmapAnnotation(df=do.call(cbind, rowannot_ls %>% unname)[idx,], col = col_ls, which="row")
ht1 <- Heatmap(
mat %>% t,
name=plot_title,
top_annotation = ha,
right_annotation = ha3,
split = ord[idx,"group"],
cluster_rows = F,
cluster_columns=F,
column_names_gp = gpar(fontsize = 2, col=all.annots.sorted$Color),
column_labels = all.annots.sorted$Cluster_labelled,
column_title = paste0(dim(mat)[1], " HDMA clusters"),
row_names_gp = gpar(fontsize=0.5),
row_labels = genes_sorted[idx],
row_title = paste0(dim(mat)[2], " nearest genes of VISTA enhancers"),
col = color_mapping_3
)
ht1 <- draw(ht1)
plot(ht1)# read saved row order
ht_list <- readRDS(glue("{out}/heatmap_vista_hdma_maxpeakoverlap.pdf.rds"))
ht <- ht_list[[5]]
ord <- lapply(seq_along(row_order(ht)), function(i){data.frame(idx=row_order(ht)[[i]]) %>% mutate(group=names(row_order(ht))[i])}) %>% do.call(rbind, .)
ord$group <- factor(ord$group, levels=c("liver positive only", "skin positive only", "brain positive only", "eye positive only", "multi-organ positive","heart positive only" ))
peak_order <- ord$idx
genes_sorted <- peaks_anno[peak_order]$nearestGene %>% unname
clust_sorted <- all.annots.sorted$L0_clusterID
# filter for peaks whose nearest gene has expression data
rna <- all_gex_rna.mean
rownames(rna) <- rna$cluster
idx <- which(genes_sorted %in% colnames(rna))
mat <- rna[clust_sorted, genes_sorted[idx]]
mat[is.na(mat)] <- 0
mat <- scale(mat) %>% as.data.frame # z score
# plotting config
# cmap <- cmap_rna
cmap <- cmap_chromvar
plot_title <- "Normalized RNA \n signal (Z-Score) \nLn(TPM+1)"
break_points <- quantile(mat %>% as.matrix, probs = seq(0.1, 0.9, length.out = length(cmap)), na.rm=T)
color_mapping_1 <- circlize::colorRamp2(break_points, cmap)
break_points <- quantile(mat %>% as.matrix, probs = seq(0.01, 0.99, length.out = length(cmap)), na.rm=T)
color_mapping_2 <- circlize::colorRamp2(break_points, cmap)
color_mapping_3 <- circlize::colorRamp2(seq(-2, 2, length.out=length(cmap)), cmap)
ha3 <- HeatmapAnnotation(df=do.call(cbind, rowannot_ls %>% unname)[idx,], col = col_ls, which="row")
ht1 <- Heatmap(
mat %>% t,
name=plot_title,
top_annotation = ha,
right_annotation = ha3,
split = ord[idx,"group"],
cluster_rows = F,
cluster_columns=F,
column_names_gp = gpar(fontsize = 2, col=all.annots.sorted$Color),
column_labels = all.annots.sorted$Cluster_labelled,
column_title = paste0(dim(mat)[1], " HDMA clusters"),
row_names_gp = gpar(fontsize=0.5),
row_labels = genes_sorted[idx],
row_title = paste0(dim(mat)[2], " nearest genes of VISTA enhancers"),
col = color_mapping_3
)
ht1 <- draw(ht1)# plot(ht1)“If an enhancer is labelled VISTA heart positive, how likely is it going to have higher HDMA RNA signal than those not VISTA heart positive”
P value = (how many times is the shuffled enrichment > actual enrichment)
organ <- "Brain"
df <- mat
peaks_test <- peaks_anno[peak_order][idx,]
mask_pos <- mcols(peaks_test)[[paste0("vista_pos_",organ)]] == "pos"
n_pos <- sum(mask_pos)
mask_hdma_clust <- all.annots.sorted$organ == organ
avg_signal_neg <- df[mask_hdma_clust,!mask_pos] %>% unlist %>% mean
avg_signal_pos <- df[mask_hdma_clust,mask_pos] %>% unlist %>% mean
enrichment <- avg_signal_pos - avg_signal_neg # actual enrichment, subtract in log space = division in nonlog space
# now create shuffles
enrichment_shuffles <- list()
avg_signal_pos_shuffles <- list()
n_shuffles <- 10000
set.seed(16)
for (i in 1:n_shuffles){
idx_pos_label <- sample(1:length(peaks_test), size = n_pos)
mask_pos <- (1:length(peaks_test)) %in% idx_pos_label
neg <- df[mask_hdma_clust,!mask_pos] %>% unlist %>% mean
pos <- df[mask_hdma_clust,mask_pos] %>% unlist %>% mean
avg_signal_pos_shuffles[[i]] <- pos
enrichment_shuffles[[i]] <- pos - neg
}
avg_signal_pos_shuffles <- avg_signal_pos_shuffles %>% unlist
enrichment_shuffles <- enrichment_shuffles %>% unlist
sum(enrichment_shuffles > enrichment) / n_shuffles
sum(avg_signal_pos_shuffles > avg_signal_pos) / n_shufflesVISTA heart pos only, HDMA pos in liver from global peak max overlap approach: c(“hs2007”,“mm99”,“hs2240”,“mm194”,“mm122”,“mm2143”,“mm69”,“mm21”,“mm291”,“mm296”,“mm101”,“mm78”,“hs2062”,“mm1668”,“mm18”,“mm257”,“hs502”,“mm244”,“mm1343”,“mm89”,“mm1211”,“mm1330”,“mm1326”)
# also try wilcoxon rank sum test
for (organ in c("Brain", "Heart", "Eye", "Liver", "Skin")){
print("#####################")
print(organ)
df <- mat
peaks_test <- peaks_anno[peak_order][idx,]
mask_pos <- mcols(peaks_test)[[paste0("vista_pos_",organ)]] == "pos"
n_pos <- sum(mask_pos)
mask_hdma_clust <- all.annots.sorted$organ == organ
y <- df[mask_hdma_clust,!mask_pos] %>% unlist
x <- df[mask_hdma_clust,mask_pos] %>% unlist
res <- wilcox.test(x, y,
alternative = c("greater"),
conf.int = T, conf.level = 0.95)
print(res)
print(res$p.value)
print(res$statistic/(length(x)*length(y)))
}global_bp_obj <- readRDS(here::here("output/05-misc/01/global_bp_obj.rds"))
# plot
# region_name <- "mm291"
# region_str <- "chr2:126673667-126674912"
region_name <- "mm101"
region_str <- "chr16:104534-105616"
region <- GRanges(region_str) + 5e4
peaks_highlight <- ranges(GRanges(region_str))
highlights <- tibble(
xmin = peaks_highlight@start,
xmax = peaks_highlight@start + peaks_highlight@width - 1,
color = rep("red", length(peaks_highlight)),
ymin = -Inf, ymax = Inf, alpha = .2
)#
# add in ABC loops
global <- readRDS(here::here("output/04-enhancers/06/abc_summary_df.rds"))
# first get the TSS coords from ABC reference file
tss <- read.table(here::here("data/reference/tss/CollapsedGeneBounds.hg38.bed"), col.names = c("seqnames", "start", "end", "symbol", "", "strand", "id", "type"))
tss$tss_start <- 0
tss[tss$strand=="-", "tss_start"] <- tss[tss$strand=="-", "end"]
tss[tss$strand=="+", "tss_start"] <- tss[tss$strand=="+", "start"]
#subglobal <- global %>% dplyr::filter(abc_is_linked_to_gene_HT==1)
subglobal <- global %>% dplyr::filter(abc_is_linked_to_gene_LI==1)
#subglobal <- global %>% dplyr::filter((abc_is_linked_to_gene_HT==1) | (abc_is_linked_to_gene_LI==1))
overlap <- findOverlaps(region, GRanges(subglobal))
subglobal <- subglobal[overlap@to,]
subglobal <- merge(subglobal, tss[,c("symbol", "tss_start")], by.x="abc_linked_gene_LI", by.y="symbol", all.x=T)
loops <- subglobal[,c("seqnames", "abc_linked_gene_LI", "peak_name", "med_abc_score_LI")]
loops$chr <- loops$seqnames
loops$start <- pmin((subglobal$start + subglobal$end)%/%2, subglobal$tss_start)
loops$end <- pmax((subglobal$start + subglobal$end)%/%2, subglobal$tss_start)
p <- trackplot_helper_v2c(
region=region,
clusters=global_bp_obj$cell_metadata$organ,
fragments=global_bp_obj$frags,
cell_read_counts=global_bp_obj$cell_metadata$nFrags,
transcripts=global_bp_obj$transcripts,
loops=loops,
loop_color = "med_abc_score_LI",
loop_label = "ABC Liver",
highlights = highlights
)
ggplot2::ggsave(plot=p, paste0(figout, "/plot_track_", region_name, "_LI_abc.png"), dpi=300, width=9, height=8, units="in")# gene_name <- "GYPC"
gene_name <- "HBA2"
####################
p <- trackplot_helper_v2c(
gene=gene_name,
clusters=global_bp_obj$cell_metadata$organ,
fragments=global_bp_obj$frags,
cell_read_counts=global_bp_obj$cell_metadata$nFrags,
transcripts=global_bp_obj$transcripts,
expression_matrix = global_bp_obj$rna,
loops=loops,
flank = 5e4,
loop_color = "med_abc_score_LI",
loop_label = "ABC Liver",
highlights = highlights
)
ggplot2::ggsave(plot=p, paste0(figout, "/plot_track_", region_name, "_LI_abc_", gene_name, "gex.png"), dpi=300, width=12, height=8, units="in")submeta <- global_bp_obj$cell_metadata %>% dplyr::filter(organ %in% c("Liver"))
p <- trackplot_helper_v2c(
gene=gene_name,
clusters=submeta$archive_L3_clusterID,
fragments=global_bp_obj$frags %>% select_cells(submeta$cb),
cell_read_counts=submeta$nFrags,
transcripts=global_bp_obj$transcripts,
expression_matrix = global_bp_obj$rna[,submeta$cb],
loops=loops,
flank = 1e4,
#flank = 2e5,
loop_color = "med_abc_score_LI",
loop_label = "ABC Liver",
highlights = highlights
)
ggplot2::ggsave(plot=p, paste0(figout, "/plot_track_", region_name, "_LI_abc_", gene_name, "gex_liveronly.png"), dpi=300, width=12, height=8, units="in")
ggplot2::ggsave(plot=p, paste0(figout, "/plot_track_", region_name, "_LI_abc_", gene_name, "gex_liveronly.pdf"), dpi=300, width=8, height=8, units="in")#region <- GRanges("chr2:126655000-126678000") # mm291
# region <- GRanges("chr16:103900-181800") # mm101
region <- GRanges("chr16:76800-181800") # mm101 wider to include alpha globin superenhancer
submeta <- global_bp_obj$cell_metadata %>% dplyr::filter(organ %in% c("Liver"))
submeta$archive_L2_clusterName <- factor(submeta$archive_L2_clusterName, levels=c("erythroblast", "cycling erythroblast", "early erythroblast", "Kupffer", "B cell progenitor", "cholangiocyte", "endothelial", "hepatocyte", "megakaryocyte", "stellate"))
# filter for only ABC loops that originate from the enhancer
region_narrow <- GRanges("chr16:104534-105616")
# can only start inside the enhancer
mask <- ((start(global_bp_obj$loops_abc$Liver) >= start(region_narrow)) & (start(global_bp_obj$loops_abc$Liver) <= end(region_narrow))) & (seqnames(global_bp_obj$loops_abc$Liver)=="chr16")
# can start or end inside the enhancer
# mask <- (((start(global_bp_obj$loops_abc$Liver) >= start(region_narrow)) & (start(global_bp_obj$loops_abc$Liver) <= end(region_narrow))) | ((end(global_bp_obj$loops_abc$Liver) >= start(region_narrow)) & (end(global_bp_obj$loops_abc$Liver) <= end(region_narrow)))) & (seqnames(global_bp_obj$loops_abc$Liver)=="chr16")
p1 <- trackplot_helper_v2c(
region=region,
clusters=submeta$archive_L2_clusterName,
fragments=global_bp_obj$frags %>% select_cells(submeta$cb),
cell_read_counts=submeta$nFrags,
transcripts=global_bp_obj$transcripts,
loops=global_bp_obj$loops_abc$Liver[mask] %>% unique,
loop_color = "ABC.Score",
loop_label = "ABC Liver",
highlights = highlights
)
p2 <- trackplot_helper_v2c(
gene="HBA2",
clusters=submeta$archive_L2_clusterName,
fragments=global_bp_obj$frags %>% select_cells(submeta$cb),
cell_read_counts=submeta$nFrags,
transcripts=global_bp_obj$transcripts,
expression_matrix = global_bp_obj$rna[,submeta$cb],
flank = 5e3,
highlights = highlights
)
p1p2organ_code <- "BR"
organ_name <- "Brain"
abc_is_linked_to_gene_organ <- paste0("abc_is_linked_to_gene_", organ_code)
abc_linked_gene_organ <- paste0("abc_linked_gene_", organ_code)
med_abc_score_organ <- paste0("med_abc_score_", organ_code)
subglobal <- global %>% dplyr::filter(global[abc_is_linked_to_gene_organ]==1)
overlap <- findOverlaps(region, GRanges(subglobal))
subglobal <- subglobal[overlap@to,]
subglobal <- merge(subglobal, tss[,c("symbol", "tss_start")], by.x=abc_linked_gene_organ, by.y="symbol", all.x=T)
loops <- subglobal[,c("seqnames", abc_linked_gene_organ, "peak_name", med_abc_score_organ)]
loops$chr <- loops$seqnames
loops$start <- pmin((subglobal$start + subglobal$end)%/%2, subglobal$tss_start)
loops$end <- pmax((subglobal$start + subglobal$end)%/%2, subglobal$tss_start)
Sys.time()
submeta <- global_bp_obj$cell_metadata %>% dplyr::filter(organ_code==organ_code)
p <- trackplot_helper_v2c(
#gene=gene_name,
region=region,
clusters=submeta$archive_L3_clusterID,
fragments=global_bp_obj$frags %>% select_cells(submeta$cb),
cell_read_counts=submeta$nFrags,
transcripts=global_bp_obj$transcripts,
expression_matrix = global_bp_obj$rna[,submeta$cb],
loops=loops,
flank = 5e4,
loop_color = med_abc_score_organ,
loop_label = paste0("ABC ", organ_name),
highlights = highlights
)
Sys.time()
ggplot2::ggsave(plot=p, paste0(figout, "/plot_track_", region_name, "_", organ_code, "_abc_", gene_name, "gex_", tolower(organ_name), "only.png"), dpi=300, width=12, height=8, units="in")global_atac_proj <- loadArchRProject(here::here("output/01-preprocessing/03/allSamples_decontx"))
loops <- get_p2g_loops(global_atac_proj)
p <- trackplot_helper_v2c(
region=region,
clusters=global_bp_obj$cell_metadata$organ,
fragments=global_bp_obj$frags,
cell_read_counts=global_bp_obj$cell_metadata$nFrags,
transcripts=global_bp_obj$transcripts,
loops=GRanges(loops),
loop_color="Correlation",
loop_label = "P2G Links (Global)",
highlights = highlights
)
ggplot2::ggsave(plot=p, paste0(figout, "/plot_track_", region_name, "_globalp2g.png"), dpi=300, width=12, height=8, units="in")# region <- "chr16:104534-105616" # mm101
region <- "chr16:105080-105200" # zoomed in mm101
chrombp_outdir <- here::here("output/03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/")
finemo_param <- "counts_v0.23_a0.8_all"
global_bp_obj <- readRDS(here::here("output/05-misc/01/global_bp_obj.rds"))
# hits
cluster <- "Liver_c0" # for loading hits only
hits <- global_bp_obj$hits[[cluster]]# subset the global object
sub_meta <- global_bp_obj$cell_metadata %>%
dplyr::filter(organ=="Liver")
sub_meta$archive_L2_clusterName <- factor(sub_meta$archive_L2_clusterName, levels=c( "erythroblast", "cycling erythroblast", "early erythroblast", "Kupffer", "B cell progenitor", "cholangiocyte", "endothelial", "hepatocyte", "megakaryocyte", "stellate"))track_contribs_list <- list()
for (L2_cluster in levels(sub_meta$archive_L2_clusterName)){
# L2_cluster <- "erythroblast"
print(L2_cluster)
L1_clusters <- sub_meta[sub_meta$archive_L2_clusterName==L2_cluster, "Cluster_chrombpnet"] %>% unique
bw_contrib_list <- list()
for (clust in L1_clusters){
# load contribution scores
print(paste0("loading contrib scores from ", clust))
bw_contrib_list[[clust]] <- rtracklayer::import.bw(glue(here::here("output/03-chrombpnet/01-models/contribs/bias_Heart_c0_thresh0.4/{clust}/average_shaps.counts.bw")), selection=BigWigSelection(GRanges(region))) %>% as.data.frame
}
tmp <- do.call(rbind, bw_contrib_list)
bw_contrib <- tmp %>% dplyr::group_by(seqnames,start,end, width, strand) %>% dplyr::summarise(score=mean(score)) %>% GRanges
track_contribs_list[[L2_cluster]] <- trackplot_contribs_BL(bw_contrib,
region = region,
track_label = NULL,
facet_label = L2_cluster,
genome = BSgenome.Hsapiens.UCSC.hg38,
rel_height = 1,
ymax = 0.25, ymin=-0.039,
) + xlab(NULL)
}## [1] "erythroblast"
## [1] "loading contrib scores from Liver_c2"
## [1] "loading contrib scores from Liver_c8"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
## [1] "cycling erythroblast"
## [1] "loading contrib scores from Liver_c0"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
## [1] "early erythroblast"
## [1] "loading contrib scores from Liver_c5"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
## [1] "Kupffer"
## [1] "loading contrib scores from Liver_c7"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
## [1] "B cell progenitor"
## [1] "loading contrib scores from Liver_c11"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
## [1] "cholangiocyte"
## [1] "loading contrib scores from Liver_c13"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
## [1] "endothelial"
## [1] "loading contrib scores from Liver_c10"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
## [1] "hepatocyte"
## [1] "loading contrib scores from Liver_c1"
## [1] "loading contrib scores from Liver_c3"
## [1] "loading contrib scores from Liver_c4"
## [1] "loading contrib scores from Liver_c6"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
## [1] "megakaryocyte"
## [1] "loading contrib scores from Liver_c12"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
## [1] "stellate"
## [1] "loading contrib scores from Liver_c9"
## `summarise()` has grouped output by 'seqnames', 'start', 'end', 'width'. You
## can override using the `.groups` argument.
## @ plotting basepair-level contribs for width 121
## Scale for x is already present. Adding another scale for x, which will replace
## the existing scale.
# gene annotation
track_genes <- trackplot_gene(global_bp_obj$transcripts, region) +
ggplot2::guides(color = "none")
# scale bar for the top
scale_plot <- trackplot_scalebar(region)
track_hits <- trackplot_genome_annotation(loci = hits,
region = region,
color_by = "score",
show_strand = T,
colors = c("white", "red"),
label_size = 3,
label_by = "name",
track_label = "Instances")
plot_list <- list(track_genes + highlight_region(region, alpha = 0.4, color = "yellow"), track_hits)
p <- trackplot_combine(c(list(scale_plot), track_contribs_list, plot_list),
title = str_to_pretty(region)) &
ggplot2::theme(legend.direction = "vertical")
p# ggsave(plot=p, paste0(figout, "contrib_scores_mm101.pdf"), width=8, height=6).libPaths()## [1] "/oak/stanford/groups/wjg/bliu/software/R_lib"
## [2] "/share/software/user/open/R/4.1.2/lib64/R/library"
sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /share/software/user/open/openblas/0.3.10/lib/libopenblas_haswellp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ggthemes_4.2.4 RColorBrewer_1.1-3
## [3] ggrastr_1.0.1 GenomicFeatures_1.46.5
## [5] AnnotationDbi_1.56.2 ggseqlogo_0.1
## [7] BPCells_0.2.0 SeuratObject_4.1.3
## [9] Seurat_4.3.0 BSgenome.Hsapiens.UCSC.hg38_1.4.4
## [11] BSgenome_1.62.0 Biostrings_2.62.0
## [13] XVector_0.34.0 ComplexHeatmap_2.10.0
## [15] rhdf5_2.38.1 SummarizedExperiment_1.24.0
## [17] Biobase_2.54.0 MatrixGenerics_1.6.0
## [19] Rcpp_1.0.10 Matrix_1.5-4
## [21] matrixStats_0.63.0 data.table_1.14.8
## [23] stringr_1.5.0 plyr_1.8.8
## [25] magrittr_2.0.3 gtable_0.3.3
## [27] gtools_3.9.4 gridExtra_2.3
## [29] ArchR_1.0.2 rtracklayer_1.54.0
## [31] GenomicRanges_1.46.1 GenomeInfoDb_1.30.1
## [33] IRanges_2.28.0 S4Vectors_0.32.4
## [35] BiocGenerics_0.40.0 cowplot_1.1.1
## [37] tibble_3.2.1 readr_2.1.4
## [39] glue_1.6.2 purrr_1.0.2
## [41] ggplot2_3.5.0 tidyr_1.3.1
## [43] dplyr_1.1.4 here_1.0.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 spatstat.explore_3.1-0 reticulate_1.28
## [4] tidyselect_1.2.0 RSQLite_2.3.1 htmlwidgets_1.6.2
## [7] BiocParallel_1.28.3 Rtsne_0.16 munsell_0.5.0
## [10] codetools_0.2-18 ica_1.0-3 future_1.33.2
## [13] miniUI_0.1.1.1 withr_3.0.0 spatstat.random_3.1-4
## [16] colorspace_2.1-0 progressr_0.13.0 filelock_1.0.2
## [19] highr_0.10 knitr_1.42 ROCR_1.0-11
## [22] ggsignif_0.6.4 tensor_1.5 listenv_0.9.0
## [25] labeling_0.4.2 GenomeInfoDbData_1.2.7 polyclip_1.10-4
## [28] farver_2.1.1 bit64_4.0.5 rprojroot_2.0.3
## [31] parallelly_1.35.0 vctrs_0.6.5 generics_0.1.3
## [34] xfun_0.39 BiocFileCache_2.2.1 R6_2.5.1
## [37] doParallel_1.0.17 ggbeeswarm_0.7.2 clue_0.3-64
## [40] bitops_1.0-7 rhdf5filters_1.6.0 spatstat.utils_3.0-2
## [43] cachem_1.0.8 DelayedArray_0.20.0 vroom_1.6.3
## [46] promises_1.2.0.1 BiocIO_1.4.0 scales_1.3.0
## [49] beeswarm_0.4.0 globals_0.16.2 goftest_1.2-3
## [52] rlang_1.1.3 GlobalOptions_0.1.2 splines_4.1.2
## [55] rstatix_0.7.2 lazyeval_0.2.2 broom_1.0.5
## [58] spatstat.geom_3.1-0 yaml_2.3.7 reshape2_1.4.4
## [61] abind_1.4-5 backports_1.4.1 httpuv_1.6.10
## [64] tools_4.1.2 ellipsis_0.3.2 jquerylib_0.1.4
## [67] ggridges_0.5.4 progress_1.2.2 zlibbioc_1.40.0
## [70] RCurl_1.98-1.12 prettyunits_1.1.1 ggpubr_0.6.0
## [73] deldir_1.0-6 pbapply_1.7-0 GetoptLong_1.0.5
## [76] zoo_1.8-12 ggrepel_0.9.5 cluster_2.1.2
## [79] magick_2.7.3 scattermore_1.0 circlize_0.4.15
## [82] lmtest_0.9-40 RANN_2.6.1 fitdistrplus_1.1-11
## [85] hms_1.1.3 patchwork_1.2.0 mime_0.12
## [88] evaluate_0.21 xtable_1.8-4 XML_3.99-0.14
## [91] shape_1.4.6 biomaRt_2.50.3 compiler_4.1.2
## [94] KernSmooth_2.23-20 crayon_1.5.2 htmltools_0.5.5
## [97] later_1.3.1 tzdb_0.4.0 DBI_1.1.3
## [100] dbplyr_2.3.2 rappdirs_0.3.3 MASS_7.3-54
## [103] car_3.1-0 cli_3.6.2 parallel_4.1.2
## [106] igraph_1.4.2 pkgconfig_2.0.3 GenomicAlignments_1.30.0
## [109] sp_1.6-0 plotly_4.10.1 spatstat.sparse_3.0-1
## [112] xml2_1.3.4 foreach_1.5.2 vipor_0.4.5
## [115] bslib_0.4.2 digest_0.6.31 sctransform_0.3.5
## [118] RcppAnnoy_0.0.20 spatstat.data_3.0-1 rmarkdown_2.21
## [121] leiden_0.4.2 uwot_0.1.14 curl_5.0.0
## [124] restfulr_0.0.15 shiny_1.7.4 Rsamtools_2.10.0
## [127] rjson_0.2.21 lifecycle_1.0.3 nlme_3.1-153
## [130] jsonlite_1.8.4 Rhdf5lib_1.16.0 carData_3.0-5
## [133] viridisLite_0.4.2 fansi_1.0.4 pillar_1.9.0
## [136] lattice_0.20-45 KEGGREST_1.34.0 fastmap_1.1.1
## [139] httr_1.4.6 survival_3.2-13 png_0.1-8
## [142] iterators_1.0.14 bit_4.0.5 stringi_1.7.12
## [145] sass_0.4.6 blob_1.2.4 memoise_2.0.1
## [148] irlba_2.3.5.1 future.apply_1.10.0