Introduction¶

Here, we show various use cases of the pre-computed ChromBPNet-related analysis products (i.e. accessibility predictions, contribution scores, and motif instances). Then, we show how to load the trained cell type-specific ChromBPNet model to make new predictions.

We make heavy use of the tangermeme and bpnetlite packages by Jacob Schreiber for working with trained models in pytorch. Please see the tangermeme documentation for more ways to use ChromBPNet models.

See our more detailed documentation on analysis products and data types here.

Run in the tangermeme environment on lab cluster.

Setup¶

%load_ext autoreload
%autoreload 2

# deal with matplotlib plots
%matplotlib inline

# display all outputs in a cell
get_ipython().ast_node_interactivity = 'all'

from IPython.display import display, HTML

# misc
import sys
import os
import numpy as np
import random
from tqdm import tqdm
import pandas as pd

# io
import h5py
from pyfaidx import Fasta

# ML
import bpnetlite
from bpnetlite import ChromBPNet
from bpnetlite import BPNet
from bpnetlite.bpnet import CountWrapper
from bpnetlite.attribute import deep_lift_shap

import torch


# plotting
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn; seaborn.set_style('white')
import matplotlib.ticker as ticker
import plotnine as pn
import logomaker
import tangermeme
from tangermeme.plot import plot_logo

from tangermeme.utils import characters
from tangermeme.utils import one_hot_encode
from tangermeme import io


print(f"numpy version: {np.__version__}")
print(f"tangermeme version: {tangermeme.__version__}")
print(f"bpnetlite version: {bpnetlite.__version__}")
print(f"torch version: {torch.__version__}")

# os.environ['CUDA_VISIBLE_DEVICES']='0'
os.environ['CUDA_VISIBLE_DEVICES']='MIG-40f43250-998e-586a-ac37-d6520e92590f'

torch.manual_seed(100)

# editable text in PDFs
# https://stackoverflow.com/a/54111532
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

numpy version: 2.0.2
tangermeme version: 0.4.3
bpnetlite version: 0.8.1
torch version: 2.3.1+cu121

<torch._C.Generator at 0x7f90176f38f0>

Check we have GPU access:

torch.cuda.device_count()

dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
dev

torch.cuda.is_available()

t1 = torch.randn(1,2).to(dev)
t1
t1.device

device(type='cuda')

True

tensor([[ 0.3607, -0.2859]], device='cuda:0')

device(type='cuda', index=0)

Variables¶

ChromBPNet models take in sequences of length INPUTLEN (2,114 bp) and return accessibility in the central OUTPUTLEN (1,000 bp) region of the input.

Thus, we define some helpful variables below. We will use these to compute relative coordinates in some cases for plotting tracks which are genomically aligned.

# ChromBPNet input and output window sizes, in bp
INPUTLEN = 2114
OUTPUTLEN = 1000
MIDPOINT = INPUTLEN // 2

# offset of the first output base within the input window; derived from the
# two lengths above (integer division) so the constants cannot drift apart
SHIFT_REL_INPUT = np.int64((INPUTLEN - OUTPUTLEN) // 2)
SHIFT_REL_INPUT

# column names used when reading narrowPeak files; the six middle columns
# are placeholders and only chr/start/end/summit are referred to by name
NARROWPEAK_SCHEMA = ["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]

LOGO_ALPHABET = 'ACGT'

# finemo colorscheme
LOGO_COLORS = {"A": '#109648', "C": '#255C99', "G": '#F7B32B', "T": '#D62839'}

# logomaker colorscheme
LOGO_COLORS2 = {
    'G': [1, .65, 0],
    'T': [1, 0, 0],
    'C': [0, 0, 1],
    'A': [0, .5, 0],
}

# non-ACGT characters passed as `ignore` when one-hot encoding,
# one character per list entry
CHAR_IGNORE = list('QWERYUIOPSDFHJKLZXVBNM')

np.int64(557)

Paths¶

# project input and output roots are stored on the first two lines of this file
with open("../DURGA_DIRS.txt", 'r') as dirs_file:
    proj_in, proj_out = (dirs_file.readline().strip() for _ in range(2))

# root of the shared reference directory is the first line of this file
with open("../AK_PROJ_DIR.txt", 'r') as ref_dir_file:
    kundaje_dir = ref_dir_file.readline().strip()

# hg38 reference FASTA, opened with pyfaidx's Fasta
genome_fa = os.path.join(
    kundaje_dir,
    "refs/hg38/chrombpnet/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta",
)
hg38 = Fasta(genome_fa)

Load and visualize pre-computed predictions and contribution scores¶

We provide peaks used for model training, accessibility predictions, and per-nucleotide contribution scores for each peak in every cell type. We can load those outputs to inspect predicted accessibility and contribution scores at peaks of interest.

In this section, we point to internal local paths, but each of these files can be obtained from our data repositories on Zenodo.

We use the extract_loci function from tangermeme, which loads the peaks as one-hot encoded sequences, along with the outputs (predictions, contribution scores) for each sequence.

# path to bed file for peaks
peaks_bed = os.path.join(proj_in, "00-inputs/chrombpnet_peaks/Brain_c0__peaks_bpnet.narrowPeak.gz")

# path to bigwig with bias-corrected predictions
# (Brain_c0: must be the same cell type as the peaks and contribution scores)
preds_bw    = os.path.join(proj_in, "01-models/predictions/bias_corrected/Brain_c0_avg_chrombpnet_nobias.bw")

# path to bigwig with contribution scores
contribs_bw = os.path.join(proj_in, "01-models/contribs/bias_Heart_c0_thresh0.4/Brain_c0/average_shaps.counts.bw")

# one-hot encode the sequence at every peak and read both signal tracks at the
# same loci; the order of `signals` fixes the channel order of y_peaks
# (channel 0: predictions, channel 1: contribution scores)
X_peaks, y_peaks = io.extract_loci(peaks_bed, genome_fa,
                                   signals = [preds_bw, contribs_bw],
                                   ignore = CHAR_IGNORE)

The X_peaks variable contains the one-hot encoded sequences, with shape (num_peaks, 4, INPUTLEN).

X_peaks.shape
X_peaks[0]

torch.Size([117140, 4, 2114])

tensor([[0, 0, 0,  ..., 0, 1, 0],
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int8)

The y_peaks variable contains scalar values for predicted accessibility and contribution score at each nucleotide, with shape (num_peaks, 2, OUTPUTLEN).

y_peaks.shape
y_peaks[0]

torch.Size([117140, 2, 1000])

tensor([[ 0.0043,  0.0043,  0.0043,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0009, -0.0006, -0.0006]])

We also load in the peak coordinates:

X_peaks_coords = pd.read_csv(peaks_bed, sep='\t', header=None, names = NARROWPEAK_SCHEMA)
X_peaks_coords.head()

	chr	start	end	1	2	3	4	5	6	summit
0	chr1	9641	10641	.	.	.	.	.	.	500
1	chr1	9905	10905	.	.	.	.	.	.	500
2	chr1	180976	181976	.	.	.	.	.	.	500
3	chr1	777919	778919	.	.	.	.	.	.	500
4	chr1	778224	779224	.	.	.	.	.	.	500

We also load in the contribution scores in h5 format, which will allow us to plot them as sequence logos.

# load contribution scores (opened read-only); the path is built with os.path.join,
# like the other inputs, so it does not depend on proj_in ending in a separator
y_contribs = h5py.File(
    os.path.join(proj_in, "01-models/contribs/bias_Heart_c0_thresh0.4/Brain_c0/average_shaps.counts.h5"),
    "r")
y_contribs.keys()

<KeysViewHDF5 ['projected_shap', 'raw', 'shap']>

We also load in the hits aka predictive motif instances:

# table of motif hits for Brain_c0, one row per hit; the path is built with
# os.path.join so it does not depend on proj_out ending in a separator
instances = pd.read_csv(
    os.path.join(proj_out, "03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/Brain_c0/counts_v0.23_a0.8_all/hits_unique.reconciled.annotated.tsv.gz"),
    sep = "\t")
instances.head()

# print column names
instances.columns

	seqnames	start	end	width	strand	start_untrimmed	end_untrimmed	motif_name	source	hit_coefficient	...	motif_name_unlabeled	pattern_class	distToGeneStart	nearestGene	peakType	distToTSS	nearestTSS	GC	distToPeakSummit
0	chr1	10175	10180	5	-	10158	10188	456\|ZEB/SNAI	1	0.961838	...	neg_patterns.neg.Average_12__merged_pattern_0	neg_patterns	1690	DDX11L1	Promoter	1690	ENST00000456328.2	0.4000	36
1	chr1	10503	10509	6	-	10497	10527	456\|ZEB/SNAI	2	0.985665	...	neg_patterns.neg.Average_12__merged_pattern_1	neg_patterns	1362	DDX11L1	Promoter	1362	ENST00000456328.2	0.6667	100
2	chr1	10555	10561	6	+	10545	10575	4\|BCL11A/Brepressive	2	0.432991	...	neg_patterns.neg.Thymus__Average_6__merged_pat...	neg_patterns	1310	DDX11L1	Promoter	1310	ENST00000456328.2	0.5000	152
3	chr1	10566	10575	9	+	10560	10590	436\|SP/KLF	2	6.695272	...	pos_patterns.pos.Average_212__merged_pattern_0	pos_patterns	1297	DDX11L1	Promoter	1297	ENST00000456328.2	0.8889	165
4	chr1	10642	10652	10	-	10629	10659	400\|NRF1	2	3.583460	...	pos_patterns.pos.Average_159__merged_pattern_0	pos_patterns	1221	DDX11L1	Promoter	1221	ENST00000456328.2	0.9000	241

5 rows × 24 columns

Index(['seqnames', 'start', 'end', 'width', 'strand', 'start_untrimmed',
       'end_untrimmed', 'motif_name', 'source', 'hit_coefficient',
       'hit_correlation', 'hit_importance', 'peak_name', 'peak_id',
       'motif_name_unlabeled', 'pattern_class', 'distToGeneStart',
       'nearestGene', 'peakType', 'distToTSS', 'nearestTSS', 'GC', 'N',
       'distToPeakSummit'],
      dtype='object')

The extracted sequences have length 2,114 because of the default in_window parameter of tangermeme.io.extract_loci, which matches the input length of ChromBPNet.

Therefore, the start of the 1,000 bp peak within the 2,114 bp window should be position (2114 - 1000) / 2 = 557. We can confirm that the next 10 bases of the one-hot encoded sequence match what we see at the beginning of the peak in the reference:

# pick the peak at row index 31861 of the peak table
i = 31861
peak_i = X_peaks_coords.iloc[i]

# first 10 bp of the peak, read directly from the reference sequence
hg38[peak_i['chr']][peak_i['start']:peak_i['start'] + 10].seq

# the same 10 bp, decoded back to nucleotides from the one-hot encoded input window
characters(X_peaks[i][:, 557:567])

'GCCTGCAAGG'

'GCCTGCAAGG'

Now we can plot the predictions and contribution scores for a peak of interest near FOXG1.

Let's also get the motif instances in this region:

# motif hits that fall in the selected peak
instances_local = instances[instances['peak_id']==i]
instances_local

# convert the hit coordinates to positions relative to the 2,114 bp input window
# for annotation; take an explicit copy so the assignments below modify an
# independent frame instead of a slice of `instances`
peak_start = X_peaks_coords.iloc[i]['start']
annotations = instances_local[['start', 'end', 'motif_name', 'hit_coefficient', 'strand']].copy()
annotations['start'] = annotations['start'] - peak_start + SHIFT_REL_INPUT
annotations['end'] = annotations['end'] - peak_start + SHIFT_REL_INPUT

# rename hit_coefficient to score
annotations = annotations.rename(columns={'hit_coefficient': 'score'})

# reorder the columns so that the motif name comes first
annotations = annotations[['motif_name', 'start', 'end', 'strand', 'score']]

	seqnames	start	end	width	strand	start_untrimmed	end_untrimmed	motif_name	source	hit_coefficient	...	motif_name_unlabeled	pattern_class	distToGeneStart	nearestGene	peakType	distToTSS	nearestTSS	GC	distToPeakSummit
545198	chr14	28766432	28766440	8	+	28766420	28766450	157\|ETS:ELF/ETV#1	1	0.134989	...	pos_patterns.pos.Average_306__merged_pattern_0	pos_patterns	350	FOXG1	Promoter	350	ENST00000313071.7	0.5000	293
545199	chr14	28766664	28766671	7	-	28766650	28766680	367\|NFY	2	89.228280	...	pos_patterns.pos.Average_276__merged_pattern_1	pos_patterns	118	FOXG1	Promoter	118	ENST00000313071.7	0.5714	61
545200	chr14	28766709	28766716	7	-	28766695	28766725	367\|NFY	2	69.797290	...	pos_patterns.pos.Average_276__merged_pattern_1	pos_patterns	73	FOXG1	Promoter	73	ENST00000313071.7	0.4286	16

3 rows × 24 columns

We highlight a region which we will zoom into, and we see two nice NFY sites standing out.

fig, ax = plt.subplots(3, 1, figsize = (11, 6))

# x-axis ticks shared by the two 1,000 bp tracks, labelled relative to the peak center
track_ticks = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 999]
track_labels = [-500, -400, -300, -200, -100, 0, +100, +200, +300, +400, +500]

# bounds (within the 1,000 bp output window) of the region that is highlighted
# in the tracks and shown as a sequence logo in the bottom panel
zoom_lo, zoom_hi = 400, 600

# top panel: pre-computed bias-corrected predicted profile for peak i
ax[0].plot(y_peaks[i][0], label = "bias-corrected", color = 'red', linewidth = 1)
ax[0].set_ylabel("Predicted profile")
ax[0].legend(fontsize = 8)
ax[0].axvspan(zoom_lo, zoom_hi, color = 'yellow', alpha = 0.4)
ax[0].set_xticks(ticks = track_ticks, labels = track_labels)

# middle panel: pre-computed per-base contribution scores for peak i
ax[1].plot(y_peaks[i][1], label = "contrib.", color = "blue", linewidth = 1)
ax[1].set_ylabel("Contribution scores")
ax[1].legend(fontsize = 8)
ax[1].axvspan(zoom_lo, zoom_hi, color = 'yellow', alpha = 0.4)
ax[1].set_xticks(ticks = track_ticks, labels = track_labels)

# bottom panel: contribs in the highlighted 200 bp as a sequence logo; the h5 scores
# are indexed in input-window coordinates, hence the SHIFT_REL_INPUT offset
tangermeme.plot.plot_logo(
    np.float64(y_contribs['projected_shap']['seq'][i]),
    start = SHIFT_REL_INPUT + zoom_lo,
    end = SHIFT_REL_INPUT + zoom_hi,
    ax = ax[2],
    # add in the annotations from the motif instances
    annotations = annotations)
ax[2].set_ylabel("Contrib. (zoom)")
ax[2].set_ylim(-0.05, 0.1)
ax[2].set_xticks(
    ticks = [0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 199],
    labels = [-100, -80, -60, -40, -20, 0, 20, 40, 60, 80, 100])

# x-axis label on the last panel, and a figure-level title with the peak coordinates
plt.xlabel("Relative genomic position")
peak_row = X_peaks_coords.iloc[i]
plt.suptitle(f"peak: {peak_row['chr']}:{peak_row['start']}-{peak_row['end']} in Brain_c0", fontsize = 12)
plt.tight_layout()
plt.show();

This matches what we see in the genome browser at this locus:

Load a model¶

We can download the tar file of trained models for each cell type from Zenodo:

part 1: https://doi.org/10.5281/zenodo.15048277
part 2: https://doi.org/10.5281/zenodo.15200486

And then extract the models:

$ tar -xvf Brain_c0.gz
x Brain_c0/
x Brain_c0/Brain_c0__fold_0__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_2__chrombpnet.h5
x Brain_c0/Brain_c0__fold_4__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_4__chrombpnet.h5
x Brain_c0/Brain_c0__fold_3__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_1__chrombpnet.h5
x Brain_c0/Brain_c0__fold_0__chrombpnet_nobias.h5
x Brain_c0/Brain_c0__fold_0__chrombpnet.h5
x Brain_c0/Brain_c0__fold_3__chrombpnet.h5
x Brain_c0/Brain_c0__fold_1__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_2__chrombpnet_nobias.h5
x Brain_c0/Brain_c0__fold_4__chrombpnet_nobias.h5
x Brain_c0/Brain_c0__fold_2__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_1__chrombpnet_nobias.h5
x Brain_c0/Brain_c0__fold_3__chrombpnet_nobias.h5
$ ls Brain_c0
Brain_c0__fold_0__bias_model_scaled.h5 Brain_c0__fold_2__chrombpnet.h5
Brain_c0__fold_0__chrombpnet_nobias.h5 Brain_c0__fold_3__bias_model_scaled.h5
Brain_c0__fold_0__chrombpnet.h5        Brain_c0__fold_3__chrombpnet_nobias.h5
Brain_c0__fold_1__bias_model_scaled.h5 Brain_c0__fold_3__chrombpnet.h5
Brain_c0__fold_1__chrombpnet_nobias.h5 Brain_c0__fold_4__bias_model_scaled.h5
Brain_c0__fold_1__chrombpnet.h5        Brain_c0__fold_4__chrombpnet_nobias.h5
Brain_c0__fold_2__bias_model_scaled.h5 Brain_c0__fold_4__chrombpnet.h5
Brain_c0__fold_2__chrombpnet_nobias.h5

Generally, we will want to use *_chrombpnet_nobias.h5 models for all downstream work, which produces bias-corrected model predictions. See our documentation here for more details.

To load a pre-trained ChromBPNet model, we can use the BPNet class from the bpnetlite package. The ChromBPNet models return two outputs: the profile predictions, as a vector of logits over 1,000 bp, and the log counts, as a scalar value.

To convert the logits to accessibility profiles, we softmax the logits and scale by the exponentiated log counts. Throughout the code, we use a pytorch wrapper implemented in code/03-chrombpnet/tangermeme_utils/wrappers.py, but for convenience, we show the wrapper here:

class ChromBPNetWrapper(torch.nn.Module):
    """Wrap a bias-corrected ChromBPNet model so it emits scaled profiles.

    The wrapped model is expected to be the BPNet loaded from a bias-corrected
    ChromBPNet model (i.e. `chrombpnet_nobias.h5`), _not_ the full
    accessibility model. Its first output is treated as profile logits and
    its second output as predicted log counts.

    The wrapper keeps the output structure of the underlying model, but
    replaces the profile logits with their softmax multiplied by the
    exponentiated log counts. This is convenient for plotting, since tracks
    such as bigwigs show the scaled profile rather than unnormalized logits.

    Parameters
    ----------
    model: torch.nn.Module
        The torch model to wrap.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, X):
        """Run the wrapped model on X and return (scaled profile, log counts).

        The log counts are passed through unchanged.
        """
        outputs = self.model(X)
        logits, log_counts = outputs[0], outputs[1]

        # turn the logits into a probability distribution over the last axis
        probs = torch.softmax(logits, dim = -1)

        # exponentiate ln(counts) and add a trailing axis at position 2 so the
        # totals broadcast across the profile
        totals = torch.exp(log_counts).unsqueeze(2)

        return probs * totals, log_counts

First, we load the model for one fold:

# weights of the bias-corrected model for fold 0
nobias_h5 = os.path.join(proj_in, "01-models/models/bias_Heart_c0_thresh0.4/Brain_c0/fold_0/models/chrombpnet_nobias.h5")

# load the weights as a BPNet and wrap it so that predicted profiles are
# returned scaled by the predicted counts
brain_model = ChromBPNetWrapper(BPNet.from_chrombpnet(filename=nobias_h5))

Predict and interpret for a region of interest¶

Now we can run a forward pass through the model to get predicted outputs. Let's use a different peak region, near the SLC17A7 gene.

Check out the tutorial on prediction from tangermeme for more details: https://tangermeme.readthedocs.io/en/latest/tutorials/Tutorial_A3_Deep_LIFT_SHAP.html

We use "contribution", "attribution", "interpretation", and "importance" interchangeably.

j = 54288
y_1 = tangermeme.predict.predict(brain_model, X_peaks[j].unsqueeze(0).float())

As expected, the model returns two values, one for each model head: the profiles, and the counts. Since we used our model wrapper, the profile values are already scaled by the exponentiated log counts and represent predicted accessibility profiles. The counts are predicted log counts.

len(y_1)
y_1[0].shape
y_1[1].shape

torch.Size([1, 1, 1000])

torch.Size([1, 1])

Next we run model interpretation. Note that for convenience in calculating the attributions, we need to use the CountWrapper class from bpnet-lite to get the counts predictions only, since the attributions algorithm (DeepLIFT/SHAP) can only explain a single output at a time.

We also use bpnetlite.attribute.deep_lift_shap as our attribution/model interpretation function, which automatically registers the layers that are necessary for using ChromBPNet models.

Check out the tangermeme tutorial on interpretation for more details: https://tangermeme.readthedocs.io/en/latest/tutorials/Tutorial_A3_Deep_LIFT_SHAP.html

# load the same fold-0 bias-corrected model again, this time wrapped in CountWrapper
# so that only the counts prediction is exposed to the attribution method
brain_model_counts = CountWrapper(BPNet.from_chrombpnet(
    filename=os.path.join(proj_in, "01-models/models/bias_Heart_c0_thresh0.4/Brain_c0/fold_0/models/chrombpnet_nobias.h5")))

# we need to pass a batch of shape (n_seqs, 4, INPUTLEN), so we add an extra
# leading dimension here since we're just passing in a single sequence
attr_1 = deep_lift_shap(brain_model_counts, X_peaks[j].unsqueeze(0).float(), random_state=0, print_convergence_deltas=True)

tensor([4.5419e-04, 1.3351e-05, 2.0528e-04, 1.1563e-04, 1.2207e-04, 1.8907e-04,
        2.4796e-05, 2.8729e-04, 3.2043e-04, 1.3494e-04, 1.0395e-04, 2.5105e-04,
        2.1458e-05, 9.5367e-06, 3.8862e-04, 1.4544e-04, 1.8716e-04, 2.6727e-04,
        1.1206e-04, 2.2173e-04], device='cuda:0', grad_fn=<AbsBackward0>)

As expected, every nucleotide in the input region (INPUTLEN, 2,114 bp) is assigned a contribution score to the output; therefore, the output is of shape (num_peaks, 4, INPUTLEN).

attr_1.shape
attr_1.sum(axis = 1).shape

torch.Size([1, 4, 2114])

torch.Size([1, 2114])

We can make a similar plot as above, with the newly calculated predictions and contribution scores.

y_1[0].squeeze(0).T.shape
attr_1.sum(axis = 1).T.shape
attr_1.shape

torch.Size([1000, 1])

torch.Size([2114, 1])

torch.Size([1, 4, 2114])

In this case, we can see a few sites popping up in the contribution tracks, including a BHLH site (E-box) in the center (CAGATG), which contributes the most to accessibility in this region.

# three stacked panels: predicted profile, contribution scores, zoomed sequence logo
fig, ax = plt.subplots(3, 1, figsize = (11, 6))

# plot the newly predicted profile for peak j, transposed to shape (1000, 1)
ax[0].plot(y_1[0].squeeze(0).T, linewidth=1, label = "bias-corrected", color = 'red')
ax[0].set_ylabel("Predicted profile")
ax[0].legend(fontsize = 8)
# highlight the 200 bp region shown in the logo panel
ax[0].axvspan(400, 600, alpha=0.4, color='yellow')
ax[0].set_xticks(ticks = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 999], labels = [-500, -400, -300, -200, -100, 0, +100, +200, +300, +400, +500])

# contribs - the central 1,000 bp only to align with the output window
# (summed over the 4 nucleotide channels to get one value per position)
ax[1].plot(attr_1.sum(axis = 1).T[SHIFT_REL_INPUT:(SHIFT_REL_INPUT+1000), :], linewidth=1, label = "contrib.", color = "blue")
ax[1].set_ylabel("Contribution scores")
ax[1].legend(fontsize = 8)
ax[1].axvspan(400, 600, alpha=0.4, color='yellow')
ax[1].set_xticks(ticks = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 999], labels = [-500, -400, -300, -200, -100, 0, +100, +200, +300, +400, +500])

# plot the contribs in the central 200 bp as a sequence logo; the attributions span
# the full input window, so the bounds are offset by SHIFT_REL_INPUT
tangermeme.plot.plot_logo(np.float64(attr_1.squeeze(0)),
						  start = SHIFT_REL_INPUT + 400, end = SHIFT_REL_INPUT + 600, ax=ax[2])
ax[2].set_ylabel("Contrib. (zoom)")
ax[2].set_ylim(-0.05, 0.15)
ax[2].set_xticks(
	ticks = [0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 199],
	labels = [-100, -80, -60, -40, -20, 0, 20, 40, 60, 80, 100])

# put a title at the very top, giving the genomic coordinates of peak j
plt.xlabel("Relative genomic position")
plt.suptitle("peak: " + X_peaks_coords.iloc[j]['chr'] + ":" + str(X_peaks_coords.iloc[j]['start']) + "-" + str(X_peaks_coords.iloc[j]['end']) + " in Brain_c0", fontsize = 12)
plt.tight_layout()
plt.show();

Predict the effect of a variant¶

Let's start from the sequences in the same peak as above:

X_peaks_coords.iloc[j]

chr          chr19
start     49436891
end       49437891
1                .
2                .
3                .
4                .
5                .
6                .
summit         500
Name: 54288, dtype: object

# get the 2,114 bp reference sequence around the 1,000 bp peak, taking the
# chromosome from the peak table rather than hard-coding it; upper-case the
# sequence in case the FASTA is soft-masked (lower-case bases)
peak_j = X_peaks_coords.iloc[j]
ref_seq = str(hg38[peak_j['chr']][peak_j['start'] - SHIFT_REL_INPUT : peak_j['end'] + SHIFT_REL_INPUT]).upper()

# position of the variant within the 2,114 bp input window, and the same
# position relative to the 1,000 bp output window (the output window starts
# SHIFT_REL_INPUT bases into the input, so the offset is subtracted)
var_pos_input = 1057
var_pos_output = var_pos_input - SHIFT_REL_INPUT

# get the alt seq by substituting a single base at the variant position
alt_seq = (
    ref_seq[: var_pos_input]
    + "C"
    + ref_seq[var_pos_input + 1 :]
)

# convert to one-hot, adding a leading batch dimension
ref_ohe = one_hot_encode(ref_seq).type(torch.float32).unsqueeze(0)
alt_ohe = one_hot_encode(alt_seq).type(torch.float32).unsqueeze(0)

ref_ohe.shape
alt_ohe.shape

torch.Size([1, 4, 2114])

torch.Size([1, 4, 2114])

Predict accessibility for each allele:

y_ref = tangermeme.predict.predict(brain_model, ref_ohe)
y_alt = tangermeme.predict.predict(brain_model, alt_ohe)

Interpret counts output for each allele:

attr_ref = deep_lift_shap(brain_model_counts, ref_ohe, random_state=0)
attr_alt = deep_lift_shap(brain_model_counts, alt_ohe, random_state=0)

attr_ref.shape
attr_alt.shape

torch.Size([1, 4, 2114])

torch.Size([1, 4, 2114])

Plot the results:

attr_ref.shape

torch.Size([1, 4, 2114])

In this example, mutating a G to C in the E-box results in a decrease in predicted accessibility.

See our Methods section for the scores we use to quantify the effects of variants on accessibility. See also the ChromBPNet paper, Pampari et al, biorxiv 2024, for a more detailed discussion of techniques for quantifying variant effect.

# four stacked panels: predictions, contribution tracks, REF logo, ALT logo
fig, ax = plt.subplots(4, 1, figsize = (11, 6))

alpha = 0.3
contrib_flank = 70

# variant position within the 1,000 bp output window, computed locally from
# the input-window position
var_center = var_pos_input - SHIFT_REL_INPUT

# reshape the predictions
pred_ref = y_ref[0].squeeze()
pred_alt = y_alt[0].squeeze()

# get the attributions in the central 1,000 bp, as (position, nucleotide);
# drop the batch axis before transposing, since `.T` on a tensor that is not
# 2-D is deprecated in PyTorch
attr_ref_short = attr_ref.squeeze(0).T[SHIFT_REL_INPUT:(SHIFT_REL_INPUT + OUTPUTLEN)]
attr_alt_short = attr_alt.squeeze(0).T[SHIFT_REL_INPUT:(SHIFT_REL_INPUT + OUTPUTLEN)]

# prediction in 1,000 bp
ax[0].plot(pred_ref, linewidth = 0.5, label = "REF", color = "gray")
ax[0].plot(pred_alt, linewidth = 0.5, label = "ALT", color = "red")
ax[0].set_ylabel("Pred. \naccessibility")
ax[0].axvspan(var_center - contrib_flank, var_center + contrib_flank, alpha = alpha, color='yellow')
ax[0].legend(fontsize = 8)

# attribution in 1,000 bp (summed over nucleotides to get one value per position)
ax[1].plot(attr_ref_short.sum(axis = 1).squeeze(), linewidth = 0.5, label = "REF", color = "gray")
ax[1].plot(attr_alt_short.sum(axis = 1).squeeze(), linewidth = 0.5, label = "ALT", color = "red")
ax[1].set_ylabel("Contrib. \nscores")
ax[1].axvspan(var_center - contrib_flank, var_center + contrib_flank, alpha = alpha, color = 'yellow')

# zoomed contribs (REF); the logo starts contrib_flank bases before the variant,
# so the variant sits at x = contrib_flank within the panel
tangermeme.plot.plot_logo(np.float64(attr_ref.squeeze(0)), start = var_pos_input - contrib_flank, end = var_pos_input + contrib_flank, ax=ax[2], ylim = (-0.05, 0.5))
ax[2].set_ylim(-0.05, 0.15)
ax[2].axvspan(contrib_flank - 0.5, contrib_flank + 0.5, alpha = alpha - 0.1, color = 'gray')
ax[2].set_ylabel("Contrib. \nREF")

# zoomed contribs (ALT)
tangermeme.plot.plot_logo(np.float64(attr_alt.squeeze(0)), start = var_pos_input - contrib_flank, end = var_pos_input + contrib_flank, ax=ax[3], ylim = (-0.05, 0.5))
ax[3].set_ylim(-0.05, 0.15)
ax[3].axvspan(contrib_flank - 0.5, contrib_flank + 0.5, alpha = alpha - 0.1, color = 'red')
ax[3].set_ylabel("Contrib. \nALT")

# put a title at the very top
plt.suptitle("Predicted accessibility and contributions for example variant", fontsize = 12)
plt.xlabel("Relative genomic position (bp)")
plt.tight_layout()

plt.show();

Usage examples for ChromBPNet outputs and trained models

2025-04-24