Introduction

Here, we show some various use cases of the pre-computed ChromBPNet-related analysis products (i.e. accessibility predictions, contribution scores, and motif istances). Then, we show how to load the trained cell type-specific ChromBPNet model to make new predictions.

We make heavy use of the tangermeme and bpnetlite packages by Jacob Schreiber for working with trained models in pytorch. Please see the tangermeme documentation for more ways to use ChromBPNet models.

See our more detailed documentation on analysis products and data types here.

Run in the tangermeme environment on lab cluster.

Setup

%load_ext autoreload
%autoreload 2

# deal with matplotlib plots
%matplotlib inline

# display all outputs in a cell
get_ipython().ast_node_interactivity = 'all'

from IPython.display import display, HTML
# misc
import sys
import os
import numpy as np
import random
from tqdm import tqdm
import pandas as pd

# io
import h5py
from pyfaidx import Fasta

# ML
import bpnetlite
from bpnetlite import ChromBPNet
from bpnetlite import BPNet
from bpnetlite.bpnet import CountWrapper
from bpnetlite.attribute import deep_lift_shap

import torch


# plotting
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn; seaborn.set_style('white')
import matplotlib.ticker as ticker
import plotnine as pn
import logomaker
import tangermeme
from tangermeme.plot import plot_logo

from tangermeme.utils import characters
from tangermeme.utils import one_hot_encode
from tangermeme import io


print(f"numpy version: {np.__version__}")
print(f"tangermeme version: {tangermeme.__version__}")
print(f"bpnetlite version: {bpnetlite.__version__}")
print(f"torch version: {torch.__version__}")

# os.environ['CUDA_VISIBLE_DEVICES']='0'
os.environ['CUDA_VISIBLE_DEVICES']='MIG-40f43250-998e-586a-ac37-d6520e92590f'

torch.manual_seed(100)

# editable text in PDFs
# https://stackoverflow.com/a/54111532
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42
numpy version: 2.0.2
tangermeme version: 0.4.3
bpnetlite version: 0.8.1
torch version: 2.3.1+cu121
<torch._C.Generator at 0x7f90176f38f0>

Check we have GPU access:

torch.cuda.device_count()
1
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
dev

torch.cuda.is_available()

t1 = torch.randn(1,2).to(dev)
t1
t1.device
device(type='cuda')
True
tensor([[ 0.3607, -0.2859]], device='cuda:0')
device(type='cuda', index=0)

Variables

ChromBPNet models take in sequences of length INPUTLEN (2,114 bp) and returns accessibility in the central OUTPUTLEN (1,000 bp) region of the input.

Thus, we define some helpful variables below. We will use these to compute relative coordinates in some cases for plotting tracks which are genomically aligned.

INPUTLEN = 2114
OUTPUTLEN = 1000
MIDPOINT = INPUTLEN // 2
SHIFT_REL_INPUT = np.int64((2114 - 1000) / 2)
SHIFT_REL_INPUT

NARROWPEAK_SCHEMA = ["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]

LOGO_ALPHABET = 'ACGT'

# finemo colorscheme
LOGO_COLORS = {"A": '#109648', "C": '#255C99', "G": '#F7B32B', "T": '#D62839'}

# logomaker colorscheme
LOGO_COLORS2= {
        'G': [1, .65, 0],
        'T': [1, 0, 0],
        'C': [0, 0, 1],
        'A': [0, .5, 0]
    }

CHAR_IGNORE = ['QWERYUIOPSDFHJKLZXVBNM']
np.int64(557)

Paths

with open("../DURGA_DIRS.txt", 'r') as f:
	proj_in = f.readline().strip()
	proj_out = f.readline().strip()

with open("../AK_PROJ_DIR.txt", 'r') as f:
    kundaje_dir = f.readline().strip()
genome_fa = os.path.join(kundaje_dir, "refs/hg38/chrombpnet/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta")
hg38 = Fasta(genome_fa)

Load and visualize pre-computed predictions and contribution scores

We provide peaks used for model training, accessibility predictions, and per-nucleotide contribution scores for each peak in every cell type. We can load those outputs to inspect predicted accessibility and contribution scores at peaks of interest.

In this section, we point to internal local paths, but each of these files can be obtained from our data repositories on Zenodo.

We use the the extract_loci function from tangermeme, which loads the peaks as one-hot encoded sequences, along with the outputs (predictions, contribution scores) for each sequence.

# path to bed file for peaks
peaks_bed = os.path.join(proj_in, "00-inputs/chrombpnet_peaks/Brain_c0__peaks_bpnet.narrowPeak.gz")

# path to bigwig with bias-corrected predictions
preds_bw    = os.path.join(proj_in, "01-models/predictions/bias_corrected/Brain_c10_avg_chrombpnet_nobias.bw")

# path to bigwig with contribution scores
contribs_bw = os.path.join(proj_in, "01-models/contribs/bias_Heart_c0_thresh0.4/Brain_c0/average_shaps.counts.bw")

X_peaks, y_peaks = io.extract_loci(peaks_bed, genome_fa,
								   signals = [preds_bw, contribs_bw],
								   ignore = CHAR_IGNORE)

The X_peaks variable contains the one-hot encoded sequences, with shape (num_peaks, 4, INPUTLEN).

X_peaks.shape
X_peaks[0]
torch.Size([117140, 4, 2114])
tensor([[0, 0, 0,  ..., 0, 1, 0],
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int8)

The y_peaks variable contains scalar values for predicted accessibility and contribution score at each nucleotide, with shape (num_peaks, 2, OUTPUTLEN).

y_peaks.shape
y_peaks[0]
torch.Size([117140, 2, 1000])
tensor([[ 0.0043,  0.0043,  0.0043,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0009, -0.0006, -0.0006]])

We also load in the peak coordinates:

X_peaks_coords = pd.read_csv(peaks_bed, sep='\t', header=None, names = NARROWPEAK_SCHEMA)
X_peaks_coords.head()
chr start end 1 2 3 4 5 6 summit
0 chr1 9641 10641 . . . . . . 500
1 chr1 9905 10905 . . . . . . 500
2 chr1 180976 181976 . . . . . . 500
3 chr1 777919 778919 . . . . . . 500
4 chr1 778224 779224 . . . . . . 500

We also load in the contribution scores in h5 format, which will allow us to plot them as sequence logos.

# load contribution scores
y_contribs = h5py.File(proj_in + "01-models/contribs/bias_Heart_c0_thresh0.4/Brain_c0/average_shaps.counts.h5", "r")
y_contribs.keys()
<KeysViewHDF5 ['projected_shap', 'raw', 'shap']>

We also load in the hits aka predictive motif instances:

instances = pd.read_csv(proj_out + "03-chrombpnet/02-compendium/hits_unified_motifs/reconciled_per_celltype_peaks/Brain_c0/counts_v0.23_a0.8_all/hits_unique.reconciled.annotated.tsv.gz", sep = "\t")
instances.head()

# print column names
instances.columns
seqnames start end width strand start_untrimmed end_untrimmed motif_name source hit_coefficient ... motif_name_unlabeled pattern_class distToGeneStart nearestGene peakType distToTSS nearestTSS GC N distToPeakSummit
0 chr1 10175 10180 5 - 10158 10188 456|ZEB/SNAI 1 0.961838 ... neg_patterns.neg.Average_12__merged_pattern_0 neg_patterns 1690 DDX11L1 Promoter 1690 ENST00000456328.2 0.4000 0 36
1 chr1 10503 10509 6 - 10497 10527 456|ZEB/SNAI 2 0.985665 ... neg_patterns.neg.Average_12__merged_pattern_1 neg_patterns 1362 DDX11L1 Promoter 1362 ENST00000456328.2 0.6667 0 100
2 chr1 10555 10561 6 + 10545 10575 4|BCL11A/Brepressive 2 0.432991 ... neg_patterns.neg.Thymus__Average_6__merged_pat... neg_patterns 1310 DDX11L1 Promoter 1310 ENST00000456328.2 0.5000 0 152
3 chr1 10566 10575 9 + 10560 10590 436|SP/KLF 2 6.695272 ... pos_patterns.pos.Average_212__merged_pattern_0 pos_patterns 1297 DDX11L1 Promoter 1297 ENST00000456328.2 0.8889 0 165
4 chr1 10642 10652 10 - 10629 10659 400|NRF1 2 3.583460 ... pos_patterns.pos.Average_159__merged_pattern_0 pos_patterns 1221 DDX11L1 Promoter 1221 ENST00000456328.2 0.9000 0 241

5 rows × 24 columns

Index(['seqnames', 'start', 'end', 'width', 'strand', 'start_untrimmed',
       'end_untrimmed', 'motif_name', 'source', 'hit_coefficient',
       'hit_correlation', 'hit_importance', 'peak_name', 'peak_id',
       'motif_name_unlabeled', 'pattern_class', 'distToGeneStart',
       'nearestGene', 'peakType', 'distToTSS', 'nearestTSS', 'GC', 'N',
       'distToPeakSummit'],
      dtype='object')

The extracted sequences are length 2114, because of the default in_window parameter of tangermeme.io.extract_loci, and matches the input length of ChromBPNet.

Therefore, the start of the 1,000 bp peak within the 2,114 bp window should be position (2114 - 1000) / 2 = 557. We can confirm that the next 10 bases of the one-hot encoded sequence match what we see at the beginning of the peak in the reference:

# taking the 31861th peak:
i = 31861

# reference squence
hg38[X_peaks_coords.iloc[i]['chr']][X_peaks_coords.iloc[i]['start']:X_peaks_coords.iloc[i]['start'] + 10].seq

# one-hot encoded sequence converted back to nucleotides
characters(X_peaks[i][:, 557:567])
'GCCTGCAAGG'
'GCCTGCAAGG'

Now we can plot the predictions and contribution scores for a peak of interest near FOXG1.

Let's also get the motif instances in this region:

instances_local = instances[instances['peak_id']==i]
instances_local

# convert these to relative coordinates for annotation
annotations = instances_local[['start', 'end', 'motif_name', 'hit_coefficient', 'strand']]
annotations.loc[:, 'start'] = annotations['start'] - X_peaks_coords.iloc[i]['start'] + SHIFT_REL_INPUT
annotations.loc[:, 'end'] = annotations['end'] - X_peaks_coords.iloc[i]['start'] + SHIFT_REL_INPUT

# rename motif_name to name
annotations = annotations.rename(columns={'hit_coefficient': 'score'})

# reorder the columns
annotations = annotations[['motif_name', 'start', 'end', 'strand', 'score']]
seqnames start end width strand start_untrimmed end_untrimmed motif_name source hit_coefficient ... motif_name_unlabeled pattern_class distToGeneStart nearestGene peakType distToTSS nearestTSS GC N distToPeakSummit
545198 chr14 28766432 28766440 8 + 28766420 28766450 157|ETS:ELF/ETV#1 1 0.134989 ... pos_patterns.pos.Average_306__merged_pattern_0 pos_patterns 350 FOXG1 Promoter 350 ENST00000313071.7 0.5000 0 293
545199 chr14 28766664 28766671 7 - 28766650 28766680 367|NFY 2 89.228280 ... pos_patterns.pos.Average_276__merged_pattern_1 pos_patterns 118 FOXG1 Promoter 118 ENST00000313071.7 0.5714 0 61
545200 chr14 28766709 28766716 7 - 28766695 28766725 367|NFY 2 69.797290 ... pos_patterns.pos.Average_276__merged_pattern_1 pos_patterns 73 FOXG1 Promoter 73 ENST00000313071.7 0.4286 0 16

3 rows × 24 columns

We highlight a region which we will zoom into, and we see two nice NFY sites standing out.

fig, ax = plt.subplots(3, 1, figsize = (11, 6))

# plot the predicted profile for the top hit
ax[0].plot(y_peaks[i][0], linewidth=1, label = "bias-corrected", color = 'red')
ax[0].set_ylabel("Predicted profile")
ax[0].legend(fontsize = 8)
ax[0].axvspan(400, 600, alpha=0.4, color='yellow')
ax[0].set_xticks(ticks = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 999], labels = [-500, -400, -300, -200, -100, 0, +100, +200, +300, +400, +500])

# contribs
ax[1].plot(y_peaks[i][1], linewidth=1, label = "contrib.", color = "blue")
ax[1].set_ylabel("Contribution scores")
ax[1].legend(fontsize = 8)
ax[1].axvspan(400, 600, alpha=0.4, color='yellow')
ax[1].set_xticks(ticks = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 999], labels = [-500, -400, -300, -200, -100, 0, +100, +200, +300, +400, +500])

# plot the contrbs in the central 200 bp as a sequence logo
tangermeme.plot.plot_logo(np.float64(y_contribs['projected_shap']['seq'][i]),
						  start = SHIFT_REL_INPUT + 400, end = SHIFT_REL_INPUT + 600, ax=ax[2],
						  # add in the annotations from the motif instances
						  annotations = annotations)
ax[2].set_ylabel("Contrib. (zoom)")
ax[2].set_ylim(-0.05, 0.1)
ax[2].set_xticks(
	ticks = [0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 199],
	labels = [-100, -80, -60, -40, -20, 0, 20, 40, 60, 80, 100])

# put a title at the very top
plt.xlabel("Relative genomic position")
plt.suptitle("peak: " + X_peaks_coords.iloc[i]['chr'] + ":" + str(X_peaks_coords.iloc[i]['start']) + "-" + str(X_peaks_coords.iloc[i]['end']) + " in Brain_c0", fontsize = 12)
plt.tight_layout()
plt.show();

This matches what we see in the genome browser at this locus:

image.png

Load a model

We can download the tar file of trained models for each cell type from Zenodo:

And then extract the models:

$ tar -xvf Brain_c0.gz
x Brain_c0/
x Brain_c0/Brain_c0__fold_0__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_2__chrombpnet.h5
x Brain_c0/Brain_c0__fold_4__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_4__chrombpnet.h5
x Brain_c0/Brain_c0__fold_3__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_1__chrombpnet.h5
x Brain_c0/Brain_c0__fold_0__chrombpnet_nobias.h5
x Brain_c0/Brain_c0__fold_0__chrombpnet.h5
x Brain_c0/Brain_c0__fold_3__chrombpnet.h5
x Brain_c0/Brain_c0__fold_1__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_2__chrombpnet_nobias.h5
x Brain_c0/Brain_c0__fold_4__chrombpnet_nobias.h5
x Brain_c0/Brain_c0__fold_2__bias_model_scaled.h5
x Brain_c0/Brain_c0__fold_1__chrombpnet_nobias.h5
x Brain_c0/Brain_c0__fold_3__chrombpnet_nobias.h5
$ ls Brain_c0
Brain_c0__fold_0__bias_model_scaled.h5 Brain_c0__fold_2__chrombpnet.h5
Brain_c0__fold_0__chrombpnet_nobias.h5 Brain_c0__fold_3__bias_model_scaled.h5
Brain_c0__fold_0__chrombpnet.h5        Brain_c0__fold_3__chrombpnet_nobias.h5
Brain_c0__fold_1__bias_model_scaled.h5 Brain_c0__fold_3__chrombpnet.h5
Brain_c0__fold_1__chrombpnet_nobias.h5 Brain_c0__fold_4__bias_model_scaled.h5
Brain_c0__fold_1__chrombpnet.h5        Brain_c0__fold_4__chrombpnet_nobias.h5
Brain_c0__fold_2__bias_model_scaled.h5 Brain_c0__fold_4__chrombpnet.h5
Brain_c0__fold_2__chrombpnet_nobias.h5

Generally, we will want to use *_chrombpnet_nobias.h5 models for all downstream work, which produces bias-corrected model predictions. See our documentation here for more details.

To load a pre-trained ChromBPNet model, we can use the BPNet class from the bpnetlite package. The ChromBPNet models return two outputs: the profile predictions a vector of logits over 1,000 bp, and the log counts as a scalar value.

To convert the logits to accessibility profiles, we softmax the logits and scale by the exponentiated log counts. Throughout the code, we use a pytorch wrapper implemented in code/03-chrombpnet/tangermeme_utils/wrappers.py, but for convenience, we show the wrapper here:

class ChromBPNetWrapper(torch.nn.Module):
    """A wrapper class that returns counts and transformed profiles
    from a bias-corrected ChromBPNet model.

    This class takes in a trained model, specifically expecting a BPNet model
    from a trained bias-corrected ChromBPNet model (i.e. `chrombpnet_nobias.h5`),
    _not_ the full accessibility model.

    It returns outputs the same shape as the BPNet model, but with the predicted
    profile logits softmaxed and scaled by the exponent of the predicted counts.
    This is for convenience when plotting, since in e.g. bigwigs, we are inspecting
    the scaled profile, not the unnormalized logits.

    Parameters
    ----------
    model: torch.nn.Module
        A torch model to be wrapped.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, X):

        y = self.model(X)

        # predicted profile logits
        y_profile = y[0]

        # predicted log counts
        y_counts = y[1]

        # softmax the logits to get profile probabilities
        y_profile = torch.nn.functional.softmax(y_profile, dim = -1)

        # scale the profile by the exponentiated predicted ln(counts)
        y_profile = y_profile * torch.exp(y_counts).unsqueeze(2)

        return y_profile, y_counts

First, we load the model for one fold:

brain_model = ChromBPNetWrapper(BPNet.from_chrombpnet(
	filename=os.path.join(proj_in, f"01-models/models/bias_Heart_c0_thresh0.4/Brain_c0/fold_0/models/chrombpnet_nobias.h5")))

Predict and interpret for a region of interest

Now we can run a forward pass through the model to get predicted outputs. Let's use a different peak region, near the SLC17A7 gene.

Check out the tutorial on prediction from tangermeme for more details: https://tangermeme.readthedocs.io/en/latest/tutorials/Tutorial_A3_Deep_LIFT_SHAP.html

We use "contribution", "attribution", "interpretation", and "importance" interchangeably.

j = 54288
y_1 = tangermeme.predict.predict(brain_model, X_peaks[j].unsqueeze(0).float())

As expected, the model returns two values, one for each model head: the profiles, and the ocunts. Since we used our model wrapper, the profile values are already scaled by the exponentiated log counts and represent predicted accessibility profiles. The counts are predicted log counts.

len(y_1)
y_1[0].shape
y_1[1].shape
2
torch.Size([1, 1, 1000])
torch.Size([1, 1])

Next we run model interpretation. Note that for convenience in calculating the attributions, we need to use the CountWrapper class from bpnet-lite to get the counts predictions only, since the attributions algorithm (DeepLIFT/SHAP) can only explain a single output at a time.

We also use bpnetlite.attribute.deep_lift_shap as our attribution/model interpretation function, which automatically registers the layers that are necessary for using Chrom BPNet models.

Check out the tangermeme tutorial on interpretation for more details: https://tangermeme.readthedocs.io/en/latest/tutorials/Tutorial_A3_Deep_LIFT_SHAP.html

brain_model_counts = CountWrapper(BPNet.from_chrombpnet(
	filename=os.path.join(proj_in, f"01-models/models/bias_Heart_c0_thresh0.4/Brain_c0/fold_0/models/chrombpnet_nobias.h5")))

# we need to pass a batch of shape (n_seqs, 4, 1000), so we add an extra dimension here 
# since we're just passing in a single sequence
attr_1 = deep_lift_shap(brain_model_counts, X_peaks[j].unsqueeze(0).float(), random_state=0, print_convergence_deltas=True)
tensor([4.5419e-04, 1.3351e-05, 2.0528e-04, 1.1563e-04, 1.2207e-04, 1.8907e-04,
        2.4796e-05, 2.8729e-04, 3.2043e-04, 1.3494e-04, 1.0395e-04, 2.5105e-04,
        2.1458e-05, 9.5367e-06, 3.8862e-04, 1.4544e-04, 1.8716e-04, 2.6727e-04,
        1.1206e-04, 2.2173e-04], device='cuda:0', grad_fn=<AbsBackward0>)

As expected, every nucleotide in the input region (INPUTLEN, 2,114 bp) is assigned a contirubtion score to the output, therefore the output is of shape (num_peaks, 4, INPUTLEN).

attr_1.shape
attr_1.sum(axis = 1).shape
torch.Size([1, 4, 2114])
torch.Size([1, 2114])

We can make a similar plot as above, with the newly calculated predictions and contribution scores.

y_1[0].squeeze(0).T.shape
attr_1.sum(axis = 1).T.shape
attr_1.shape
torch.Size([1000, 1])
torch.Size([2114, 1])
torch.Size([1, 4, 2114])

In this case, we can see a few sites popping up in the contribution tracks, including a BHLH site (E-box) in the center (CAGATG), which contributes the most to accessibility in this region.

fig, ax = plt.subplots(3, 1, figsize = (11, 6))

# plot the predicted profile for the top hit, in the shape (1000, 1)
ax[0].plot(y_1[0].squeeze(0).T, linewidth=1, label = "bias-corrected", color = 'red')
ax[0].set_ylabel("Predicted profile")
ax[0].legend(fontsize = 8)
ax[0].axvspan(400, 600, alpha=0.4, color='yellow')
ax[0].set_xticks(ticks = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 999], labels = [-500, -400, -300, -200, -100, 0, +100, +200, +300, +400, +500])

# contribs - the central 1,000 bp only to align with the output window
ax[1].plot(attr_1.sum(axis = 1).T[SHIFT_REL_INPUT:(SHIFT_REL_INPUT+1000), :], linewidth=1, label = "contrib.", color = "blue")
ax[1].set_ylabel("Contribution scores")
ax[1].legend(fontsize = 8)
ax[1].axvspan(400, 600, alpha=0.4, color='yellow')
ax[1].set_xticks(ticks = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 999], labels = [-500, -400, -300, -200, -100, 0, +100, +200, +300, +400, +500])

# plot the contrbs in the central 200 bp as a sequence logo
tangermeme.plot.plot_logo(np.float64(attr_1.squeeze(0)),
						  start = SHIFT_REL_INPUT + 400, end = SHIFT_REL_INPUT + 600, ax=ax[2])
ax[2].set_ylabel("Contrib. (zoom)")
ax[2].set_ylim(-0.05, 0.15)
ax[2].set_xticks(
	ticks = [0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 199],
	labels = [-100, -80, -60, -40, -20, 0, 20, 40, 60, 80, 100])

# put a title at the very top
plt.xlabel("Relative genomic position")
plt.suptitle("peak: " + X_peaks_coords.iloc[j]['chr'] + ":" + str(X_peaks_coords.iloc[j]['start']) + "-" + str(X_peaks_coords.iloc[j]['end']) + " in Brain_c0", fontsize = 12)
plt.tight_layout()
plt.show();

Predict the effect of a variant

Let's start from the sequences in the same peak as above:

X_peaks_coords.iloc[j]
chr          chr19
start     49436891
end       49437891
1                .
2                .
3                .
4                .
5                .
6                .
summit         500
Name: 54288, dtype: object
# get 2,114 bp ref/alt sequencearound the 1,000 bp peak
ref_seq = str(hg38['chr19'][X_peaks_coords.iloc[j]['start'] - SHIFT_REL_INPUT : X_peaks_coords.iloc[j]['end'] + SHIFT_REL_INPUT])

# get the relative position of the variant within the peak
var_pos_input = 1057
var_pos_output = 1057 + SHIFT_REL_INPUT

# get the alt seq
alt_seq = (
		ref_seq[: var_pos_input]
		+ "C"
		+ ref_seq[var_pos_input + 1 :]
	)

# convert to one-hot
ref_ohe = one_hot_encode(ref_seq).type(torch.float32).unsqueeze(0)
alt_ohe = one_hot_encode(alt_seq).type(torch.float32).unsqueeze(0)

ref_ohe.shape
alt_ohe.shape
torch.Size([1, 4, 2114])
torch.Size([1, 4, 2114])

Predict accessibility for each allele:

y_ref = tangermeme.predict.predict(brain_model, ref_ohe)
y_alt = tangermeme.predict.predict(brain_model, alt_ohe)

Interpret counts output for each allele:

attr_ref = deep_lift_shap(brain_model_counts, ref_ohe, random_state=0)
attr_alt = deep_lift_shap(brain_model_counts, alt_ohe, random_state=0)

attr_ref.shape
attr_alt.shape
torch.Size([1, 4, 2114])
torch.Size([1, 4, 2114])

Plot the results:

attr_ref.shape
torch.Size([1, 4, 2114])

In this example, mutating a G to C in the E-box results in a decrease in predicted accessibility.

See our Methods section for the scores we use to quantify the effects of variants on accessibility. See also the ChromBPNet paper, Pampari et al, biorxiv 2024, for a more detailed discussion of techniques for quantifying variant effect.

fig, ax = plt.subplots(4, 1, figsize = (11, 6))

alpha = 0.3
contrib_flank = 70

# reshape the predictions
pred_ref = y_ref[0].squeeze()
pred_alt = y_alt[0].squeeze()

# get the attributions in the central 1,000 bp
attr_ref_short = attr_ref.T[(INPUTLEN // 2 - 500):(INPUTLEN // 2 + 500)]
attr_alt_short = attr_alt.T[(INPUTLEN // 2 - 500):(INPUTLEN // 2 + 500)]

# prediction in 1,000 bp
ax[0].plot(pred_ref, linewidth = 0.5, label = "REF", color = "gray")
ax[0].plot(pred_alt, linewidth = 0.5, label = "ALT", color = "red")
ax[0].set_ylabel("Pred. \naccessibility")
ax[0].axvspan(500 - contrib_flank, 500 + contrib_flank, alpha = alpha, color='yellow')
ax[0].legend(fontsize = 8)

# attribution in 1,000 bp
ax[1].plot(attr_ref_short.sum(axis = 1).squeeze(), linewidth = 0.5, label = "REF", color = "gray")
ax[1].plot(attr_alt_short.sum(axis = 1).squeeze(), linewidth = 0.5, label = "ALT", color = "red")
ax[1].set_ylabel("Contrib. \nscores")
ax[1].axvspan(500 - contrib_flank, 500 + contrib_flank, alpha = alpha, color = 'yellow')

# zoomed contribs (REF)
tangermeme.plot.plot_logo(np.float64(attr_ref.squeeze(0)), start = var_pos_input - contrib_flank, end = var_pos_input + contrib_flank, ax=ax[2], ylim = (-0.05, 0.5))
ax[2].set_ylim(-0.05, 0.15)
ax[2].axvspan(contrib_flank - 0.5, contrib_flank + 0.5, alpha = alpha - 0.1, color = 'gray')
ax[2].set_ylabel("Contrib. \nREF")

# zoomed contribs (ALT)
tangermeme.plot.plot_logo(np.float64(attr_alt.squeeze(0)), start = var_pos_input - contrib_flank, end = var_pos_input + contrib_flank, ax=ax[3], ylim = (-0.05, 0.5)),
ax[3].set_ylim(-0.05, 0.15)
ax[3].axvspan(contrib_flank - 0.5, contrib_flank + 0.5, alpha = alpha - 0.1, color = 'red')
ax[3].set_ylabel("Contrib. \nALT")

# put a title at the very top
plt.suptitle("Predicted accessibility and contributions for example variant", fontsize = 12)
plt.xlabel("Relative genomic position (bp)")
plt.tight_layout()

plt.show();