Source code for matcha.Matcher

import collections
import itertools
import re
import subprocess

import numpy as np

import _matcha

class Matcher:
    """
    Barcode matcher python wrapper

    Args:
        sequences (List[str]): Barcode DNA sequences
        labels (List[str]): Labels for barcode sequences (optional)
        _matcher (_matcha.Matcher): C++ matcher object to use
    """
    def __init__(self, sequences, _matcher, labels=None):     
        self.sequences = np.array(sequences)
        self.binary_sequences = np.array([_matcha.stringToBinary(seq) for seq in self.sequences])
        if labels is None:
            labels = self.sequences
        self.labels = np.array(labels)
        self.sequence_length = len(self.sequences[0])

        self._matcher = _matcher
        self._matcher.add_sequences(self.sequences)
        self._matcher.add_labels(labels)

    def match_all(self, sequences, start=0):
        """Match all sequences in a list
        
        Args:
            sequences (List[str]): Query sequences
            start (int): 0-based position of first base to use in barcode match
        
        Returns:
            collections.namedtuple: Named tuple. Field seq is binary encoding of best matching sequence.
                Other fields give match quality and may vary based on algorithm.
                Fields by algorithm:
                    list: dist, second_best_dist -- hamming distance to best and second best matches respectively
        """
        end = start + self.sequence_length
        result = self._matcher.match_all(sequences, start, end)
        return self.process_matches(result)

    def process_matches(self, match_result):
        """Process a match result quality based on the type of algorithm used"""
        best_match, raw_quality = match_result[0], match_result[1]
        return MatchResult(best_match, raw_quality & 63, np.right_shift(raw_quality, 6), self.labels)
   
[docs]class ListMatcher(Matcher):
    """
    List matcher iterates through possible matches in a list. Slow for many valid barcodes,
    but has no limits on the maximum number of mismatches. As a rule of thumb, this is the best choice if you
    have under 20 valid sequences, but you should probably not use it to match over 100 sequences

    Args:
        sequences (List[str]): Barcode DNA sequences
        labels (List[str]): Labels for barcode sequences (optional)
    """
    def __init__(self, sequences, labels=None):   
        _matcher = _matcha.ListMatcher()
        super().__init__(sequences, _matcher, labels)  

[docs]class MatchResult:
    """
    Container type for match results.

    Attributes:
        label (numpy.ndarray): String array of labels for the best match. Only valid if labels were provided while creating the matcher object
        dist (numpy.ndarray): Integer array of distances (number of mismatches) to the best match
        second_best_dist (numpy.ndarray): Integer array of distances (number of mismatches) to the second-best match
        match (numpy.ndarray): Integer array of indexes of the best match in the Matcher object's list of valid sequences
    """
    def __init__(self, match, dist, second_best_dist, labels):
        self.match = match
        self.dist = dist
        self.second_best_dist = second_best_dist
        self._labels = labels
    
    @property
    def label(self):
        return self._labels[self.match]

[docs]class HashMatcher(Matcher):
    """
    Hash matcher uses hash tables of subsequences for barcode search, using the algorithm of Norouzi et al. https://arxiv.org/pdf/1307.2982.pdf. 
    Fast for many valid barcodes and few possible mismatches. Increasing the max_mismatches will decrease performance. For
    10x-style barcodes (16bp, ~1M valid barcodes), recommended settings are max_mismatches=1, subsequence_count=2

    Args:
        sequences (List[str]): Barcode DNA sequences
        max_mismatches (int): Maximum mismatches to match against
        subsequence_count (int): Number of subsequence indexes to use. 
            In general, use lower subsequence_count for larger number of valid labels,
            and higher subsequence_count for searching against a larger number of mismatches.
        labels (List[str]): Labels for barcode sequences (optional)

    """
    def __init__(self, sequences, max_mismatches, subsequence_count, labels=None):   
        # Get masks for extracting subsequences
        # Stripe base indexes for each subsequence, in case there are contiguous chunks of similarity in valid barcodes
        valid_bases = list(range(len(sequences[0])))
        subsequence_indexes = [valid_bases[i::subsequence_count] for i in range(subsequence_count)]
        subsequence_indexes.sort(key = lambda l: len(l))
        subsequence_masks = [self.get_mask(indexes) for indexes in subsequence_indexes]

        # Calculate masks for finding neigbors during lookup
        # Terminology as from equation 3 in: https://arxiv.org/pdf/1307.2982.pdf 
        mismatch_masks = []
        r_prime = max_mismatches // subsequence_count 
        a = max_mismatches % subsequence_count
        for i in range(subsequence_count):
            mismatch_range = r_prime if i <= a else r_prime - 1
            mismatch_masks.append(self.get_mismatch_masks(subsequence_indexes[i], mismatch_range))

        _matcher = _matcha.HashMatcher(subsequence_masks, mismatch_masks, max_mismatches)
        super().__init__(sequences, _matcher, labels)  

    @staticmethod
    def get_mask(indexes):
        mask = 0
        for i in indexes:
            mask |= 3 << (i*2)
        return mask

    @staticmethod
    def get_mismatch_masks(indexes, mismatch_range):
        """Return a (xor) mask for every possible sequence neighbor within a given mismatch range"""
        mismatch_masks = []

        mismatch_positions = itertools.chain(
            *(itertools.combinations(indexes, m) for m in range(mismatch_range+1))
        )
        for pos in mismatch_positions:
            for vals in itertools.product([1,2,3], repeat=len(pos)):
                mask = 0
                for p, v in zip(pos, vals):
                    mask |= v << (p*2)
                mismatch_masks.append(mask)
        return mismatch_masks