Source code for matcha.Matcher

import collections
import itertools
import re
import subprocess

import numpy as np

import _matcha

class Matcher:
    """
    Barcode matcher python wrapper

    Args:
        sequences (List[str]): Barcode DNA sequences
        labels (List[str]): Labels for barcode sequences (optional)
        _matcher (_matcha.Matcher): C++ matcher object to use
    """
    def __init__(self, sequences, _matcher, labels=None):     
        self.sequences = np.array(sequences)
        self.binary_sequences = np.array([_matcha.stringToBinary(seq) for seq in self.sequences])
        if labels is None:
            labels = self.sequences
        self.labels = np.array(labels)
        self.sequence_length = len(self.sequences[0])

        self._matcher = _matcher
        self._matcher.add_sequences(self.sequences)
        self._matcher.add_labels(labels)

    def match_all(self, sequences, start=0):
        """Match all sequences in a list
        
        Args:
            sequences (List[str]): Query sequences
            start (int): 0-based position of first base to use in barcode match
        
        Returns:
            collections.namedtuple: Named tuple. Field seq is binary encoding of best matching sequence.
                Other fields give match quality and may vary based on algorithm.
                Fields by algorithm:
                    list: dist, second_best_dist -- hamming distance to best and second best matches respectively
        """
        end = start + self.sequence_length
        result = self._matcher.match_all(sequences, start, end)
        return self.process_matches(result)

    def process_matches(self, match_result):
        """Process a match result quality based on the type of algorithm used"""
        best_match, raw_quality = match_result[0], match_result[1]
        return MatchResult(best_match, raw_quality & 63, np.right_shift(raw_quality, 6), self.labels)
   
[docs]class ListMatcher(Matcher): """ List matcher iterates through possible matches in a list. Slow for many valid barcodes, but has no limits on the maximum number of mismatches. As a rule of thumb, this is the best choice if you have under 20 valid sequences, but you should probably not use it to match over 100 sequences Args: sequences (List[str]): Barcode DNA sequences labels (List[str]): Labels for barcode sequences (optional) """ def __init__(self, sequences, labels=None): _matcher = _matcha.ListMatcher() super().__init__(sequences, _matcher, labels)
[docs]class MatchResult: """ Container type for match results. Attributes: label (numpy.ndarray): String array of labels for the best match. Only valid if labels were provided while creating the matcher object dist (numpy.ndarray): Integer array of distances (number of mismatches) to the best match second_best_dist (numpy.ndarray): Integer array of distances (number of mismatches) to the second-best match match (numpy.ndarray): Integer array of indexes of the best match in the Matcher object's list of valid sequences """ def __init__(self, match, dist, second_best_dist, labels): self.match = match self.dist = dist self.second_best_dist = second_best_dist self._labels = labels @property def label(self): return self._labels[self.match]
[docs]class HashMatcher(Matcher): """ Hash matcher uses hash tables of subsequences for barcode search, using the algorithm of Norouzi et al. https://arxiv.org/pdf/1307.2982.pdf. Fast for many valid barcodes and few possible mismatches. Increasing the max_mismatches will decrease performance. For 10x-style barcodes (16bp, ~1M valid barcodes), recommended settings are max_mismatches=1, subsequence_count=2 Args: sequences (List[str]): Barcode DNA sequences max_mismatches (int): Maximum mismatches to match against subsequence_count (int): Number of subsequence indexes to use. In general, use lower subsequence_count for larger number of valid labels, and higher subsequence_count for searching against a larger number of mismatches. labels (List[str]): Labels for barcode sequences (optional) """ def __init__(self, sequences, max_mismatches, subsequence_count, labels=None): # Get masks for extracting subsequences # Stripe base indexes for each subsequence, in case there are contiguous chunks of similarity in valid barcodes valid_bases = list(range(len(sequences[0]))) subsequence_indexes = [valid_bases[i::subsequence_count] for i in range(subsequence_count)] subsequence_indexes.sort(key = lambda l: len(l)) subsequence_masks = [self.get_mask(indexes) for indexes in subsequence_indexes] # Calculate masks for finding neigbors during lookup # Terminology as from equation 3 in: https://arxiv.org/pdf/1307.2982.pdf mismatch_masks = [] r_prime = max_mismatches // subsequence_count a = max_mismatches % subsequence_count for i in range(subsequence_count): mismatch_range = r_prime if i <= a else r_prime - 1 mismatch_masks.append(self.get_mismatch_masks(subsequence_indexes[i], mismatch_range)) _matcher = _matcha.HashMatcher(subsequence_masks, mismatch_masks, max_mismatches) super().__init__(sequences, _matcher, labels) @staticmethod def get_mask(indexes): mask = 0 for i in indexes: mask |= 3 << (i*2) return mask @staticmethod def get_mismatch_masks(indexes, mismatch_range): """Return a (xor) mask for every possible sequence neighbor within a given mismatch range""" mismatch_masks = [] mismatch_positions = itertools.chain( *(itertools.combinations(indexes, m) for m in range(mismatch_range+1)) ) for pos in mismatch_positions: for vals in itertools.product([1,2,3], repeat=len(pos)): mask = 0 for p, v in zip(pos, vals): mask |= v << (p*2) mismatch_masks.append(mask) return mismatch_masks