1# -----------------------------------------------------------------------------
2#  Copyright (c) 2013--, scikit-bio development team.
3#
4#  Distributed under the terms of the Modified BSD License.
5#
6#  The full license is in the file COPYING.txt, distributed with this software.
7# -----------------------------------------------------------------------------
8
9from cpython cimport bool
10import numpy as np
11cimport numpy as cnp
12from skbio.sequence import Protein, Sequence
13
14cdef extern from "_lib/ssw.h":
15
16    ctypedef struct s_align:
17        cnp.uint16_t score1
18        cnp.uint16_t score2
19        cnp.int32_t ref_begin1
20        cnp.int32_t ref_end1
21        cnp.int32_t read_begin1
22        cnp.int32_t read_end1
23        cnp.int32_t ref_end2
24        cnp.uint32_t* cigar
25        cnp.int32_t cigarLen
26
27    ctypedef struct s_profile:
28        pass
29
30    cdef s_profile* ssw_init(const cnp.int8_t* read,
31                             const cnp.int32_t readLen,
32                             const cnp.int8_t* mat,
33                             const cnp.int32_t n,
34                             const cnp.int8_t score_size)
35
36    cdef void init_destroy(s_profile* p)
37
38    cdef s_align* ssw_align(const s_profile* prof,
39                            const cnp.int8_t* ref,
40                            cnp.int32_t refLen,
41                            const cnp.uint8_t weight_gapO,
42                            const cnp.uint8_t weight_gapE,
43                            const cnp.uint8_t flag,
44                            const cnp.uint16_t filters,
45                            const cnp.int32_t filterd,
46                            const cnp.int32_t maskLen)
47
48    cdef void align_destroy(s_align* a)
49
50np_aa_table = np.array([
51    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
52    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
53    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
54    23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
55    23,  0, 20,  4,  3,  6, 13,  7,  8,  9, 23, 11, 10, 12,  2, 23,
56    14,  5,  1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23,
57    23,  0, 20,  4,  3,  6, 13,  7,  8,  9, 23, 11, 10, 12,  2, 23,
58    14,  5,  1, 15, 16, 23, 19, 17, 22, 18, 21, 23, 23, 23, 23, 23])
59
60np_nt_table = np.array([
61    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
62    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
63    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
64    4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
65    4,  0,  4,  1,  4,  4,  4,  2,  4,  4,  4,  4,  4,  4,  4,  4,
66    4,  4,  4,  4,  3,  0,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
67    4,  0,  4,  1,  4,  4,  4,  2,  4,  4,  4,  4,  4,  4,  4,  4,
68    4,  4,  4,  4,  3,  0,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4])
69
70mid_table = np.array(['M', 'I', 'D'])
71
72
73cdef class AlignmentStructure:
74    """Wraps the result of an alignment c struct so it is accessible to Python
75
76    Notes
77    -----
78    `cigar` may be empty depending on parameters used.
79
80    `target_begin` and `query_begin` may be -1 depending on parameters used.
81
82    Developer note: `read_sequence` is an alias for `query_sequence` used by
83    ssw.c as is `reference_sequence` for `target_sequence`
84    """
85    cdef s_align *p
86    cdef str read_sequence
87    cdef str reference_sequence
88    cdef int index_starts_at
89    cdef str _cigar_string
90
91    def __cinit__(self, read_sequence, reference_sequence, index_starts_at):
92        # We use `read_sequence` and `reference_sequence` here as they are
93        # treated sematically as a private output of ssw.c like the `s_align`
94        # struct
95        self.read_sequence = read_sequence
96        self.reference_sequence = reference_sequence
97        self.index_starts_at = index_starts_at
98
99    cdef __constructor(self, s_align* pointer):
100        self.p = pointer
101
102    def __dealloc__(self):
103        if self.p is not NULL:
104            align_destroy(self.p)
105
106    def __getitem__(self, key):
107        return getattr(self, key)
108
109    def __repr__(self):
110        data = ['optimal_alignment_score', 'suboptimal_alignment_score',
111                'query_begin', 'query_end', 'target_begin',
112                'target_end_optimal', 'target_end_suboptimal', 'cigar',
113                'query_sequence', 'target_sequence']
114        return "{\n%s\n}" % ',\n'.join([
115            "    {!r}: {!r}".format(k, self[k]) for k in data])
116
117    def __str__(self):
118        score = "Score: %d" % self.optimal_alignment_score
119        if self.query_sequence and self.cigar:
120            target = self.aligned_target_sequence
121            query = self.aligned_query_sequence
122            align_len = len(query)
123            if align_len > 13:
124                target = target[:10] + "..."
125                query = query[:10] + "..."
126
127            length = "Length: %d" % align_len
128            return "\n".join([query, target, score, length])
129        return score
130
131    @property
132    def optimal_alignment_score(self):
133        """Optimal alignment score
134
135        Returns
136        -------
137        int
138            The optimal alignment score
139
140        """
141        return self.p.score1
142
143    @property
144    def suboptimal_alignment_score(self):
145        """Suboptimal alignment score
146
147        Returns
148        -------
149        int
150            The suboptimal alignment score
151
152        """
153        return self.p.score2
154
155    @property
156    def target_begin(self):
157        """Character index where the target's alignment begins
158
159        Returns
160        -------
161        int
162            The character index of the target sequence's alignment's beginning
163
164        Notes
165        -----
166        The result is a 0-based index by default
167
168        """
169        return self.p.ref_begin1 + self.index_starts_at if (self.p.ref_begin1
170                                                            >= 0) else -1
171
172    @property
173    def target_end_optimal(self):
174        """Character index where the target's optimal alignment ends
175
176        Returns
177        -------
178        int
179            The character index of the target sequence's optimal alignment's
180             end
181
182        Notes
183        -----
184        The result is a 0-based index by default
185
186        """
187        return self.p.ref_end1 + self.index_starts_at
188
189    @property
190    def target_end_suboptimal(self):
191        """Character index where the target's suboptimal alignment ends
192
193        Returns
194        -------
195        int
196            The character index of the target sequence's suboptimal alignment's
197             end
198
199        Notes
200        -----
201        The result is a 0-based index by default
202
203        """
204        return self.p.ref_end2 + self.index_starts_at
205
206    @property
207    def query_begin(self):
208        """Returns the character index at which the query sequence begins
209
210        Returns
211        -------
212        int
213            The character index of the query sequence beginning
214
215        Notes
216        -----
217        The result is a 0-based index by default
218
219        """
220        return self.p.read_begin1 + self.index_starts_at if (self.p.read_begin1
221                                                             >= 0) else -1
222
223    @property
224    def query_end(self):
225        """Character index at where query sequence ends
226
227        Returns
228        -------
229        int
230            The character index of the query sequence ending
231
232        Notes
233        -----
234        The result is a 0-based index by default
235
236        """
237        return self.p.read_end1 + self.index_starts_at
238
239    @property
240    def cigar(self):
241        """Cigar formatted string for the optimal alignment
242
243        Returns
244        -------
245        str
246            The cigar string of the optimal alignment
247
248        Notes
249        -----
250        The cigar string format is described in [1]_ and [2]_.
251
252        If there is no cigar or optimal alignment, this will return an empty
253        string
254
255        References
256        ----------
257        .. [1] http://genome.sph.umich.edu/wiki/SAM
258        .. [2] http://samtools.github.io/hts-specs/SAMv1.pdf
259
260        """
261        # Memoization! (1/2)
262        if self._cigar_string is not None:
263            return self._cigar_string
264        cigar_list = []
265        for i in range(self.p.cigarLen):
266            # stored the same as that in BAM format,
267            # high 28 bits: length, low 4 bits: M/I/D (0/1/2)
268
269            # Length, remove first 4 bits
270            cigar_list.append(str(self.p.cigar[i] >> 4))
271            # M/I/D, lookup first 4 bits in the mid_table
272            cigar_list.append(mid_table[self.p.cigar[i] & 0xf])
273        # Memoization! (2/2)
274        self._cigar_string = "".join(cigar_list)
275        return self._cigar_string
276
277    @property
278    def query_sequence(self):
279        """Query sequence
280
281        Returns
282        -------
283        str
284            The query sequence
285
286        """
287        return self.read_sequence
288
289    @property
290    def target_sequence(self):
291        """Target sequence
292
293        Returns
294        -------
295        str
296            The target sequence
297
298        """
299        return self.reference_sequence
300
301    @property
302    def aligned_query_sequence(self):
303        """Returns the query sequence aligned by the cigar
304
305        Returns
306        -------
307        str
308            Aligned query sequence
309
310        Notes
311        -----
312        This will return `None` if `suppress_sequences` was True when this
313        object was created
314
315        """
316        if self.query_sequence:
317            return self._get_aligned_sequence(self.query_sequence,
318                                              self._tuples_from_cigar(),
319                                              self.query_begin, self.query_end,
320                                              "D")
321        return None
322
323    @property
324    def aligned_target_sequence(self):
325        """Returns the target sequence aligned by the cigar
326
327        Returns
328        -------
329        str
330            Aligned target sequence
331
332        Notes
333        -----
334        This will return `None` if `suppress_sequences` was True when this
335        object was created
336
337        """
338        if self.target_sequence:
339            return self._get_aligned_sequence(self.target_sequence,
340                                              self._tuples_from_cigar(),
341                                              self.target_begin,
342                                              self.target_end_optimal,
343                                              "I")
344        return None
345
346    def set_zero_based(self, is_zero_based):
347        """Set the aligment indices to start at 0 if True else 1 if False
348
349        """
350        if is_zero_based:
351            self.index_starts_at = 0
352        else:
353            self.index_starts_at = 1
354
355    def is_zero_based(self):
356        """Returns True if alignment inidices start at 0 else False
357
358        Returns
359        -------
360        bool
361            Whether the alignment inidices start at 0
362
363        """
364        return self.index_starts_at == 0
365
366    def _get_aligned_sequence(self, sequence, tuple_cigar, begin, end,
367                              gap_type):
368        # Save the original index scheme and then set it to 0 (1/2)
369        orig_z_base = self.is_zero_based()
370        self.set_zero_based(True)
371        aligned_sequence = []
372        seq = sequence[begin:end + 1]
373        index = 0
374        for length, mid in tuple_cigar:
375            if mid == gap_type:
376                aligned_sequence += ['-' * length]
377            else:
378                aligned_sequence += [seq[index:index + length]]
379                index += length
380        # Our sequence end is sometimes beyond the cigar:
381        aligned_sequence += [seq[index:end - begin + 1]]
382        # Revert our index scheme to the original (2/2)
383        self.set_zero_based(orig_z_base)
384        return "".join(aligned_sequence)
385
386    def _tuples_from_cigar(self):
387        tuples = []
388        length_stack = []
389        for character in self.cigar:
390            if character.isdigit():
391                length_stack.append(character)
392            else:
393                tuples.append((int("".join(length_stack)), character))
394                length_stack = []
395        return tuples
396
397cdef class StripedSmithWaterman:
398    """Performs a striped (banded) Smith Waterman Alignment.
399
400    First a StripedSmithWaterman object must be instantiated with a query
401    sequence. The resulting object is then callable with a target sequence and
402    may be reused on a large collection of target sequences.
403
404    Parameters
405    ----------
406    query_sequence : string
407        The query sequence, this may be upper or lowercase from the set of
408        {A, C, G, T, N} (nucleotide) or from the set of
409        {A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V, B, Z, X, *
410        } (protein)
411    gap_open_penalty : int, optional
412        The penalty applied to creating a gap in the alignment. This CANNOT
413        be 0.
414        Default is 5.
415    gap_extend_penalty : int, optional
416        The penalty applied to extending a gap in the alignment. This CANNOT
417        be 0.
418        Default is 2.
419    score_size : int, optional
420        If your estimated best alignment score is < 255 this should be 0.
421        If your estimated best alignment score is >= 255, this should be 1.
422        If you don't know, this should be 2.
423        Default is 2.
424    mask_length : int, optional
425        The distance between the optimal and suboptimal alignment ending
426        position >= mask_length. We suggest to use len(query_sequence)/2, if
427        you don't have special concerns.
428        Detailed description of mask_length: After locating the optimal
429        alignment ending position, the suboptimal alignment score can be
430        heuristically found by checking the second largest score in the array
431        that contains the maximal score of each column of the SW matrix. In
432        order to avoid picking the scores that belong to the alignments
433        sharing the partial best alignment, SSW C library masks the reference
434        loci nearby (mask length = mask_length) the best alignment ending
435        position and locates the second largest score from the unmasked
436        elements.
437        Default is 15.
438    mask_auto : bool, optional
439        This will automatically set the used mask length to be
440        max(int(len(`query_sequence`)/2), `mask_length`).
441        Default is True.
442    score_only : bool, optional
443        This will prevent the best alignment beginning positions (BABP) and the
444        cigar from being returned as a result. This overrides any setting on
445        `score_filter`, `distance_filter`, and `override_skip_babp`. It has the
446        highest precedence.
447        Default is False.
448    score_filter : int, optional
449        If set, this will prevent the cigar and best alignment beginning
450        positions (BABP) from being returned if the optimal alignment score is
451        less than `score_filter` saving some time computationally. This filter
452        may be overridden by `score_only` (prevents BABP and cigar, regardless
453        of other arguments), `distance_filter` (may prevent cigar, but will
454        cause BABP to be calculated), and `override_skip_babp` (will ensure
455        BABP) returned.
456        Default is None.
457    distance_filter : int, optional
458        If set, this will prevent the cigar from being returned if the length
459        of the `query_sequence` or the `target_sequence` is less than
460        `distance_filter` saving some time computationally. The results of
461        this filter may be overridden by `score_only` (prevents BABP and cigar,
462        regardless of other arguments), and `score_filter` (may prevent cigar).
463        `override_skip_babp` has no effect with this filter applied, as BABP
464        must be calculated to perform the filter.
465        Default is None.
466    override_skip_babp : bool, optional
467        When True, the best alignment beginning positions (BABP) will always be
468        returned unless `score_only` is set to True.
469        Default is False.
470    protein : bool, optional
471        When True, the `query_sequence` and `target_sequence` will be read as
472        protein sequence. When False, the `query_sequence` and
473        `target_sequence` will be read as nucleotide sequence. If True, a
474        `substitution_matrix` must be supplied.
475        Default is False.
476    match_score : int, optional
477        When using a nucleotide sequence, the match_score is the score added
478        when a match occurs. This is ignored if `substitution_matrix` is
479        provided.
480        Default is 2.
481    mismatch_score : int, optional
482        When using a nucleotide sequence, the mismatch is the score subtracted
483        when a mismatch occurs. This should be a negative integer.
484        This is ignored if `substitution_matrix` is provided.
485        Default is -3.
486    substitution_matrix : 2D dict, optional
487        Provides the score for each possible substitution of sequence
488        characters. This may be used for protein or nucleotide sequences. The
489        entire set of possible combinations for the relevant sequence type MUST
490        be enumerated in the dict of dicts. This will override `match_score`
491        and `mismatch_score`. Required when `protein` is True.
492        Default is None.
493    suppress_sequences : bool, optional
494        If True, the query and target sequences will not be returned for
495        convenience.
496        Default is False.
497    zero_index : bool, optional
498        If True, all inidices will start at 0. If False, all inidices will
499        start at 1.
500        Default is True.
501
502    Notes
503    -----
504    This is a wrapper for the SSW package [1]_.
505
506    `mask_length` has to be >= 15, otherwise the suboptimal alignment
507    information will NOT be returned.
508
509    `match_score` is a positive integer and `mismatch_score` is a negative
510    integer.
511
512    `match_score` and `mismatch_score` are only meaningful in the context of
513    nucleotide sequences.
514
515    A substitution matrix must be provided when working with protein sequences.
516
517    References
518    ----------
519    .. [1] Zhao, Mengyao, Wan-Ping Lee, Erik P. Garrison, & Gabor T.
520       Marth. "SSW Library: An SIMD Smith-Waterman C/C++ Library for
521       Applications". PLOS ONE (2013). Web. 11 July 2014.
522       http://www.plosone.org/article/info:doi/10.1371/journal.pone.0082138
523
524    """
525    cdef s_profile *profile
526    cdef cnp.uint8_t gap_open_penalty
527    cdef cnp.uint8_t gap_extend_penalty
528    cdef cnp.uint8_t bit_flag
529    cdef cnp.uint16_t score_filter
530    cdef cnp.int32_t distance_filter
531    cdef cnp.int32_t mask_length
532    cdef str read_sequence
533    cdef int index_starts_at
534    cdef bool is_protein
535    cdef bool suppress_sequences
536    cdef cnp.ndarray __KEEP_IT_IN_SCOPE_read
537    cdef cnp.ndarray __KEEP_IT_IN_SCOPE_matrix
538
539    def __cinit__(self, query_sequence,
540                  gap_open_penalty=5,  # BLASTN Default
541                  gap_extend_penalty=2,  # BLASTN Default
542                  score_size=2,  # BLASTN Default
543                  mask_length=15,  # Minimum length for a suboptimal alignment
544                  mask_auto=True,
545                  score_only=False,
546                  score_filter=None,
547                  distance_filter=None,
548                  override_skip_babp=False,
549                  protein=False,
550                  match_score=2,  # BLASTN Default
551                  mismatch_score=-3,  # BLASTN Default
552                  substitution_matrix=None,
553                  suppress_sequences=False,
554                  zero_index=True):
555        # initalize our values
556        self.read_sequence = query_sequence
557        if gap_open_penalty <= 0:
558            raise ValueError("`gap_open_penalty` must be > 0")
559        self.gap_open_penalty = gap_open_penalty
560        if gap_extend_penalty <= 0:
561            raise ValueError("`gap_extend_penalty` must be > 0")
562        self.gap_extend_penalty = gap_extend_penalty
563        self.distance_filter = 0 if distance_filter is None else \
564            distance_filter
565        self.score_filter = 0 if score_filter is None else score_filter
566        self.suppress_sequences = suppress_sequences
567        self.is_protein = protein
568        self.bit_flag = self._get_bit_flag(override_skip_babp, score_only)
569        # http://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html
570        # Dijkstra knows what's up:
571        self.index_starts_at = 0 if zero_index else 1
572        # set up our matrix
573        cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] matrix
574        if substitution_matrix is None:
575            if protein:
576                raise Exception("Must provide a substitution matrix for"
577                                " protein sequences")
578            matrix = self._build_match_matrix(match_score, mismatch_score)
579        else:
580            matrix = self._convert_dict2d_to_matrix(substitution_matrix)
581        # Set up our mask_length
582        # Mask is recommended to be max(query_sequence/2, 15)
583        if mask_auto:
584            self.mask_length = len(query_sequence) / 2
585            if self.mask_length < mask_length:
586                self.mask_length = mask_length
587        else:
588            self.mask_length = mask_length
589
590        cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] read_seq
591        read_seq = self._seq_converter(query_sequence)
592
593        cdef cnp.int32_t read_length
594        read_length = len(query_sequence)
595
596        cdef cnp.int8_t s_size
597        s_size = score_size
598
599        cdef cnp.int32_t m_width
600        m_width = 24 if self.is_protein else 5
601
602        cdef s_profile* p
603        self.profile = ssw_init(<cnp.int8_t*> read_seq.data,
604                                read_length,
605                                <cnp.int8_t*> matrix.data,
606                                m_width,
607                                s_size)
608
609        # A hack to keep the python GC from eating our data
610        self.__KEEP_IT_IN_SCOPE_read = read_seq
611        self.__KEEP_IT_IN_SCOPE_matrix = matrix
612
613    def __call__(self, target_sequence):
614        """Align `target_sequence` to `query_sequence`
615
616        Parameters
617        ----------
618        target_sequence : str
619
620        Returns
621        -------
622        skbio.alignment.AlignmentStructure
623            The resulting alignment.
624
625        """
626        reference_sequence = target_sequence
627        cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] reference
628        reference = self._seq_converter(reference_sequence)
629
630        cdef cnp.int32_t ref_length
631        ref_length = len(reference_sequence)
632
633        cdef s_align *align
634        align = ssw_align(self.profile, <cnp.int8_t*> reference.data,
635                          ref_length, self.gap_open_penalty,
636                          self.gap_extend_penalty, self.bit_flag,
637                          self.score_filter, self.distance_filter,
638                          self.mask_length)
639
640        # Cython won't let me do this correctly, so duplicate code ahoy:
641        if self.suppress_sequences:
642            alignment = AlignmentStructure("", "", self.index_starts_at)
643        else:
644            alignment = AlignmentStructure(self.read_sequence,
645                                           reference_sequence,
646                                           self.index_starts_at)
647        alignment.__constructor(align)  # Hack to get a pointer through
648        return alignment
649
650    def __dealloc__(self):
651        if self.profile is not NULL:
652            init_destroy(self.profile)
653
654    def _get_bit_flag(self, override_skip_babp, score_only):
655        bit_flag = 0
656        if score_only:
657            return bit_flag
658        if override_skip_babp:
659            bit_flag = bit_flag | 0x8
660        if self.distance_filter != 0:
661            bit_flag = bit_flag | 0x4
662        if self.score_filter != 0:
663            bit_flag = bit_flag | 0x2
664        if bit_flag == 0 or bit_flag == 8:
665            bit_flag = bit_flag | 0x1
666        return bit_flag
667
668    cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] _seq_converter(
669            self,
670            sequence):
671        cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] seq
672        seq = np.empty(len(sequence), dtype=np.int8)
673        if self.is_protein:
674            for i, char in enumerate(sequence):
675                seq[i] = np_aa_table[ord(char)]
676        else:
677            for i, char in enumerate(sequence):
678                seq[i] = np_nt_table[ord(char)]
679        return seq
680
681    cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] \
682            _build_match_matrix(self, match_score, mismatch_score):
683        sequence_order = "ACGTN"
684        dict2d = {}
685        for row in sequence_order:
686            dict2d[row] = {}
687            for column in sequence_order:
688                if column == 'N' or row == 'N':
689                    dict2d[row][column] = 0
690                else:
691                    dict2d[row][column] = match_score if row == column \
692                        else mismatch_score
693        return self._convert_dict2d_to_matrix(dict2d)
694
695    cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] \
696            _convert_dict2d_to_matrix(self, dict2d):
697        if self.is_protein:
698            sequence_order = "ARNDCQEGHILKMFPSTWYVBZX*"
699        else:
700            sequence_order = "ACGTN"
701        cdef int i = 0
702        length = len(sequence_order)
703        cdef cnp.ndarray[cnp.int8_t, ndim = 1, mode = "c"] py_list_matrix = \
704            np.empty(length*length, dtype=np.int8)
705        for row in sequence_order:
706            for column in sequence_order:
707                py_list_matrix[i] = dict2d[row][column]
708                i += 1
709        return py_list_matrix
710