1import pytest
2
3from dnaio import Sequence
4from cutadapt.adapters import (
5    RemoveAfterMatch,
6    RemoveBeforeMatch,
7    FrontAdapter,
8    BackAdapter,
9    PrefixAdapter,
10    SuffixAdapter,
11    LinkedAdapter,
12    MultipleAdapters,
13    IndexedPrefixAdapters,
14    IndexedSuffixAdapters,
15)
16
17
def test_back_adapter_absolute_number_of_errors():
    """max_errors=1 on a 10-base adapter must give a max_error_rate of 1/10."""
    back = BackAdapter(sequence="GATCGGAAGA", max_errors=1, min_overlap=3)
    expected_rate = 1 / 10
    assert back.max_error_rate == expected_rate
25
26
def test_front_adapter_partial_occurrence_in_back():
    """A FrontAdapter must not match when only its start 'CTGAA' ends the read."""
    read = "GGGGGCTGAA"
    front = FrontAdapter("CTGAATT", max_errors=0, min_overlap=4)
    found = front.match_to(read)
    assert found is None
30
31
def test_back_adapter_partial_occurrence_in_front():
    """A BackAdapter must not match when only its tail 'AATT' starts the read."""
    read = "AATTGGGGGGG"
    back = BackAdapter("CTGAATT", max_errors=0, min_overlap=4)
    found = back.match_to(read)
    assert found is None
35
36
def test_issue_52():
    """Check the wildcard bases reported by a hand-built RemoveAfterMatch."""
    # The expected value should actually be 'CGGC' since the correct
    # alignment is this one:
    #
    # adapter         GAACTCCAGTCACNNNNN
    # mismatches           X     X
    # read       CCCCAGAACTACAGTC-CCGGC
    #
    # Since we do not keep the alignment, guessing 'GGC' is the best we
    # can currently do.
    read = "CCCCAGAACTACAGTCCCGGC"
    back = BackAdapter(
        sequence="GAACTCCAGTCACNNNNN",
        max_errors=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True,
    )
    hand_built = RemoveAfterMatch(
        astart=0,
        astop=17,
        rstart=5,
        rstop=21,
        matches=15,
        errors=2,
        adapter=back,
        sequence=read,
    )
    assert hand_built.wildcards() == "GGC"
59
60
def test_issue_80():
    """Pin the three-error alignment that is reported for this read."""
    # This turned out not to be a real problem with the alignment
    # algorithm. The alignment below is found because it has more matches
    # than the 'obvious' one:
    #
    # TCGTATGCCGTCTTC
    # =========X==XX=
    # TCGTATGCCCTC--C
    #
    # That is correct, if a little surprising, since an alignment without
    # indels would have only two errors.
    back = BackAdapter(
        sequence="TCGTATGCCGTCTTC",
        max_errors=0.2,
        min_overlap=3,
        read_wildcards=False,
        adapter_wildcards=False,
    )
    found = back.match_to("TCGTATGCCCTCC")
    # The match object is used as the assertion message to ease debugging.
    assert found.errors == 3, found
    assert found.astart == 0, found
    assert found.astop == 15, found
83
84
@pytest.mark.xfail(strict=True)
def test_back_adapter_indel_and_exact_occurrence():
    """Expected-to-fail check of which of two adapter occurrences is reported."""
    read = "GATCGTGAAGAGATCGGAAGA"
    back = BackAdapter(sequence="GATCGGAAGA", max_errors=0.1, min_overlap=3)
    found = back.match_to(read)
    # We want the leftmost of these two possible matches:
    # GATCGTGAAGAGATCGGAAGA
    # GATCG-GAAGA
    #            GATCGGAAGA
    #
    # NOTE(review): the values asserted below (no errors, read span 0..10) do
    # not look like the leftmost candidate drawn above, which contains one
    # indel and covers eleven read bases -- confirm the intended expectations
    # before removing the xfail marker.
    assert found.errors == 0
    assert found.matches == 10
    assert found.astart == 0
    assert found.astop == 10
    assert found.rstart == 0
    assert found.rstop == 10
103
104
def test_back_adapter_indel_and_mismatch_occurrence():
    """Of an indel occurrence and a mismatch occurrence, the first one is expected."""
    read = "CTGGATCGGAGAGCCGTAGATCGGGAGAGGC"
    back = BackAdapter(sequence="GATCGGAAGA", max_errors=0.1, min_overlap=3)
    found = back.match_to(read)
    # CTGGATCGGA-GAGCCGTAGATCGGGAGAGGC
    #    ||||||| ||      ||||||X|||
    #    GATCGGAAGA      GATCGGAAGA
    assert (found.errors, found.matches) == (1, 9)
    assert (found.astart, found.astop) == (0, 10)
    assert (found.rstart, found.rstop) == (3, 12)
121
122
def test_str():
    """Smoke test: str() must work on an adapter and on the result of match_to()."""
    adapter = BackAdapter("ACGT", max_errors=0.1)
    str(adapter)
    found = adapter.match_to("TTACGT")
    str(found)
127
128
def test_prefix_with_indels_one_mismatch():
    """With indels enabled, a prefix match with one error must span eight bases."""
    prefix = PrefixAdapter(
        sequence="GCACATCT",
        max_errors=0.15,
        min_overlap=1,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    found = prefix.match_to("GCACATCGGAA")
    assert (found.errors, found.matches) == (1, 7)
    assert (found.astart, found.astop) == (0, 8)
    assert (found.rstart, found.rstop) == (0, 8)
145
146
def test_prefix_with_indels_two_mismatches():
    """With indels enabled, a prefix match with two errors must span eight bases."""
    prefix = PrefixAdapter(
        sequence="GCACATTT",
        max_errors=0.3,
        min_overlap=1,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    found = prefix.match_to("GCACATCGGAA")
    assert (found.errors, found.matches) == (2, 6)
    assert (found.astart, found.astop) == (0, 8)
    assert (found.rstart, found.rstop) == (0, 8)
163
164
def test_linked_adapter():
    """A linked adapter keeps both parts' min_overlap and trims both ends of the read."""
    front = PrefixAdapter("AAAA", min_overlap=4)
    back = BackAdapter("TTTT", min_overlap=3)
    linked = LinkedAdapter(
        front,
        back,
        front_required=True,
        back_required=False,
        name="name",
    )
    assert linked.front_adapter.min_overlap == 4
    assert linked.back_adapter.min_overlap == 3

    read = Sequence(name="seq", sequence="AAAACCCCCTTTT")
    found = linked.match_to(read.sequence)
    trimmed = found.trimmed(read)
    assert trimmed.name == "seq"
    assert trimmed.sequence == "CCCCC"
178
179
def test_info_record():
    """get_info_records() of a hand-built match must return one 11-field record."""
    read = Sequence(name="abc", sequence="CCCCAGAACTACAGTCCCGGC")
    back = BackAdapter(
        sequence="GAACTCCAGTCACNNNNN",
        max_errors=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True,
        name="Foo",
    )
    hand_built = RemoveAfterMatch(
        astart=0,
        astop=17,
        rstart=5,
        rstop=21,
        matches=15,
        errors=2,
        adapter=back,
        sequence=read.sequence,
    )
    expected_record = ["", 2, 5, 21, "CCCCA", "GAACTACAGTCCCGGC", "", "Foo", "", "", ""]
    assert hand_built.get_info_records(read) == [expected_record]
204
205
def test_random_match_probabilities():
    """Compare random_match_probabilities() with hand-computed lists for 0.5 and 0.2."""
    single_base = BackAdapter("A", max_errors=0.1).create_statistics()
    assert single_base.back.random_match_probabilities(0.5) == [1, 0.25]
    assert single_base.back.random_match_probabilities(0.2) == [1, 0.4]

    # Expected lists for every four-character adapter used below; the
    # products are spelled out exactly as the values are compared with ==.
    expected_at_half = [1, 0.25, 0.25**2, 0.25**3, 0.25**4]
    expected_at_fifth = [1, 0.4, 0.4*0.1, 0.4*0.1*0.4, 0.4*0.1*0.4*0.1]

    for bases in ("ACTG", "XMWH"):
        back_stats = BackAdapter(bases, max_errors=0.1).create_statistics()
        assert back_stats.back.random_match_probabilities(0.5) == expected_at_half
        assert back_stats.back.random_match_probabilities(0.2) == expected_at_fifth

    front_stats = FrontAdapter("GTCA", max_errors=0.1).create_statistics()
    assert front_stats.front.random_match_probabilities(0.5) == expected_at_half
    assert front_stats.front.random_match_probabilities(0.2) == expected_at_fifth
219
220
def test_add_adapter_statistics():
    """Adding statistics with ``+=`` must sum the adjacent-base and error counts."""

    def make_filled_statistics(adjacent, errors):
        # Create fresh statistics and fill the 'back' end entry by entry.
        statistics = BackAdapter("A", name="name", max_errors=0.1).create_statistics()
        end = statistics.back
        for base, count in adjacent.items():
            end.adjacent_bases[base] = count
        for outer_key, inner in errors.items():
            for inner_key, count in inner.items():
                end.errors[outer_key][inner_key] = count
        return statistics

    stats = make_filled_statistics(
        adjacent={"A": 7, "C": 19, "G": 23, "T": 42, "": 45},
        errors={
            10: {0: 100, 1: 11, 2: 3},
            20: {0: 600, 1: 66, 2: 6},
        },
    )
    stats2 = make_filled_statistics(
        adjacent={"A": 43, "C": 31, "G": 27, "T": 8, "": 5},
        errors={
            10: {0: 234, 1: 14, 3: 5},
            15: {0: 90, 1: 17, 2: 2},
        },
    )

    stats += stats2
    merged = stats.back

    assert merged.adjacent_bases == {"A": 50, "C": 50, "G": 50, "T": 50, "": 50}
    assert merged.errors == {
        10: {0: 334, 1: 25, 2: 3, 3: 5},
        15: {0: 90, 1: 17, 2: 2},
        20: {0: 600, 1: 66, 2: 6},
    }
260
261
def test_linked_matches_property():
    """The matches property must be readable on a non-anchored linked adapter match."""
    # Regression test for issue #265
    front = FrontAdapter("GGG")
    back = BackAdapter("TTT")
    linked = LinkedAdapter(
        front,
        back,
        front_required=False,
        back_required=False,
        name="name",
    )
    found = linked.match_to("AAAATTTT")
    assert found.matches == 3
269
270
@pytest.mark.parametrize("adapter_class", [PrefixAdapter, SuffixAdapter])
def test_no_indels_empty_read(adapter_class):
    """Matching an empty read with indels disabled must not raise (issue #376)."""
    empty_read = ""
    no_indel_adapter = adapter_class("ACGT", indels=False)
    no_indel_adapter.match_to(empty_read)
276
277
def test_prefix_match_with_n_wildcard_in_read():
    """A no-indel prefix adapter starting with NNN must match read[0:7], also with an N in the read."""
    prefix = PrefixAdapter("NNNACGT", indels=False)
    for read in ("TTTACGTAAAA", "NTTACGTAAAA"):
        found = prefix.match_to(read)
        assert found is not None and (found.rstart, found.rstop) == (0, 7)
284
285
def test_suffix_match_with_n_wildcard_in_read():
    """A no-indel suffix adapter ending in NNN must match read[4:11], also with an N in the read."""
    suffix = SuffixAdapter("ACGTNNN", indels=False)
    for read in ("TTTTACGTTTT", "TTTTACGTCNC"):
        found = suffix.match_to(read)
        assert found is not None and (found.rstart, found.rstop) == (4, 11)
292
293
def test_multiple_adapters():
    """MultipleAdapters must report the second adapter as the one matching this read."""
    first = BackAdapter("GTAGTCCCGC")
    second = BackAdapter("GTAGTCCCCC")
    combined = MultipleAdapters([first, second])
    found = combined.match_to("ATACCCCTGTAGTCCCC")
    assert found.adapter is second
300
301
def test_indexed_prefix_adapters():
    """IndexedPrefixAdapters must return the matching adapter, or None for no match."""
    gaac = PrefixAdapter("GAAC", indels=False)
    tgct = PrefixAdapter("TGCT", indels=False)
    indexed = IndexedPrefixAdapters([gaac, tgct])

    found = indexed.match_to("GAACTT")
    assert found.adapter is gaac
    found = indexed.match_to("TGCTAA")
    assert found.adapter is tgct
    assert indexed.match_to("GGGGGGG") is None
313
314
def test_indexed_prefix_adapters_incorrect_type():
    """Passing a SuffixAdapter to IndexedPrefixAdapters must raise ValueError."""
    with pytest.raises(ValueError):
        # Everything is built inside the context so the same exceptions are caught.
        mixed = [
            PrefixAdapter("GAAC", indels=False),
            SuffixAdapter("TGCT", indels=False),
        ]
        IndexedPrefixAdapters(mixed)
321
322
def test_indexed_very_similar(caplog):
    """Indexing two adapters that differ in one base must log an ambiguity message."""
    near_duplicates = [
        PrefixAdapter("GAAC", max_errors=1, indels=False),
        PrefixAdapter("GAAG", max_errors=1, indels=False),
    ]
    IndexedPrefixAdapters(near_duplicates)
    captured = caplog.text
    assert "cannot be assigned uniquely" in captured
329
330
def test_indexed_too_high_k():
    """Indexing these adapters must raise ValueError mentioning 'Error rate too high'."""
    with pytest.raises(ValueError) as excinfo:
        # Everything is built inside the context so the same exceptions are caught.
        candidates = [
            PrefixAdapter("ACGTACGT", max_errors=3, indels=False),
            PrefixAdapter("AAGGTTCC", max_errors=2, indels=False),
        ]
        IndexedPrefixAdapters(candidates)
    message = excinfo.value.args[0]
    assert "Error rate too high" in message
338
339
def test_indexed_suffix_adapters():
    """IndexedSuffixAdapters must return the adapter found at the end of the read."""
    gaac = SuffixAdapter("GAAC", indels=False)
    tgct = SuffixAdapter("TGCT", indels=False)
    indexed = IndexedSuffixAdapters([gaac, tgct])

    found = indexed.match_to("TTGAAC")
    assert found.adapter is gaac
    found = indexed.match_to("AATGCT")
    assert found.adapter is tgct
350
351
def test_indexed_suffix_adapters_incorrect_type():
    """Passing a PrefixAdapter to IndexedSuffixAdapters must raise ValueError."""
    with pytest.raises(ValueError):
        # Everything is built inside the context so the same exceptions are caught.
        mixed = [
            SuffixAdapter("GAAC", indels=False),
            PrefixAdapter("TGCT", indels=False),
        ]
        IndexedSuffixAdapters(mixed)
358
359
def test_multi_prefix_adapter_with_indels():
    """Indexed prefix adapters with indels enabled must still pick the right adapter."""
    gtac = PrefixAdapter("GTAC", max_errors=1, indels=True)
    tgct = PrefixAdapter("TGCT", max_errors=1, indels=True)
    indexed = IndexedPrefixAdapters([gtac, tgct])

    found = indexed.match_to("GATACGGG")
    assert found.adapter is gtac
    found = indexed.match_to("TAGCTAA")
    assert found.adapter is tgct
370
371
def test_indexed_prefix_adapters_with_n_wildcard():
    """Replacing any single adapter base in the read by N must count as one error."""
    sequence = "GGTCCAGA"
    indexed = IndexedPrefixAdapters(
        [PrefixAdapter(sequence, max_errors=1, indels=False)]
    )
    for position in range(len(sequence)):
        # Put an N at `position` and append a fixed tail after the adapter.
        bases = list(sequence)
        bases[position] = "N"
        read = "".join(bases) + "TGCT"

        found = indexed.match_to(read)
        assert isinstance(found, RemoveBeforeMatch)
        assert (found.rstart, found.rstop) == (0, 8)
        assert (found.errors, found.matches) == (1, 7)
382