import pytest

from dnaio import Sequence
from cutadapt.adapters import (
    RemoveAfterMatch,
    RemoveBeforeMatch,
    FrontAdapter,
    BackAdapter,
    PrefixAdapter,
    SuffixAdapter,
    LinkedAdapter,
    MultipleAdapters,
    IndexedPrefixAdapters,
    IndexedSuffixAdapters,
)


def test_back_adapter_absolute_number_of_errors():
    """max_errors=1 on a 10-base adapter is reported as a max_error_rate of 1/10."""
    back = BackAdapter(sequence="GATCGGAAGA", max_errors=1, min_overlap=3)
    assert back.max_error_rate == 1 / 10


def test_front_adapter_partial_occurrence_in_back():
    """A front adapter must not match when only its prefix occurs at the read's end."""
    front = FrontAdapter("CTGAATT", max_errors=0, min_overlap=4)
    assert front.match_to("GGGGGCTGAA") is None


def test_back_adapter_partial_occurrence_in_front():
    """A back adapter must not match when only its suffix occurs at the read's start."""
    back = BackAdapter("CTGAATT", max_errors=0, min_overlap=4)
    assert back.match_to("AATTGGGGGGG") is None


def test_issue_52():
    """Pin the wildcard characters reported for a hand-built match (issue 52)."""
    read_sequence = "CCCCAGAACTACAGTCCCGGC"
    back = BackAdapter(
        sequence="GAACTCCAGTCACNNNNN",
        max_errors=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True,
    )
    hit = RemoveAfterMatch(
        astart=0,
        astop=17,
        rstart=5,
        rstop=21,
        matches=15,
        errors=2,
        adapter=back,
        sequence=read_sequence,
    )
    assert hit.wildcards() == "GGC"
    # The result above should actually be 'CGGC' since the correct
    # alignment is this one:
    #
    # adapter         GAACTCCAGTCACNNNNN
    # mismatches           X     X
    # read       CCCCAGAACTACAGTC-CCGGC
    #
    # Since we do not keep the alignment, guessing 'GGC' is the best we
    # can currently do.


def test_issue_80():
    """Pin the three-error alignment that is preferred for issue 80."""
    # This issue turned out to not be an actual issue with the alignment
    # algorithm. The following alignment is found because it has more matches
    # than the 'obvious' one:
    #
    # TCGTATGCCGTCTTC
    # =========X==XX=
    # TCGTATGCCCTC--C
    #
    # This is correct, albeit a little surprising, since an alignment without
    # indels would have only two errors.
    back = BackAdapter(
        sequence="TCGTATGCCGTCTTC",
        max_errors=0.2,
        min_overlap=3,
        read_wildcards=False,
        adapter_wildcards=False,
    )
    hit = back.match_to("TCGTATGCCCTCC")
    assert hit.errors == 3, hit
    assert hit.astart == 0, hit
    assert hit.astop == 15, hit


@pytest.mark.xfail(strict=True)
def test_back_adapter_indel_and_exact_occurrence():
    """Expected failure: read with an indel occurrence followed by an exact one."""
    back = BackAdapter(sequence="GATCGGAAGA", max_errors=0.1, min_overlap=3)
    hit = back.match_to("GATCGTGAAGAGATCGGAAGA")
    # We want the leftmost match of these two possible ones:
    # GATCGTGAAGAGATCGGAAGA
    # GATCG-GAAGA
    #            GATCGGAAGA
    assert hit.errors == 0
    assert hit.matches == 10
    assert hit.astart == 0
    assert hit.astop == 10
    assert hit.rstart == 0
    assert hit.rstop == 10


def test_back_adapter_indel_and_mismatch_occurrence():
    """With an indel and a mismatch occurrence, the earlier (indel) one is reported."""
    back = BackAdapter(sequence="GATCGGAAGA", max_errors=0.1, min_overlap=3)
    hit = back.match_to("CTGGATCGGAGAGCCGTAGATCGGGAGAGGC")
    # CTGGATCGGA-GAGCCGTAGATCGGGAGAGGC
    #    ||||||| ||      ||||||X|||
    #    GATCGGAAGA      GATCGGAAGA
    assert hit.errors == 1
    assert hit.matches == 9
    assert hit.astart == 0
    assert hit.astop == 10
    assert hit.rstart == 3
    assert hit.rstop == 12


def test_str():
    """str() of an adapter and of one of its matches must not raise."""
    back = BackAdapter("ACGT", max_errors=0.1)
    str(back)
    str(back.match_to("TTACGT"))


def test_prefix_with_indels_one_mismatch():
    """Prefix adapter with indels enabled: full-length match with one error."""
    prefix = PrefixAdapter(
        sequence="GCACATCT",
        max_errors=0.15,
        min_overlap=1,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    hit = prefix.match_to("GCACATCGGAA")
    assert hit.errors == 1
    assert hit.matches == 7
    assert hit.astart == 0
    assert hit.astop == 8
    assert hit.rstart == 0
    assert hit.rstop == 8


def test_prefix_with_indels_two_mismatches():
    """Prefix adapter with indels enabled: full-length match with two errors."""
    prefix = PrefixAdapter(
        sequence="GCACATTT",
        max_errors=0.3,
        min_overlap=1,
        read_wildcards=False,
        adapter_wildcards=False,
        indels=True,
    )
    hit = prefix.match_to("GCACATCGGAA")
    assert hit.errors == 2
    assert hit.matches == 6
    assert hit.astart == 0
    assert hit.astop == 8
    assert hit.rstart == 0
    assert hit.rstop == 8


def test_linked_adapter():
    """A linked adapter keeps its parts' min_overlap and trims both ends of a read."""
    front = PrefixAdapter("AAAA", min_overlap=4)
    back = BackAdapter("TTTT", min_overlap=3)
    linked = LinkedAdapter(
        front, back, front_required=True, back_required=False, name="name"
    )
    assert linked.front_adapter.min_overlap == 4
    assert linked.back_adapter.min_overlap == 3

    read = Sequence(name="seq", sequence="AAAACCCCCTTTT")
    linked_match = linked.match_to(read.sequence)
    trimmed = linked_match.trimmed(read)
    assert trimmed.name == "seq"
    assert trimmed.sequence == "CCCCC"


def test_info_record():
    """get_info_records() yields a single row with the expected fields."""
    back = BackAdapter(
        sequence="GAACTCCAGTCACNNNNN",
        max_errors=0.12,
        min_overlap=5,
        read_wildcards=False,
        adapter_wildcards=True,
        name="Foo",
    )
    read = Sequence(name="abc", sequence="CCCCAGAACTACAGTCCCGGC")
    hit = RemoveAfterMatch(
        astart=0,
        astop=17,
        rstart=5,
        rstop=21,
        matches=15,
        errors=2,
        adapter=back,
        sequence=read.sequence,
    )
    expected_row = [
        "",
        2,
        5,
        21,
        "CCCCA",
        "GAACTACAGTCCCGGC",
        "",
        "Foo",
        "",
        "",
        "",
    ]
    assert hit.get_info_records(read) == [expected_row]


def test_random_match_probabilities():
    """Pin random_match_probabilities() for back and front adapter statistics."""
    single = BackAdapter("A", max_errors=0.1).create_statistics()
    assert single.back.random_match_probabilities(0.5) == [1, 0.25]
    assert single.back.random_match_probabilities(0.2) == [1, 0.4]

    # Expected values for four-base adapters; the expressions are kept exactly
    # as written because the comparisons below use exact float equality.
    expected_at_half = [1, 0.25, 0.25**2, 0.25**3, 0.25**4]
    expected_at_fifth = [1, 0.4, 0.4*0.1, 0.4*0.1*0.4, 0.4*0.1*0.4*0.1]

    for adapter_sequence in ("ACTG", "XMWH"):
        stats = BackAdapter(adapter_sequence, max_errors=0.1).create_statistics()
        assert stats.back.random_match_probabilities(0.5) == expected_at_half
        assert stats.back.random_match_probabilities(0.2) == expected_at_fifth

    front_stats = FrontAdapter("GTCA", max_errors=0.1).create_statistics()
    assert front_stats.front.random_match_probabilities(0.5) == expected_at_half
    assert front_stats.front.random_match_probabilities(0.2) == expected_at_fifth


def test_add_adapter_statistics():
    """In-place addition of two statistics objects sums base counts and error counts."""
    stats = BackAdapter("A", name="name", max_errors=0.1).create_statistics()
    end_stats = stats.back
    for base, count in (("A", 7), ("C", 19), ("G", 23), ("T", 42), ("", 45)):
        end_stats.adjacent_bases[base] = count
    for length, n_errors, count in (
        (10, 0, 100),
        (10, 1, 11),
        (10, 2, 3),
        (20, 0, 600),
        (20, 1, 66),
        (20, 2, 6),
    ):
        end_stats.errors[length][n_errors] = count

    stats2 = BackAdapter("A", name="name", max_errors=0.1).create_statistics()
    end_stats2 = stats2.back
    for base, count in (("A", 43), ("C", 31), ("G", 27), ("T", 8), ("", 5)):
        end_stats2.adjacent_bases[base] = count
    for length, n_errors, count in (
        (10, 0, 234),
        (10, 1, 14),
        (10, 3, 5),
        (15, 0, 90),
        (15, 1, 17),
        (15, 2, 2),
    ):
        end_stats2.errors[length][n_errors] = count

    stats += stats2
    merged = stats.back

    assert merged.adjacent_bases == {"A": 50, "C": 50, "G": 50, "T": 50, "": 50}
    assert merged.errors == {
        10: {0: 334, 1: 25, 2: 3, 3: 5},
        15: {0: 90, 1: 17, 2: 2},
        20: {0: 600, 1: 66, 2: 6},
    }


def test_linked_matches_property():
    """The matches property is accessible on a non-anchored linked adapter match."""
    # Issue #265
    front = FrontAdapter("GGG")
    back = BackAdapter("TTT")
    linked = LinkedAdapter(
        front, back, front_required=False, back_required=False, name="name"
    )
    assert linked.match_to("AAAATTTT").matches == 3


@pytest.mark.parametrize("adapter_class", [PrefixAdapter, SuffixAdapter])
def test_no_indels_empty_read(adapter_class):
    """Matching an empty read against an indel-free anchored adapter must not raise."""
    # Issue #376
    anchored = adapter_class("ACGT", indels=False)
    anchored.match_to("")


def test_prefix_match_with_n_wildcard_in_read():
    """A prefix adapter starting with NNN matches reads with or without a leading N."""
    prefix = PrefixAdapter("NNNACGT", indels=False)
    for read in ("TTTACGTAAAA", "NTTACGTAAAA"):
        hit = prefix.match_to(read)
        assert hit is not None
        assert (hit.rstart, hit.rstop) == (0, 7)


def test_suffix_match_with_n_wildcard_in_read():
    """A suffix adapter ending in NNN matches reads with or without an N near the end."""
    suffix = SuffixAdapter("ACGTNNN", indels=False)
    for read in ("TTTTACGTTTT", "TTTTACGTCNC"):
        hit = suffix.match_to(read)
        assert hit is not None
        assert (hit.rstart, hit.rstop) == (4, 11)


def test_multiple_adapters():
    """MultipleAdapters reports the second of two similar adapters for this read."""
    first = BackAdapter("GTAGTCCCGC")
    second = BackAdapter("GTAGTCCCCC")
    combined = MultipleAdapters([first, second])
    hit = combined.match_to("ATACCCCTGTAGTCCCC")
    assert hit.adapter is second


def test_indexed_prefix_adapters():
    """Indexed prefix lookup returns the matching adapter, or None without a match."""
    adapters = [
        PrefixAdapter("GAAC", indels=False),
        PrefixAdapter("TGCT", indels=False),
    ]
    indexed = IndexedPrefixAdapters(adapters)
    for read, expected in zip(("GAACTT", "TGCTAA"), adapters):
        assert indexed.match_to(read).adapter is expected
    assert indexed.match_to("GGGGGGG") is None


def test_indexed_prefix_adapters_incorrect_type():
    """IndexedPrefixAdapters raises ValueError when given a SuffixAdapter."""
    mixed = [
        PrefixAdapter("GAAC", indels=False),
        SuffixAdapter("TGCT", indels=False),
    ]
    with pytest.raises(ValueError):
        IndexedPrefixAdapters(mixed)


def test_indexed_very_similar(caplog):
    """Indexing two adapters that differ in one base logs an ambiguity message."""
    near_duplicates = [
        PrefixAdapter("GAAC", max_errors=1, indels=False),
        PrefixAdapter("GAAG", max_errors=1, indels=False),
    ]
    IndexedPrefixAdapters(near_duplicates)
    assert "cannot be assigned uniquely" in caplog.text


def test_indexed_too_high_k():
    """Indexing adapters with max_errors of 3 and 2 is rejected with a ValueError."""
    too_permissive = [
        PrefixAdapter("ACGTACGT", max_errors=3, indels=False),
        PrefixAdapter("AAGGTTCC", max_errors=2, indels=False),
    ]
    with pytest.raises(ValueError) as excinfo:
        IndexedPrefixAdapters(too_permissive)
    # Checked after the with-block: inside it, this line would never execute.
    assert "Error rate too high" in excinfo.value.args[0]


def test_indexed_suffix_adapters():
    """Indexed suffix lookup returns the adapter whose sequence ends the read."""
    adapters = [
        SuffixAdapter("GAAC", indels=False),
        SuffixAdapter("TGCT", indels=False),
    ]
    indexed = IndexedSuffixAdapters(adapters)
    for read, expected in zip(("TTGAAC", "AATGCT"), adapters):
        assert indexed.match_to(read).adapter is expected


def test_indexed_suffix_adapters_incorrect_type():
    """IndexedSuffixAdapters raises ValueError when given a PrefixAdapter."""
    mixed = [
        SuffixAdapter("GAAC", indels=False),
        PrefixAdapter("TGCT", indels=False),
    ]
    with pytest.raises(ValueError):
        IndexedSuffixAdapters(mixed)


def test_multi_prefix_adapter_with_indels():
    """Indexed prefix adapters with indels enabled still find reads with one inserted base."""
    adapters = [
        PrefixAdapter("GTAC", max_errors=1, indels=True),
        PrefixAdapter("TGCT", max_errors=1, indels=True),
    ]
    indexed = IndexedPrefixAdapters(adapters)
    for read, expected in zip(("GATACGGG", "TAGCTAA"), adapters):
        assert indexed.match_to(read).adapter is expected


def test_indexed_prefix_adapters_with_n_wildcard():
    """Replacing any single adapter base in the read with N gives a one-error match."""
    sequence = "GGTCCAGA"
    indexed = IndexedPrefixAdapters(
        [PrefixAdapter(sequence, max_errors=1, indels=False)]
    )
    for pos in range(len(sequence)):
        # Substitute the base at `pos` with N and append unrelated trailing bases.
        read = f"{sequence[:pos]}N{sequence[pos + 1:]}TGCT"
        hit = indexed.match_to(read)
        assert isinstance(hit, RemoveBeforeMatch)
        assert (hit.rstart, hit.rstop) == (0, 8)
        assert hit.errors == 1
        assert hit.matches == 7