1 #include "bcftools.pysam.h"
2
3 /* str_finder.c -- Short Tandem Repeat finder.
4 Originally from Crumble (https://github.com/jkbonfield/crumble)
5
6 Copyright (C) 2015-2016, 2021 Genome Research Ltd.
7
8 Author: James Bonfield <jkb@sanger.ac.uk>
9
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE. */
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdint.h>
32 #include <ctype.h>
33
34 #include "str_finder.h"
35 #include "utlist.h"
36
37 #define MAX(a,b) ((a)>(b)?(a):(b))
38 #define MIN(a,b) ((a)<(b)?(a):(b))
39
40 typedef unsigned char uc;
41
add_rep(rep_ele ** list,char * cons,int clen,int pos,int rlen,int lower_only,unsigned int w)42 static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
43 int lower_only, unsigned int w) {
44 rep_ele *el, *tmp, *prev;
45 char *cp1, *cp2, *cp_end;
46 int i;
47
48 // Already handled this in previous overlap?
49 if (*list) {
50 tmp = DL_TAIL(*list);
51 if (tmp->start <= pos-rlen*2+1 && tmp->end >= pos)
52 return;
53 }
54
55 // Find current and last occurence of repeated word.
56
57 cp2 = &cons[pos+1];
58 // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen];
59
60 for (cp1 = &cons[pos], i = 1; i < rlen; cp1--) // compensate for pads
61 if (*cp1 == '*')
62 continue;
63 else
64 i++;
65 while (*cp1 == '*')
66 cp1--;
67
68
69 // Scan ahead to see how much further it goes.
70 cp_end = &cons[clen];
71 while (cp2 < cp_end) {
72 if (*cp1 != *cp2)
73 break;
74
75 w<<=2;
76 w|=*cp2;
77 cp1++;
78 cp2++;
79 }
80
81 if (!(el = malloc(sizeof(*el))))
82 return;
83
84 el->end = pos + cp2-&cons[pos+1];
85 el->rep_len = rlen;
86 pos++;
87 while (rlen--) {
88 while (cons[--pos] == '*');
89 while (cons[--pos] == '*');
90 }
91 //pos++;
92 while (pos > 1 && cons[pos-1] == '*') pos--;
93 el->start = pos;
94
95 // Check it meets the lower-case only criteria
96 if (lower_only) {
97 int lc = 0;
98 for (i = el->start; i <= el->end; i++) {
99 if (islower(cons[i])) {
100 lc = 1;
101 break;
102 }
103 }
104
105 if (!lc) {
106 free(el);
107 return;
108 }
109 }
110
111 // Remove any older items on the list that are entirely contained within el
112 if (*list) {
113 tmp = DL_TAIL(*list);
114 do {
115 prev = tmp->prev;
116 if (tmp->end < el->start)
117 break;
118
119 if (tmp->start >= el->start) {
120 DL_DELETE(*list, tmp);
121 free(tmp);
122 }
123
124 if (tmp == DL_HEAD(*list))
125 break;
126 tmp = prev;
127 } while (*list);
128 }
129
130 DL_APPEND(*list, el);
131
132 return;
133 }
134
135 /*
136 * Finds repeated homopolymers up to 8-mers.
137 * Note this assumes cons is 0-3, so N of 4 may rarely give false hits.
138 *
139 * Returns a list of rep_ele structs holding the start,end tuples of repeats;
140 * NULL on failure.
141 */
find_STR(char * cons,int len,int lower_only)142 rep_ele *find_STR(char *cons, int len, int lower_only) {
143 int i, j;
144 uint32_t w = 0;
145 rep_ele *reps = NULL;
146
147 for (i = j = 0; i < len && j < 15; i++) {
148 if (cons[i] == '*') continue;
149
150 w <<= 2;
151 w |= cons[i];
152 //printf("%3d %c w=%08x\n", i, cons[i], w);
153 if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003))
154 add_rep(&reps, cons, len, i, 1, lower_only, w);
155 if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f))
156 add_rep(&reps, cons, len, i, 2, lower_only, w);
157 if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f))
158 add_rep(&reps, cons, len, i, 3, lower_only, w);
159 if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff))
160 add_rep(&reps, cons, len, i, 4, lower_only, w);
161 if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff))
162 add_rep(&reps, cons, len, i, 5, lower_only, w);
163 if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff))
164 add_rep(&reps, cons, len, i, 6, lower_only, w);
165 if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff))
166 add_rep(&reps, cons, len, i, 7, lower_only, w);
167
168 j++;
169 }
170
171 for (; i < len; i++) {
172 if (cons[i] == '*') continue;
173
174 w <<= 2;
175 w |= cons[i];
176 //printf("%3d %c w=%08x\n", i, cons[i], w);
177 if ((w&0xffff) == ((w>>16)&0xffff))
178 add_rep(&reps, cons, len, i, 8, lower_only, w);
179 else if ((w&0x3fff) == ((w>>14)&0x3fff))
180 add_rep(&reps, cons, len, i, 7, lower_only, w);
181 else if ((w&0x0fff) == ((w>>12)&0x0fff))
182 add_rep(&reps, cons, len, i, 6, lower_only, w);
183 else if ((w&0x03ff) == ((w>>10)&0x03ff))
184 add_rep(&reps, cons, len, i, 5, lower_only, w);
185 else if ((w&0x00ff) == ((w>> 8)&0x00ff))
186 add_rep(&reps, cons, len, i, 4, lower_only, w);
187 else if ((w&0x003f) == ((w>> 6)&0x003f))
188 add_rep(&reps, cons, len, i, 3, lower_only, w);
189 else if ((w&0x000f) == ((w>> 4)&0x000f))
190 add_rep(&reps, cons, len, i, 2, lower_only, w);
191 else if ((w&0x0003) == ((w>> 2)&0x0003))
192 add_rep(&reps, cons, len, i, 1, lower_only, w);
193 }
194
195 return reps;
196 }
197
198 /* -----------------------------------------------------------------------------
199 * Computes repeat regions in the consensus and then provides a bit mask
200 * indicating the extend of the STRs.
201 *
202 * The purpose of this is to identify where a read needs to span the entire
203 * region in order to validate how many copies of a repeat word are present.
204 * This only really has a major impact when indels are involved.
205 *
206 * For example, given this multiple alignment:
207 *
208 * S1 GATCGGACGAGAG
209 * S2 GATCGGACGAGAGAGAGAGAGT
210 * S3 GATCGGACGAGAGAGAGAG**TCGGAC
211 * S4 GGACGAGAGAGAGAGAGTCGGAC
212 * S5 CGAGAGAGAGAG**TCGGAC
213 * S6 AGAGAGAGTCGGAC
214 *
215 * We have subseq of GAGAGAGAGAG** vs GAGAGAGAGAGAG. The first and last
216 * (S1 and S6) sequences do not span and so we do not know which allele they
217 * match. Specifically as the pad is at the right hand end, the alignment of
218 * S6 gives incorrect weight to the consensus as it is stating AG when it
219 * may actually be ** at that point.
220 *
221 * By identifying the repeats we can soft clip as follows:
222 *
223 * S1 GATCGGACgagag
224 * S2 GATCGGACGAGAGAGAGAGAGT
225 * S3 GATCGGACGAGAGAGAGAG**TCGGAC
226 * S4 GGACGAGAGAGAGAGAGTCGGAC
227 * S5 CGAGAGAGAGAG**TCGGAC
228 * S6 agagagagTCGGAC
229 *
230 * Returns an array of STR vs no-STR values.
231 * 0 => non repetitive.
232 * 1+ => repeat with consecutive bit-number for repeat size.
233 *
234 * Eg: AGGGGAGGAGAAGAC
235 * 1111 1111
236 * 2222222
237 * 444444
238 * => 011331137754440
239 */
cons_mark_STR(char * cons,int len,int lower_only)240 char *cons_mark_STR(char *cons, int len, int lower_only) {
241 rep_ele *reps, *elt, *tmp;
242 char *str;
243
244 str = calloc(1, len);
245 reps = find_STR(cons, len, lower_only);
246
247 DL_FOREACH_SAFE(reps, elt, tmp) {
248 int i, v = 0;
249
250 //printf("%2d .. %2d %.*s\n", elt->start, elt->end,
251 // elt->end - elt->start+1, &cons[elt->start]);
252
253 // What is there?
254 for (i = MAX(elt->start-1,0); i <= MIN(elt->end+1,len-1); i++)
255 v |= str[i];
256
257 for (i = 0; i < 8; i++) {
258 if (!(v&(1<<i)))
259 break;
260 }
261 v = (i == 8) ? 1 : (1<<i);
262
263 // Add new if available, or just overload 1 if not
264 for (i = elt->start; i <= elt->end; i++)
265 str[i] |= v;
266
267 DL_DELETE(reps, elt);
268 free(elt);
269 }
270
271 return str;
272 }
273