1 #include "bcftools.pysam.h"
2 
3 /*  str_finder.c -- Short Tandem Repeat finder.
4     Originally from Crumble (https://github.com/jkbonfield/crumble)
5 
6     Copyright (C) 2015-2016, 2021 Genome Research Ltd.
7 
8     Author: James Bonfield <jkb@sanger.ac.uk>
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.  */
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdint.h>
32 #include <ctype.h>
33 
34 #include "str_finder.h"
35 #include "utlist.h"
36 
37 #define MAX(a,b) ((a)>(b)?(a):(b))
38 #define MIN(a,b) ((a)<(b)?(a):(b))
39 
40 typedef unsigned char uc;
41 
add_rep(rep_ele ** list,char * cons,int clen,int pos,int rlen,int lower_only,unsigned int w)42 static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen,
43 		    int lower_only, unsigned int w) {
44     rep_ele *el, *tmp, *prev;
45     char *cp1, *cp2, *cp_end;
46     int i;
47 
48     // Already handled this in previous overlap?
49     if (*list) {
50 	tmp = DL_TAIL(*list);
51 	if (tmp->start <= pos-rlen*2+1 && tmp->end >= pos)
52 	    return;
53     }
54 
55     // Find current and last occurence of repeated word.
56 
57     cp2 = &cons[pos+1];
58     // If unpadded, this is quicker: cp1 = &cons[pos+1-rlen];
59 
60     for (cp1 = &cons[pos], i = 1; i < rlen; cp1--) // compensate for pads
61 	if (*cp1 == '*')
62 	    continue;
63 	else
64 	    i++;
65     while (*cp1 == '*')
66 	cp1--;
67 
68 
69     // Scan ahead to see how much further it goes.
70     cp_end = &cons[clen];
71     while (cp2 < cp_end) {
72 	if (*cp1 != *cp2)
73 	    break;
74 
75 	w<<=2;
76 	w|=*cp2;
77 	cp1++;
78 	cp2++;
79     }
80 
81     if (!(el = malloc(sizeof(*el))))
82 	return;
83 
84     el->end   = pos + cp2-&cons[pos+1];
85     el->rep_len = rlen;
86     pos++;
87     while (rlen--) {
88 	while (cons[--pos] == '*');
89 	while (cons[--pos] == '*');
90     }
91     //pos++;
92     while (pos > 1 && cons[pos-1] == '*') pos--;
93     el->start = pos;
94 
95     // Check it meets the lower-case only criteria
96     if (lower_only) {
97 	int lc = 0;
98 	for (i = el->start; i <= el->end; i++) {
99 	    if (islower(cons[i])) {
100 		lc = 1;
101 		break;
102 	    }
103 	}
104 
105 	if (!lc) {
106 	    free(el);
107 	    return;
108 	}
109     }
110 
111     // Remove any older items on the list that are entirely contained within el
112     if (*list) {
113 	tmp = DL_TAIL(*list);
114 	do {
115 	    prev = tmp->prev;
116 	    if (tmp->end < el->start)
117 		break;
118 
119 	    if (tmp->start >= el->start) {
120 		DL_DELETE(*list, tmp);
121 		free(tmp);
122 	    }
123 
124 	    if (tmp == DL_HEAD(*list))
125 		break;
126 	    tmp = prev;
127 	} while (*list);
128     }
129 
130     DL_APPEND(*list, el);
131 
132     return;
133 }
134 
135 /*
136  * Finds repeated homopolymers up to 8-mers.
137  * Note this assumes cons is 0-3, so N of 4 may rarely give false hits.
138  *
139  * Returns a list of rep_ele structs holding the start,end tuples of repeats;
140  *         NULL on failure.
141  */
find_STR(char * cons,int len,int lower_only)142 rep_ele *find_STR(char *cons, int len, int lower_only) {
143     int i, j;
144     uint32_t w = 0;
145     rep_ele *reps = NULL;
146 
147     for (i = j = 0; i < len && j < 15; i++) {
148 	if (cons[i] == '*') continue;
149 
150 	w <<= 2;
151 	w |= cons[i];
152 	//printf("%3d %c w=%08x\n", i, cons[i], w);
153 	if (j>= 1 && (w&0x0003) == ((w>> 2)&0x0003))
154 	    add_rep(&reps, cons, len, i, 1, lower_only, w);
155 	if (j>= 3 && (w&0x000f) == ((w>> 4)&0x000f))
156 	    add_rep(&reps, cons, len, i, 2, lower_only, w);
157 	if (j>= 5 && (w&0x003f) == ((w>> 6)&0x003f))
158 	    add_rep(&reps, cons, len, i, 3, lower_only, w);
159 	if (j>= 7 && (w&0x00ff) == ((w>> 8)&0x00ff))
160 	    add_rep(&reps, cons, len, i, 4, lower_only, w);
161 	if (j>= 9 && (w&0x03ff) == ((w>>10)&0x03ff))
162 	    add_rep(&reps, cons, len, i, 5, lower_only, w);
163 	if (j>=11 && (w&0x0fff) == ((w>>12)&0x0fff))
164 	    add_rep(&reps, cons, len, i, 6, lower_only, w);
165 	if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff))
166 	    add_rep(&reps, cons, len, i, 7, lower_only, w);
167 
168 	j++;
169     }
170 
171     for (; i < len; i++) {
172 	if (cons[i] == '*') continue;
173 
174 	w <<= 2;
175 	w |= cons[i];
176 	//printf("%3d %c w=%08x\n", i, cons[i], w);
177 	if ((w&0xffff) == ((w>>16)&0xffff))
178 	    add_rep(&reps, cons, len, i, 8, lower_only, w);
179 	else if ((w&0x3fff) == ((w>>14)&0x3fff))
180 	    add_rep(&reps, cons, len, i, 7, lower_only, w);
181 	else if ((w&0x0fff) == ((w>>12)&0x0fff))
182 	    add_rep(&reps, cons, len, i, 6, lower_only, w);
183 	else if ((w&0x03ff) == ((w>>10)&0x03ff))
184 	    add_rep(&reps, cons, len, i, 5, lower_only, w);
185 	else if ((w&0x00ff) == ((w>> 8)&0x00ff))
186 	    add_rep(&reps, cons, len, i, 4, lower_only, w);
187 	else if ((w&0x003f) == ((w>> 6)&0x003f))
188 	    add_rep(&reps, cons, len, i, 3, lower_only, w);
189 	else if ((w&0x000f) == ((w>> 4)&0x000f))
190 	    add_rep(&reps, cons, len, i, 2, lower_only, w);
191 	else if ((w&0x0003) == ((w>> 2)&0x0003))
192 	    add_rep(&reps, cons, len, i, 1, lower_only, w);
193     }
194 
195     return reps;
196 }
197 
198 /* -----------------------------------------------------------------------------
199  * Computes repeat regions in the consensus and then provides a bit mask
200  * indicating the extend of the STRs.
201  *
202  * The purpose of this is to identify where a read needs to span the entire
203  * region in order to validate how many copies of a repeat word are present.
204  * This only really has a major impact when indels are involved.
205  *
206  * For example, given this multiple alignment:
207  *
208  * S1 GATCGGACGAGAG
209  * S2 GATCGGACGAGAGAGAGAGAGT
210  * S3 GATCGGACGAGAGAGAGAG**TCGGAC
211  * S4     GGACGAGAGAGAGAGAGTCGGAC
212  * S5        CGAGAGAGAGAG**TCGGAC
213  * S6              AGAGAGAGTCGGAC
214  *
215  * We have subseq of GAGAGAGAGAG** vs GAGAGAGAGAGAG. The first and last
216  * (S1 and S6) sequences do not span and so we do not know which allele they
217  * match. Specifically as the pad is at the right hand end, the alignment of
218  * S6 gives incorrect weight to the consensus as it is stating AG when it
219  * may actually be ** at that point.
220  *
221  * By identifying the repeats we can soft clip as follows:
222  *
223  * S1 GATCGGACgagag
224  * S2 GATCGGACGAGAGAGAGAGAGT
225  * S3 GATCGGACGAGAGAGAGAG**TCGGAC
226  * S4     GGACGAGAGAGAGAGAGTCGGAC
227  * S5        CGAGAGAGAGAG**TCGGAC
228  * S6              agagagagTCGGAC
229  *
230  * Returns an array of STR vs no-STR values.
231  *         0  => non repetitive.
232  *         1+ => repeat with consecutive bit-number for repeat size.
233  *
234  * Eg:  AGGGGAGGAGAAGAC
235  *       1111  1111
236  *         2222222
237  *              444444
238  * =>   011331137754440
239  */
cons_mark_STR(char * cons,int len,int lower_only)240 char *cons_mark_STR(char *cons, int len, int lower_only) {
241     rep_ele *reps, *elt, *tmp;
242     char *str;
243 
244     str = calloc(1, len);
245     reps = find_STR(cons, len, lower_only);
246 
247     DL_FOREACH_SAFE(reps, elt, tmp) {
248 	int i, v = 0;
249 
250 	//printf("%2d .. %2d %.*s\n", elt->start, elt->end,
251 	//       elt->end - elt->start+1, &cons[elt->start]);
252 
253 	// What is there?
254 	for (i = MAX(elt->start-1,0); i <= MIN(elt->end+1,len-1); i++)
255 	    v |= str[i];
256 
257 	for (i = 0; i < 8; i++) {
258 	    if (!(v&(1<<i)))
259 		break;
260 	}
261 	v = (i == 8) ? 1 : (1<<i);
262 
263 	// Add new if available, or just overload 1 if not
264 	for (i = elt->start; i <= elt->end; i++)
265 	    str[i] |= v;
266 
267 	DL_DELETE(reps, elt);
268 	free(elt);
269     }
270 
271     return str;
272 }
273