1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include <search/ref-variation.h>
28 #include <search/smith-waterman.h>
29
30 #include <klib/rc.h>
31 #include <klib/refcount.h>
32 #include <klib/text.h>
33
34 #include <insdc/insdc.h>
35
36 #include <stdlib.h>
37 #include <string.h>
38 #include <ctype.h>
39
40 #if WINDOWS
41 #include <intrin.h>
42 #ifndef __builtin_popcount
43 #define __builtin_popcount __popcnt
44 #endif
45 #endif
46
47 #include <sysalloc.h>
48
49 #ifndef min
50 #define min(x,y) ((y) < (x) ? (y) : (x))
51 #endif
52
53 #ifndef max
54 #define max(x,y) ((y) >= (x) ? (y) : (x))
55 #endif
56
57 #define max4(x1, x2, x3, x4) (max( max((x1),(x2)), max((x3),(x4)) ))
58
59 #define COMPARE_4NA 0
60 #define CACHE_MAX_ROWS 0 /* and columns as well */
61 #define GAP_SCORE_LINEAR 0
62 #define SIMILARITY_MATCH 2
63 #define SIMILARITY_MISMATCH -1
64 #define SW_DEBUG_PRINT 0
65
66 struct RefVariation
67 {
68 KRefcount refcount;
69
70 INSDC_dna_text* var_buffer; /* in the case of deletion
71 it contains <ref_base_before><allele><ref_base_after>
72 otherwise it contains allele only */
73 INSDC_dna_text const* allele; /* points to allele in the var_buffer */
74 size_t allele_start;
75 size_t var_buffer_size;
76 size_t allele_size;
77 size_t allele_len_on_ref;
78 };
79
80
81 #if COMPARE_4NA == 1
82
83 unsigned char const map_char_to_4na [256] =
84 {
85 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
86 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
87 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
90 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
91 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
92 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
93 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
94 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
98 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
101 };
102
compare_4na(INSDC_dna_text ch2na,INSDC_dna_text ch4na)103 static int compare_4na ( INSDC_dna_text ch2na, INSDC_dna_text ch4na )
104 {
105 unsigned char bits4na = map_char_to_4na [(unsigned char)ch4na];
106 unsigned char bits2na = map_char_to_4na [(unsigned char)ch2na];
107
108 /*return (bits2na & bits4na) != 0 ? 2 : -1;*/
109
110 unsigned char popcnt4na = (unsigned char) __builtin_popcount ( bits4na );
111
112 return (bits2na & bits4na) != 0 ? 12 / popcnt4na : -6;
113 }
114 #endif
115
similarity_func(INSDC_dna_text ch2na,INSDC_dna_text ch4na)116 static int similarity_func (INSDC_dna_text ch2na, INSDC_dna_text ch4na)
117 {
118 #if COMPARE_4NA == 1
119 return compare_4na ( ch2na, ch4na );
120 #else
121 return tolower(ch2na) == tolower(ch4na) ? SIMILARITY_MATCH : SIMILARITY_MISMATCH;
122 #endif
123 }
124
gap_score_const(size_t idx)125 static int gap_score_const ( size_t idx )
126 {
127 return -1;
128 }
129
gap_score_linear(size_t idx)130 static int gap_score_linear ( size_t idx )
131 {
132 #if COMPARE_4NA == 1
133 return -6*(int)idx;
134 #else
135 return -(int)idx;
136 #endif
137 }
138
139 static int (*gap_score_func) (size_t ) =
140 #if GAP_SCORE_LINEAR != 0
141 gap_score_linear
142 #else
143 gap_score_const
144 #endif
145 ;
146
147 typedef struct ValueIndexPair
148 {
149 size_t index;
150 int value;
151 } ValueIndexPair;
152
153
get_char(INSDC_dna_text const * str,size_t size,size_t pos,bool reverse)154 static char get_char (INSDC_dna_text const* str, size_t size, size_t pos, bool reverse)
155 {
156 if ( !reverse )
157 return str [pos];
158 else
159 return str [size - pos - 1];
160 }
161
calculate_similarity_matrix(INSDC_dna_text const * text,size_t size_text,INSDC_dna_text const * query,size_t size_query,bool gap_score_constant,int * matrix,bool reverse,int * max_score,size_t * max_row,size_t * max_col)162 rc_t calculate_similarity_matrix (
163 INSDC_dna_text const* text, size_t size_text,
164 INSDC_dna_text const* query, size_t size_query,
165 bool gap_score_constant,
166 int* matrix, bool reverse,
167 int* max_score, size_t* max_row, size_t* max_col )
168 {
169
170 size_t ROWS = size_text + 1;
171 size_t COLUMNS = size_query + 1;
172 size_t i, j;
173
174 /* arrays to store maximums for all previous rows and columns */
175 #if CACHE_MAX_ROWS != 0
176
177 ValueIndexPair* vec_max_cols = NULL;
178 ValueIndexPair* vec_max_rows = NULL;
179
180 vec_max_cols = calloc ( COLUMNS, sizeof vec_max_cols [ 0 ] );
181 if ( vec_max_cols == NULL)
182 return RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
183
184 vec_max_rows = calloc ( ROWS, sizeof vec_max_rows [ 0 ] );
185 if ( vec_max_rows == NULL)
186 {
187 free (vec_max_cols);
188 return RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
189 }
190
191 #endif
192 gap_score_func = gap_score_constant ? gap_score_const : gap_score_linear;
193
194 if ( max_score != NULL )
195 {
196 *max_score = 0;
197 }
198 if ( max_row != NULL )
199 {
200 *max_row = 0;
201 }
202 if ( max_col != NULL )
203 {
204 *max_col = 0;
205 }
206 // init 1st row and column with zeros
207 memset ( matrix, 0, COLUMNS * sizeof(matrix[0]) );
208 for ( i = 1; i < ROWS; ++i )
209 matrix [i * COLUMNS] = 0;
210
211 for ( i = 1; i < ROWS; ++i )
212 {
213 for ( j = 1; j < COLUMNS; ++j )
214 {
215 #if CACHE_MAX_ROWS == 0
216 size_t k, l;
217 #endif
218 int cur_score_del, cur_score_ins;
219 int sim = similarity_func (
220 get_char (text, size_text, i-1, reverse),
221 get_char (query, size_query, j-1, reverse) );
222
223 #if CACHE_MAX_ROWS != 0
224 /* TODO: incorrect logic: we cache max{matrix[x,y]}
225 instead of max{matrix[x,y] + gap_score_func(v)}
226 when it's fixed we probably will need to make adjustments here
227 */
228 cur_score_del = vec_max_cols[j].value + gap_score_func(j - vec_max_cols[j].index);
229 #else
230 cur_score_del = -1;
231 for ( k = 1; k < i; ++k )
232 {
233 int cur = matrix [ (i - k)*COLUMNS + j ] + gap_score_func(k);
234 if ( cur > cur_score_del )
235 cur_score_del = cur;
236 }
237 #endif
238
239 #if CACHE_MAX_ROWS != 0
240 /* TODO: incorrect logic: we cache max{matrix[x,y]}
241 instead of max{matrix[x,y] + gap_score_func(v)}
242 when it's fixed we probably will need to make adjustments here
243 */
244 cur_score_ins = vec_max_rows[i].value + gap_score_func(i - vec_max_rows[i].index);;
245 #else
246
247 cur_score_ins = -1;
248 for ( l = 1; l < j; ++l )
249 {
250 int cur = matrix [ i*COLUMNS + (j - l) ] + gap_score_func(l);
251 if ( cur > cur_score_ins )
252 cur_score_ins = cur;
253 }
254 #endif
255 {
256 int score = max4 ( 0,
257 matrix[(i-1)*COLUMNS + j - 1] + sim,
258 cur_score_del,
259 cur_score_ins);
260 matrix[i*COLUMNS + j] = score;
261 if ( max_score != NULL && score > *max_score )
262 {
263 *max_score = score;
264 if ( max_row != NULL )
265 {
266 *max_row = i;
267 }
268 if ( max_col != NULL )
269 {
270 *max_col = j;
271 }
272 }
273 }
274
275 #if CACHE_MAX_ROWS != 0
276 /* TODO: incorrect logic: we cache max{matrix[x,y]}
277 instead of max{matrix[x,y] + gap_score_func(v)}
278 */
279 if ( matrix[i*COLUMNS + j] > vec_max_cols[j].value )
280 {
281 vec_max_cols[j].value = matrix[i*COLUMNS + j];
282 vec_max_cols[j].index = j;
283 }
284 #if GAP_SCORE_LINEAR != 0
285 vec_max_cols[j].value += gap_score_func(1);
286 #endif
287
288 #endif
289 #if CACHE_MAX_ROWS != 0
290 /* TODO: incorrect logic: we cache max{matrix[x,y]}
291 instead of max{matrix[x,y] + gap_score_func(v)}
292 */
293 if ( matrix[i*COLUMNS + j] > vec_max_rows[i].value )
294 {
295 vec_max_rows[i].value = matrix[i*COLUMNS + j];
296 vec_max_rows[i].index = i;
297 }
298 #if GAP_SCORE_LINEAR != 0
299 vec_max_rows[i].value += gap_score_func(1);
300 #endif
301
302 #endif
303 }
304 }
305
306 #if CACHE_MAX_ROWS != 0
307 free (vec_max_cols);
308 free (vec_max_rows);
309 #endif
310
311 return 0;
312 }
313
314 void
sw_find_indel_box(int * matrix,size_t ROWS,size_t COLUMNS,int * ret_row_start,int * ret_row_end,int * ret_col_start,int * ret_col_end)315 sw_find_indel_box ( int* matrix, size_t ROWS, size_t COLUMNS,
316 int* ret_row_start, int* ret_row_end,
317 int* ret_col_start, int* ret_col_end )
318 {
319 size_t max_row = 0, max_col = 0;
320 size_t max_i = ROWS*COLUMNS - 1;
321
322 size_t i = max_i, j;
323 int prev_indel = 0;
324
325 max_row = max_i / COLUMNS;
326 max_col = max_i % COLUMNS;
327
328 *ret_row_start = *ret_row_end = *ret_col_start = *ret_col_end = -1;
329
330 i = max_row;
331 j = max_col;
332
333 /* traceback to (0,0)-th element of the matrix */
334 while (1)
335 {
336 if (i > 0 && j > 0)
337 {
338 /* TODO: ? strong '>' - because we want to prefer indels over matches/mismatches here
339 (expand the window of ambiguity as much as possible)*/
340 if ( matrix [(i - 1)*COLUMNS + (j - 1)] >= matrix [i*COLUMNS + (j - 1)] &&
341 matrix [(i - 1)*COLUMNS + (j - 1)] >= matrix [(i - 1)*COLUMNS + j])
342 {
343 int diag_diff = matrix [i*COLUMNS + j] - matrix [(i - 1)*COLUMNS + (j - 1)];
344 int mismatch = diag_diff == SIMILARITY_MATCH ? 0 : 1;
345
346 if (mismatch && *ret_row_end == -1 )
347 {
348 *ret_row_end = (int)i;
349 *ret_col_end = (int)j;
350 }
351
352 --i;
353 --j;
354
355 if (prev_indel || mismatch)
356 {
357 *ret_row_start = (int)i;
358 *ret_col_start = (int)j;
359 }
360
361 prev_indel = 0;
362 }
363 else if ( matrix [(i - 1)*COLUMNS + (j - 1)] < matrix [i*COLUMNS + (j - 1)] )
364 {
365 if ( *ret_row_end == -1 )
366 {
367 *ret_row_end = (int)i;
368 *ret_col_end = (int)j;
369 }
370 --j;
371 prev_indel = 1;
372 }
373 else
374 {
375 if ( *ret_row_end == -1 )
376 {
377 *ret_row_end = (int)i;
378 *ret_col_end = (int)j;
379 }
380 --i;
381 prev_indel = 1;
382 }
383 }
384 else if ( i > 0 )
385 {
386 if ( *ret_row_end == -1 )
387 {
388 *ret_row_end = (int)i;
389 *ret_col_end = 0;
390 }
391 *ret_row_start = 0;
392 *ret_col_start = 0;
393 break;
394 }
395 else if ( j > 0 )
396 {
397 if ( *ret_row_end == -1 )
398 {
399 *ret_row_end = 0;
400 *ret_col_end = (int)j;
401 }
402 *ret_row_start = 0;
403 *ret_col_start = 0;
404 break;
405 }
406 else
407 {
408 break;
409 }
410 }
411 }
412
413 #if SW_DEBUG_PRINT != 0
414 #include <stdio.h>
print_matrix(int const * matrix,char const * ref_slice,size_t ref_slice_size,char const * query,size_t query_size,bool reverse)415 void print_matrix ( int const* matrix,
416 char const* ref_slice, size_t ref_slice_size,
417 char const* query, size_t query_size,
418 bool reverse )
419 {
420 size_t COLUMNS = ref_slice_size + 1;
421 size_t ROWS = query_size + 1;
422
423 int print_width = 2;
424 size_t i, j;
425
426 printf (" %*c ", print_width, '-');
427 for (j = 1; j < COLUMNS; ++j)
428 printf ("%*c ", print_width, get_char ( ref_slice, ref_slice_size, j-1, reverse ));
429 printf ("\n");
430
431 for (i = 0; i < ROWS; ++i)
432 {
433 if ( i == 0 )
434 printf ("%c ", '-');
435 else
436 printf ("%c ", get_char (query, query_size, i-1, reverse ));
437
438 for (j = 0; j < COLUMNS; ++j)
439 {
440 printf ("%*d ", print_width, matrix[i*COLUMNS + j]);
441 }
442 printf ("\n");
443 }
444 }
445
446 #endif
447
448 /*
449 FindRefVariationBounds uses Smith-Waterman algorithm
450 to find theoretical bounds of the variation for
451 the given reference slice and the query (properly preapared,
452 i.e. containing sequences of bases at the beginning and
453 the end matching the reference)
454
455 ref_slice, ref_slice_size [IN] - the reference slice on which the
456 variation will be looked for
457 query, query_size [IN] - the query that represents the variation placed
458 inside the reference slice
459 ref_start, ref_len [OUT, NULL OK] - the region of ambiguity on the reference
460 have_indel [OUT] - pointer to flag indication if there is an insertion or deletion
461 (1 - there is an indel, 0 - there is match/mismatch only)
462 */
463 static
FindRefVariationBounds(INSDC_dna_text const * ref_slice,size_t ref_slice_size,INSDC_dna_text const * query,size_t query_size,size_t * ref_start,size_t * ref_len,bool * has_indel)464 rc_t FindRefVariationBounds (
465 INSDC_dna_text const* ref_slice, size_t ref_slice_size,
466 INSDC_dna_text const* query, size_t query_size,
467 size_t* ref_start, size_t* ref_len, bool * has_indel
468 )
469 {
470 /* building sw-matrix for chosen reference slice and sequence */
471
472 size_t COLUMNS = ref_slice_size + 1;
473 size_t ROWS = query_size + 1;
474 rc_t rc = 0;
475
476 bool gap_score_constant = ( GAP_SCORE_LINEAR == 0 );
477
478 int row_start, col_start, row_end, col_end;
479 int row_start_rev, col_start_rev, row_end_rev, col_end_rev;
480 int* matrix = malloc( ROWS * COLUMNS * sizeof (int) );
481 if (matrix == NULL)
482 return RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
483 * has_indel = true;
484
485
486 /* forward scan */
487 rc = calculate_similarity_matrix ( query, query_size, ref_slice, ref_slice_size, gap_score_constant, matrix, false, NULL, NULL, NULL );
488 if ( rc != 0 )
489 goto free_resources;
490 #if SW_DEBUG_PRINT != 0
491 print_matrix ( matrix, ref_slice, ref_slice_size, query, query_size, false );
492 #endif
493
494 sw_find_indel_box ( matrix, ROWS, COLUMNS, &row_start, &row_end, &col_start, &col_end );
495 if ( row_start == -1 && row_end == -1 && col_start == -1 && col_end == -1 )
496 {
497 * has_indel = 0;
498 goto free_resources;
499 }
500 #if SW_DEBUG_PRINT != 0
501 printf ("start=(%d, %d), end=(%d, %d)\n", row_start, col_start, row_end, col_end);
502 #endif
503
504 /* reverse scan */
505 rc = calculate_similarity_matrix ( query, query_size, ref_slice, ref_slice_size, gap_score_constant, matrix, true, NULL, NULL, NULL );
506 if ( rc != 0 )
507 goto free_resources;
508 #if SW_DEBUG_PRINT != 0
509 print_matrix ( matrix, ref_slice, ref_slice_size, query, query_size, true );
510 #endif
511
512 sw_find_indel_box ( matrix, ROWS, COLUMNS, &row_start_rev, &row_end_rev, &col_start_rev, &col_end_rev );
513 #if SW_DEBUG_PRINT != 0
514 printf ("start_rev=(%d, %d), end_rev=(%d, %d)\n", row_start_rev, col_start_rev, row_end_rev, col_end_rev);
515 #endif
516 if ( row_start_rev != -1 || row_end_rev != -1 || col_start_rev != -1 || col_end_rev != -1 )
517 {
518 row_start = min ( (int)query_size - row_end_rev - 1, row_start );
519 row_end = max ( (int)query_size - row_start_rev - 1, row_end );
520 col_start = min ( (int)ref_slice_size - col_end_rev - 1, col_start );
521 col_end = max ( (int)ref_slice_size - col_start_rev - 1, col_end );
522 }
523 #if SW_DEBUG_PRINT != 0
524 printf ("COMBINED: start=(%d, %d), end=(%d, %d)\n", row_start, col_start, row_end, col_end);
525 #endif
526
527 if ( ref_start != NULL )
528 *ref_start = col_start + 1;
529 if ( ref_len != NULL )
530 *ref_len = col_end - col_start - 1;
531
532 free_resources:
533 free (matrix);
534
535 return rc;
536 }
537
538 /****************************************************/
539 /* yet another string helper */
540 typedef struct c_string_const
541 {
542 char const* str;
543 size_t size;
544 } c_string_const;
545
c_string_const_assign(c_string_const * self,char const * src,size_t size)546 static void c_string_const_assign ( c_string_const* self, char const* src, size_t size )
547 {
548 self -> str = src;
549 self -> size = size;
550 }
551
552 typedef struct c_string
553 {
554 char* str;
555 size_t size;
556 size_t capacity;
557 } c_string;
558
c_string_make(c_string * self,size_t capacity)559 static int c_string_make ( c_string* self, size_t capacity )
560 {
561 assert ( capacity > 0 );
562 self -> str = malloc (capacity + 1);
563 if ( self -> str != NULL )
564 {
565 self -> str [0] = '\0';
566 self -> size = 0;
567 self -> capacity = capacity;
568 return 1;
569 }
570 else
571 return 0;
572 }
573
c_string_destruct(c_string * self)574 static void c_string_destruct ( c_string* self )
575 {
576 if ( self->str != NULL )
577 {
578 free ( self -> str );
579 self -> str = NULL;
580 self -> size = 0;
581 self -> capacity = 0;
582 }
583 }
584
c_string_realloc_no_preserve(c_string * self,size_t new_capacity)585 static int c_string_realloc_no_preserve ( c_string* self, size_t new_capacity )
586 {
587 if ( self -> capacity < new_capacity )
588 {
589 c_string_destruct ( self );
590
591 return c_string_make ( self, new_capacity );
592 }
593 else
594 {
595 self -> str [0] = '\0';
596 self -> size = 0;
597 }
598
599 return 1;
600 }
601
c_string_assign(c_string * self,char const * src,size_t src_size)602 static int c_string_assign ( c_string* self, char const* src, size_t src_size )
603 {
604 assert ( self->capacity >= src_size );
605 if ( self->capacity < src_size && !c_string_realloc_no_preserve (self, max(self->capacity * 2, src_size)) )
606 return 0;
607
608 memmove ( self -> str, src, src_size );
609 self -> str [src_size] = '\0';
610 self -> size = src_size;
611
612 return 1;
613 }
614
c_string_append(c_string * self,char const * append,size_t append_size)615 static int c_string_append ( c_string* self, char const* append, size_t append_size)
616 {
617 if ( append_size != 0 )
618 {
619 size_t new_size = self->size + append_size;
620 if ( self->capacity >= new_size )
621 {
622 memmove ( self->str + self->size, append, append_size );
623 self->size = new_size;
624 self->str [new_size] = '\0';
625 }
626 else
627 {
628 size_t new_capacity = max (new_size + 1, self->capacity * 2);
629 char* new_str = malloc ( new_capacity );
630 if (new_str == NULL)
631 return 0;
632
633 memmove (new_str, self->str, self->size);
634 memmove (new_str + self->size, append, append_size );
635 new_str [ new_size ] = '\0';
636
637 c_string_destruct ( self );
638
639 self->str = new_str;
640 self->size = new_size;
641 self->capacity = new_capacity;
642 }
643 }
644
645 return 1;
646 }
647
648 #if 0
649 static int c_string_wrap ( c_string* self,
650 char const* prefix, size_t prefix_size,
651 char const* postfix, size_t postfix_size)
652 {
653 assert ( self -> str != NULL );
654 size_t new_size = self->size + prefix_size + postfix_size;
655
656 if ( new_size > self->capacity )
657 {
658 size_t new_capacity = max (new_size + 1, self->capacity * 2);
659 char* new_str = malloc ( new_capacity );
660 if (new_str == NULL)
661 return 0;
662
663 memmove ( new_str, prefix, prefix_size );
664 memmove ( new_str + prefix_size, self -> str, self -> size );
665 memmove ( new_str + prefix_size + self->size, postfix, postfix_size );
666 new_str [ new_size ] = '\0';
667
668 c_string_destruct ( self );
669
670 self->str = new_str;
671 self->size = new_size;
672 self->capacity = new_capacity;
673 }
674 else
675 {
676 memmove ( self->str + prefix_size, self->str, self->size );
677 memmove ( self->str, prefix, prefix_size );
678 memmove (self->str + prefix_size + self->size, postfix, postfix_size );
679 self->str [new_size] = '\0';
680 }
681 return 1;
682 }
683 #endif
684 /************************************************/
685
686
687 /*
688 returns true if a new ref_slice is selected
689 returns false if the new ref_slice is the same as the previous one passed in ref_slice
690 */
691
get_ref_slice(INSDC_dna_text const * ref,size_t ref_size,size_t ref_pos_var,size_t var_len_on_ref,size_t slice_expand_left,size_t slice_expand_right,c_string_const * ref_slice)692 static bool get_ref_slice (
693 INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
694 size_t var_len_on_ref,
695 size_t slice_expand_left, size_t slice_expand_right,
696 c_string_const* ref_slice)
697 {
698 size_t ref_start, ref_xend;
699 if ( ref_pos_var < slice_expand_left )
700 ref_start = 0;
701 else
702 ref_start = ref_pos_var - slice_expand_left;
703
704 if ( ref_pos_var + slice_expand_right + var_len_on_ref >= ref_size )
705 ref_xend = ref_size;
706 else
707 ref_xend = ref_pos_var + slice_expand_right + var_len_on_ref;
708
709 if ( ref_slice->str == ref + ref_start && ref_slice->size == ref_xend - ref_start)
710 return false;
711
712 c_string_const_assign ( ref_slice, ref + ref_start, ref_xend - ref_start );
713 return true;
714 }
715
716 #if 1
make_query(c_string_const const * ref_slice,INSDC_dna_text const * variation,size_t variation_size,size_t var_len_on_ref,int64_t var_start_pos_adj,c_string * query)717 static bool make_query ( c_string_const const* ref_slice,
718 INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
719 int64_t var_start_pos_adj, /* ref_pos adjusted to the beginning of ref_slice (in the simplest case - the middle of ref_slice) */
720 c_string* query
721 )
722 {
723 if ( !c_string_realloc_no_preserve (query, variation_size + ref_slice->size - var_len_on_ref) )
724 return false;
725
726 if ( !c_string_append (query, ref_slice->str, var_start_pos_adj) ||
727 !c_string_append (query, variation, variation_size) ||
728 !c_string_append (query, ref_slice->str + var_start_pos_adj + var_len_on_ref, ref_slice->size - var_start_pos_adj - var_len_on_ref) )
729 {
730 return false;
731 }
732
733 return true;
734 }
735
compose_variation(c_string_const const * ref,size_t ref_start,size_t ref_len,INSDC_dna_text const * query,size_t query_len,int64_t ref_pos_var,size_t var_len_on_ref,c_string * variation,char const ** pallele,size_t * pallele_size)736 static bool compose_variation ( c_string_const const* ref,
737 size_t ref_start, size_t ref_len,
738 INSDC_dna_text const* query, size_t query_len,
739 int64_t ref_pos_var, size_t var_len_on_ref,
740 c_string* variation, char const** pallele, size_t* pallele_size )
741 {
742 bool ret = true;
743
744 size_t ref_end_orig = (size_t)ref_pos_var + var_len_on_ref;
745 size_t ref_end_new = ref_start + ref_len;
746
747 size_t prefix_start = ref_start, prefix_len, query_trim_l;
748 size_t postfix_start = ref_end_orig, postfix_len, query_trim_r;
749
750 size_t query_len_new, var_len;
751
752 size_t allele_expanded_l = 0, allele_expanded_r = 0;
753
754 if ((int64_t)ref_start <= ref_pos_var) /* left bound is expanded */
755 {
756 prefix_len = (size_t)ref_pos_var - ref_start;
757 query_trim_l = 0;
758
759 assert ((int64_t)prefix_len >= 0);
760 }
761 else /* left bound is shrinked */
762 {
763 prefix_len = 0;
764 query_trim_l = ref_start - (size_t)ref_pos_var;
765
766 assert ((int64_t)query_trim_l >= 0);
767 }
768
769 if (ref_end_new >= ref_end_orig) /* right bound is expanded */
770 {
771 postfix_start = ref_end_orig;
772 postfix_len = ref_end_new - ref_end_orig;
773 query_trim_r = 0;
774 }
775 else /* right bound is shrinked */
776 {
777 postfix_start = ref_end_new;
778 postfix_len = 0;
779 query_trim_r = ref_end_orig - ref_end_new;
780 }
781
782 /*
783 special case: pure match/mismatch
784 algorithm gives ref_len = 0, but in this case
785 we want to have variation = input query
786 */
787 if ( ref_len == 0 && query_len == var_len_on_ref )
788 {
789 assert ( prefix_len == 0 );
790 assert ( postfix_len == 0 );
791 assert ( query_trim_l == 0 );
792 assert ( query_trim_r > 0 );
793
794 query_trim_r = 0;
795 }
796 /*
797 special case: deletion
798 we have to create a query whis is the allele
799 expanded one base to the left and to the right
800 on the reference if possible
801 */
802 else if ( var_len_on_ref > query_len )
803 {
804 if ( prefix_start > 0 )
805 {
806 allele_expanded_l = 1;
807 prefix_start -= 1;
808 ++prefix_len;
809 }
810
811 if ( postfix_start + postfix_len + 1 < ref->size)
812 {
813 allele_expanded_r = 1;
814 ++postfix_len;
815 }
816 }
817
818 query_len_new = query_len - query_trim_l - query_trim_r;
819 assert ((int64_t)query_len_new >= 0);
820 var_len = prefix_len + query_len_new + postfix_len;
821
822 if ( var_len > 0 )
823 {
824 /* non-empty variation */
825 if ( !c_string_realloc_no_preserve( variation, var_len ) )
826 ret = false;
827
828 if ( prefix_len > 0 )
829 ret = ret && c_string_assign (variation, ref->str + prefix_start, prefix_len);
830
831 if ( query_len_new > 0 )
832 ret = ret && c_string_append (variation, query + query_trim_l, query_len_new);
833
834 if ( postfix_len > 0 )
835 ret = ret && c_string_append (variation, ref->str + postfix_start, postfix_len);
836
837 if ( ! ret )
838 c_string_destruct ( variation );
839
840 *pallele = variation->str + allele_expanded_l;
841 *pallele_size = variation->size - allele_expanded_l - allele_expanded_r;
842 }
843 else
844 {
845 /* in this case there is no query - don't allocate anything */
846 ret = true;
847 assert ( 0 ); /* since we expand deletions,
848 this code shall not be reached.
849 theoretically it can be reached if
850 reference has length == 0 only */
851 }
852
853 return ret;
854 }
855
856 #endif
857
858 #if 0
859 static bool make_query_ (
860 INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
861 INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
862 size_t slice_expand_left, size_t slice_expand_right,
863 c_string* query
864 )
865 {
866 size_t ref_prefix_start, ref_prefix_len, ref_suffix_start, ref_suffix_len;
867 if ( !c_string_realloc_no_preserve (query, variation_size + slice_expand_left + slice_expand_right + var_len_on_ref) )
868 return false;
869
870 if ( ref_pos_var < slice_expand_left )
871 {
872 ref_prefix_start = 0;
873 ref_prefix_len = slice_expand_left - (ref_pos_var - 1);
874 }
875 else
876 {
877 ref_prefix_start = ref_pos_var - slice_expand_left;
878 ref_prefix_len = slice_expand_left;
879 }
880
881 ref_suffix_start = ref_pos_var + var_len_on_ref;
882
883 if ( ref_suffix_start + slice_expand_right >= ref_size )
884 ref_suffix_len = ref_size - (slice_expand_right + 1);
885 else
886 ref_suffix_len = slice_expand_right;
887
888 if ( !c_string_append (query, ref + ref_prefix_start, ref_prefix_len) ||
889 !c_string_append (query, variation, variation_size) ||
890 !c_string_append (query, ref + ref_suffix_start, ref_suffix_len) )
891 {
892 return false;
893 }
894
895 return true;
896 }
897 #endif
898
899 /*
900 FindRefVariationRegionIUPAC_SW uses Smith-Waterman algorithm
901 to find theoretical bounds of the variation for
902 the given reference, position on the reference
903 and the raw query, or variation to look for at the given
904 reference position
905
906 ref, ref_size [IN] - the reference on which the
907 variation will be looked for
908 ref_pos_var [IN] - the position on reference to look for the variation
909 variation, variation_size [IN] - the variation to look for at the ref_pos_var
910 var_len_on_ref [IN] - the length of the variation on the reference, e.g.:
911 - mismatch, 2 bases: variation = "XY", var_len_on_ref = 2
912 - deletion, 3 bases: variation = "", var_len_on_ref = 3
913 - insertion, 2 bases: variation = "XY", var_len_on_ref = 0
914
915 p_ref_start, p_ref_len [OUT, NULL OK] - the region of ambiguity on the reference
916 (return values)
917 */
918
FindRefVariationRegionIUPAC_SW(INSDC_dna_text const * ref,size_t ref_size,size_t ref_pos_var,INSDC_dna_text const * variation,size_t variation_size,size_t var_len_on_ref,size_t * p_ref_start,size_t * p_ref_len)919 static rc_t CC FindRefVariationRegionIUPAC_SW (
920 INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
921 INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
922 size_t* p_ref_start, size_t* p_ref_len
923 )
924 {
925 rc_t rc = 0;
926
927 size_t var_half_len = 1;/*variation_size / 2 + 1;*/
928
929 size_t exp_l = var_half_len;
930 size_t exp_r = var_half_len;
931
932 /* previous start and end for reference slice */
933 int64_t slice_start = -1, slice_end = -1;
934
935 c_string_const ref_slice;
936 c_string query;
937
938 size_t ref_start = 0, ref_len = 0;
939
940 ref_slice.str = NULL;
941 ref_slice.size = 0;
942
943 if ( !c_string_make ( & query, ( variation_size + 1 ) * 2 ) )
944 return RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
945
946 while ( 1 )
947 {
948 int64_t new_slice_start, new_slice_end;
949 int64_t ref_pos_adj;
950 int cont = 0;
951 bool has_indel = false;
952
953 /* get new expanded slice and check if it has not reached the bounds of ref */
954 bool slice_expanded = get_ref_slice ( ref, ref_size, ref_pos_var, var_len_on_ref, exp_l, exp_r, & ref_slice );
955 if ( !slice_expanded )
956 break;
957
958 /* get ref_pos relative to ref_slice start and new slice_start and end */
959 ref_pos_adj = (int64_t)ref_pos_var - ( ref_slice.str - ref );
960 new_slice_start = ref_slice.str - ref;
961 new_slice_end = new_slice_start + ref_slice.size;
962
963 /* compose a new query for newly extended ref slice */
964 /*if ( !make_query_( ref, ref_size, ref_pos_var, variation, variation_size, var_len_on_ref, exp_l, exp_r, & query ) )*/
965 if ( !make_query ( & ref_slice, variation, variation_size, var_len_on_ref, ref_pos_adj, & query ) )
966 {
967 rc = RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
968 goto free_resources;
969 }
970
971 rc = FindRefVariationBounds ( ref_slice.str, ref_slice.size,
972 query.str, query.size, & ref_start, & ref_len, & has_indel );
973
974 /* if there are no indels report that there is no ambiguity
975 for the given ref_pos_var: region starting at ref_pos_var has length = 0
976 ambiguity
977 */
978 if ( !has_indel )
979 {
980 ref_start = ref_pos_adj;
981 ref_len = 0;
982 }
983
984 if ( rc != 0 )
985 goto free_resources;
986
987 /* if we've found the ambiguity window starting at the very
988 beginning of the ref slice and if we're still able to extend
989 ref slice to the left (haven't bumped into boundary) - extend to the left
990 and repeat the search
991 */
992 if ( ref_start == 0 && (slice_start == -1 || new_slice_start != slice_start ) )
993 {
994 exp_l *= 2;
995 cont = 1;
996 }
997
998 /* if we've found the ambiguity window ending at the very
999 end of the ref slice and if we're still able to extend
1000 ref slice to the right (haven't bumped into boundary) - extend to the right
1001 and repeat the search
1002 */
1003 if ( ref_start + ref_len == ref_slice.size && (slice_end == -1 || new_slice_end != slice_end) )
1004 {
1005 exp_r *= 2;
1006 cont = 1;
1007 }
1008
1009 if ( !cont )
1010 break;
1011
1012 slice_start = new_slice_start;
1013 slice_end = new_slice_end;
1014 }
1015 if ( p_ref_start != NULL )
1016 *p_ref_start = ref_start + (ref_slice.str - ref);
1017 if ( p_ref_len != NULL )
1018 *p_ref_len = ref_len;
1019
1020 free_resources:
1021 c_string_destruct ( &query );
1022
1023 return rc;
1024 }
1025
1026 /*
1027 FindRefVariationRegionIUPAC_RA uses Rolling-bulldozer algorithm
1028 to find theoretical bounds of the variation for
1029 the given reference, position on the reference
1030 and the raw query, or variation to look for at the given
1031 reference position
1032
1033 ref, ref_size [IN] - the reference on which the
1034 variation will be looked for
1035 ref_pos_var [IN] - the position on reference to look for the variation
1036 variation, variation_size [IN] - the variation to look for at the ref_pos_var
1037 var_len_on_ref [IN] - the length of the variation on the reference, e.g.:
1038 - mismatch, 2 bases: variation = "XY", var_len_on_ref = 2
1039 - deletion, 3 bases: variation = "", var_len_on_ref = 3
1040 - insertion, 2 bases: variation = "XY", var_len_on_ref = 0
1041
1042 p_ref_start, p_ref_len [OUT, NULL OK] - the region of ambiguity on the reference
1043 (return values)
1044 */
1045
FindRefVariationRegionIUPAC_RA(INSDC_dna_text const * ref,size_t ref_size,size_t ref_pos_var,INSDC_dna_text const * variation,size_t variation_size,size_t var_len_on_ref,size_t * p_ref_start,size_t * p_ref_len)1046 static rc_t CC FindRefVariationRegionIUPAC_RA (
1047 INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
1048 INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
1049 size_t* p_ref_start, size_t* p_ref_len
1050 )
1051 {
1052 rc_t rc = 0;
1053 size_t del_pos_start, del_pos_xend;
1054 size_t ins_pos_start, ins_pos_xend;
1055
1056 /* Stage 1: trying to expand deletion */
1057
1058 /* expanding to the left */
1059 if (var_len_on_ref > 0)
1060 {
1061 for (del_pos_start = ref_pos_var;
1062 del_pos_start != 0 && ref[del_pos_start-1] == ref[del_pos_start-1 + var_len_on_ref];
1063 --del_pos_start);
1064
1065 /* expanding to the right */
1066 for (del_pos_xend = ref_pos_var + var_len_on_ref;
1067 del_pos_xend < ref_size && ref[del_pos_xend] == ref[del_pos_xend - var_len_on_ref];
1068 ++del_pos_xend);
1069 }
1070 else
1071 {
1072 del_pos_start = ref_pos_var;
1073 del_pos_xend = ref_pos_var;
1074 }
1075
1076 /* Stage 2: trying to expand insertion */
1077
1078 /* expanding to the left */
1079 /* roll first repetition to the left (avoiding % operation) */
1080 if (variation_size > 0)
1081 {
1082 if (del_pos_start > 0)
1083 {
1084 for (ins_pos_start = ref_pos_var; ins_pos_start != 0; --ins_pos_start)
1085 {
1086 size_t pos_in_var = ins_pos_start-1 - ref_pos_var + variation_size;
1087 size_t pos_in_ref = ins_pos_start-1;
1088 if ( (int64_t)pos_in_var == -1l || ref[pos_in_ref] != variation[pos_in_var] )
1089 break;
1090 }
1091 /* roll beyond first repetition (still avoiding %) - now can compare with reference rather than with variation */
1092 for (; ins_pos_start != 0 && ref[ins_pos_start-1] == ref[ins_pos_start-1 + variation_size];
1093 --ins_pos_start);
1094 }
1095 else
1096 ins_pos_start = 0;
1097
1098 /* roll first repetition to the right (avoiding % operation) */
1099 if (del_pos_xend < ref_size)
1100 {
1101 for (ins_pos_xend = ref_pos_var + var_len_on_ref; ins_pos_xend < ref_size; ++ins_pos_xend)
1102 {
1103 size_t pos_in_var = ins_pos_xend - ref_pos_var - var_len_on_ref;
1104 if (pos_in_var == variation_size || ref[ins_pos_xend] != variation[pos_in_var])
1105 break;
1106 }
1107 /* roll beyond first repetition (still avoiding %) - now can compare with reference rather than with variation */
1108 if (ins_pos_xend - ref_pos_var - var_len_on_ref == variation_size)
1109 {
1110 for (; ins_pos_xend < ref_size && ref[ins_pos_xend] == ref[ins_pos_xend - variation_size];
1111 ++ins_pos_xend);
1112 }
1113 }
1114 else
1115 ins_pos_xend = ref_size;
1116 }
1117 else
1118 {
1119 ins_pos_start = ref_pos_var;
1120 ins_pos_xend = ref_pos_var;
1121 }
1122
1123
1124 if (del_pos_start > ins_pos_start)
1125 del_pos_start = ins_pos_start;
1126 if (del_pos_xend < ins_pos_xend)
1127 del_pos_xend = ins_pos_xend;
1128
1129 if ( p_ref_start != NULL )
1130 *p_ref_start = del_pos_start;
1131 if ( p_ref_len != NULL )
1132 *p_ref_len = del_pos_xend - del_pos_start;
1133
1134 return rc;
1135 }
1136
1137 /*
1138 FindRefVariationRegionIUPAC
1139 to find theoretical bounds of the variation for
1140 the given reference, position on the reference
1141 and the raw query, or variation to look for at the given
1142 reference position
1143
1144 alg - algorithm to use for the search (one of RefVarAlg enum)
1145 ref, ref_size [IN] - the reference on which the
1146 variation will be looked for
1147 ref_pos_var [IN] - the position on reference to look for the variation
1148 variation, variation_size [IN] - the variation to look for at the ref_pos_var
1149 var_len_on_ref [IN] - the length of the variation on the reference, e.g.:
1150 - mismatch, 2 bases: variation = "XY", var_len_on_ref = 2
1151 - deletion, 3 bases: variation = "", var_len_on_ref = 3
1152 - insertion, 2 bases: variation = "XY", var_len_on_ref = 0
1153
1154 p_ref_start, p_ref_len [OUT, NULL OK] - the region of ambiguity on the reference
1155 (return values)
1156 */
1157
FindRefVariationRegionIUPAC(RefVarAlg alg,INSDC_dna_text const * ref,size_t ref_size,size_t ref_pos_var,INSDC_dna_text const * variation,size_t variation_size,size_t var_len_on_ref,size_t * p_ref_start,size_t * p_ref_len)1158 static rc_t CC FindRefVariationRegionIUPAC (
1159 RefVarAlg alg, INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
1160 INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
1161 size_t* p_ref_start, size_t* p_ref_len
1162 )
1163 {
1164 switch (alg)
1165 {
1166 case refvarAlgSW:
1167 return FindRefVariationRegionIUPAC_SW ( ref, ref_size, ref_pos_var, variation, variation_size, var_len_on_ref, p_ref_start, p_ref_len );
1168 case refvarAlgRA:
1169 return FindRefVariationRegionIUPAC_RA ( ref, ref_size, ref_pos_var, variation, variation_size, var_len_on_ref, p_ref_start, p_ref_len );
1170 }
1171 return RC ( rcVDB, rcExpression, rcConstructing, rcParam, rcUnrecognized );
1172 }
1173
RefVariationIUPACMake(RefVariation ** obj,INSDC_dna_text const * ref,size_t ref_len,size_t deletion_pos,size_t deletion_len,INSDC_dna_text const * insertion,size_t insertion_len,RefVarAlg alg)1174 rc_t CC RefVariationIUPACMake (RefVariation ** obj,
1175 INSDC_dna_text const* ref, size_t ref_len,
1176 size_t deletion_pos, size_t deletion_len,
1177 INSDC_dna_text const* insertion, size_t insertion_len
1178 #if REF_VAR_ALG
1179 , RefVarAlg alg
1180 #endif
1181 )
1182 {
1183 struct RefVariation* new_obj;
1184 rc_t rc = 0;
1185
1186 if ( ( insertion_len == 0 && deletion_len == 0 )
1187 || ref_len == 0 )
1188 {
1189 return RC (rcText, rcString, rcSearching, rcParam, rcEmpty);
1190 }
1191
1192 if ( (deletion_pos + deletion_len) > ref_len )
1193 {
1194 return RC (rcText, rcString, rcSearching, rcParam, rcOutofrange);
1195 }
1196
1197 assert ( obj != NULL );
1198
1199 new_obj = calloc ( 1, sizeof * new_obj );
1200
1201 if ( new_obj == NULL )
1202 {
1203 rc = RC ( rcVDB, rcExpression, rcConstructing, rcMemory, rcExhausted );
1204 }
1205 else
1206 {
1207 size_t ref_window_start = 0, ref_window_len = 0;
1208 rc = FindRefVariationRegionIUPAC ( alg, ref, ref_len,
1209 deletion_pos,
1210 insertion, insertion_len, deletion_len,
1211 & ref_window_start, & ref_window_len );
1212 if ( rc != 0 )
1213 {
1214 free ( new_obj );
1215 new_obj = NULL;
1216 }
1217 else
1218 {
1219 size_t allele_size = 0;
1220 char const* allele = NULL;
1221
1222 c_string_const ref_str;
1223
1224 c_string var_str;
1225 var_str.capacity = var_str.size = 0;
1226 var_str.str = NULL;
1227
1228 c_string_const_assign ( & ref_str, ref, ref_len );
1229
1230 if ( ! compose_variation ( & ref_str,
1231 ref_window_start, ref_window_len,
1232 insertion, insertion_len,
1233 deletion_pos, deletion_len, & var_str,
1234 & allele, & allele_size ) )
1235 {
1236 rc = RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
1237 free ( new_obj );
1238 new_obj = NULL;
1239 }
1240 else
1241 {
1242 KRefcountInit ( & new_obj->refcount, 1, "RefVariation", "make", "ref-var" );
1243 /* moving var_str to the object (so no need to destruct var_str */
1244
1245 new_obj->var_buffer = var_str.str;
1246 new_obj->var_buffer_size = var_str.size;
1247
1248 new_obj->allele = allele;
1249 new_obj->allele_size = allele_size;
1250
1251 new_obj->allele_start = ref_window_start;
1252 new_obj->allele_len_on_ref = ref_window_len == 0 && insertion_len == deletion_len
1253 ? deletion_len : ref_window_len;
1254 }
1255 }
1256 }
1257
1258 * obj = new_obj;
1259
1260 /* TODO: if Kurt insists, return non-zero rc if var_start == 0 or var_start + var_len == ref_size */
1261 return rc;
1262 }
1263
1264
RefVariationAddRef(RefVariation const * self)1265 rc_t CC RefVariationAddRef ( RefVariation const* self )
1266 {
1267 if ( self != NULL )
1268 {
1269 switch ( KRefcountAdd ( & self -> refcount, "RefVariation" ) )
1270 {
1271 case krefLimit:
1272 return RC ( rcVDB, rcExpression, rcAttaching, rcRange, rcExcessive );
1273 }
1274 }
1275 return 0;
1276 }
1277
RefVariationIUPACWhack(RefVariation * self)1278 static rc_t CC RefVariationIUPACWhack ( RefVariation* self )
1279 {
1280 KRefcountWhack ( & self -> refcount, "RefVariation" );
1281
1282 assert ( self->var_buffer != NULL || self->var_buffer_size == 0 );
1283 if ( self->var_buffer != NULL )
1284 free ( self->var_buffer );
1285
1286 memset ( self, 0, sizeof * self );
1287
1288 free ( self );
1289
1290 return 0;
1291 }
1292
RefVariationRelease(RefVariation const * self)1293 rc_t CC RefVariationRelease ( RefVariation const* self )
1294 {
1295 if ( self != NULL )
1296 {
1297 switch ( KRefcountDrop ( & self -> refcount, "RefVariation" ) )
1298 {
1299 case krefWhack:
1300 return RefVariationIUPACWhack ( ( RefVariation* ) self );
1301 case krefNegative:
1302 return RC ( rcVDB, rcExpression, rcReleasing, rcRange, rcExcessive );
1303 }
1304 }
1305 return 0;
1306 }
1307
RefVariationGetIUPACSearchQuery(RefVariation const * self,INSDC_dna_text const ** query,size_t * query_len,size_t * query_start)1308 rc_t CC RefVariationGetIUPACSearchQuery ( RefVariation const* self,
1309 INSDC_dna_text const ** query, size_t * query_len, size_t * query_start )
1310 {
1311 if ( self == NULL )
1312 return RC ( rcVDB, rcExpression, rcAccessing, rcParam, rcNull );
1313
1314 if ( query != NULL )
1315 * query = self->var_buffer;
1316 if ( query_len != NULL )
1317 * query_len = self->var_buffer_size;
1318 if ( query_start != NULL )
1319 * query_start = self->allele_start - (self->allele - self->var_buffer);
1320
1321 return 0;
1322 }
1323
RefVariationGetSearchQueryLenOnRef(RefVariation const * self,size_t * query_len_on_ref)1324 rc_t CC RefVariationGetSearchQueryLenOnRef ( RefVariation const* self, size_t * query_len_on_ref )
1325 {
1326 if ( self == NULL )
1327 return RC ( rcVDB, rcExpression, rcAccessing, rcParam, rcNull );
1328
1329 if ( query_len_on_ref != NULL )
1330 * query_len_on_ref = self->allele_len_on_ref + self->var_buffer_size - self->allele_size;
1331
1332 return 0;
1333 }
1334
RefVariationGetAllele(RefVariation const * self,INSDC_dna_text const ** allele,size_t * allele_len,size_t * allele_start)1335 rc_t CC RefVariationGetAllele ( RefVariation const* self,
1336 INSDC_dna_text const ** allele, size_t * allele_len, size_t * allele_start )
1337 {
1338 if ( self == NULL )
1339 return RC ( rcVDB, rcExpression, rcAccessing, rcParam, rcNull );
1340
1341 if ( allele != NULL )
1342 * allele = self->allele;
1343 if ( allele_len != NULL )
1344 * allele_len = self->allele_size;
1345 if ( allele_start != NULL )
1346 * allele_start = self->allele_start;
1347
1348 return 0;
1349 }
1350
RefVariationGetAlleleLenOnRef(RefVariation const * self,size_t * allele_len_on_ref)1351 rc_t CC RefVariationGetAlleleLenOnRef ( RefVariation const* self, size_t * allele_len_on_ref )
1352 {
1353 if ( self == NULL )
1354 return RC ( rcVDB, rcExpression, rcAccessing, rcParam, rcNull );
1355
1356 if ( allele_len_on_ref != NULL )
1357 * allele_len_on_ref = self->allele_len_on_ref;
1358
1359 return 0;
1360 }
1361
1362 //////////////// Search-oriented SmithWaterman+
1363
1364 struct SmithWaterman
1365 {
1366 char* query;
1367 size_t query_size;
1368 size_t max_rows;
1369 int* matrix; // originally NULL, grows as needed to hold enough memory for query_size * max_rows
1370 };
1371
SmithWatermanMake(SmithWaterman ** p_self,const char * p_query)1372 LIB_EXPORT rc_t CC SmithWatermanMake( SmithWaterman** p_self, const char* p_query )
1373 {
1374 rc_t rc = 0;
1375
1376 if( p_self != NULL && p_query != NULL )
1377 {
1378 SmithWaterman* ret = malloc ( sizeof ( SmithWaterman ) );
1379 if ( ret != NULL )
1380 {
1381 ret -> query = string_dup_measure ( p_query, & ret -> query_size );
1382 if ( ret -> query != NULL )
1383 {
1384 ret -> max_rows = 0;
1385 ret -> matrix = NULL;
1386 *p_self = ret;
1387 return 0;
1388 }
1389 else
1390 {
1391 rc = RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
1392 }
1393 free ( ret );
1394 }
1395 else
1396 {
1397 rc = RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
1398 }
1399 }
1400 else
1401 {
1402 rc = RC(rcText, rcString, rcSearching, rcParam, rcNull);
1403 }
1404
1405 return rc;
1406 }
1407
1408 LIB_EXPORT void CC
SmithWatermanWhack(SmithWaterman * self)1409 SmithWatermanWhack( SmithWaterman* self )
1410 {
1411 free ( self -> matrix );
1412 free ( self -> query );
1413 free ( self );
1414 }
1415
1416 LIB_EXPORT rc_t CC
SmithWatermanFindFirst(SmithWaterman * p_self,uint32_t p_threshold,const char * p_buf,size_t p_buf_size,SmithWatermanMatch * p_match)1417 SmithWatermanFindFirst( SmithWaterman* p_self, uint32_t p_threshold, const char* p_buf, size_t p_buf_size, SmithWatermanMatch* p_match )
1418 {
1419 rc_t rc = 0;
1420 int score;
1421 size_t max_row;
1422 size_t max_col;
1423
1424 if (p_buf_size == 0)
1425 {
1426 return SILENT_RC(rcText, rcString, rcSearching, rcQuery, rcNotFound);
1427 }
1428
1429 if ( p_buf_size > p_self -> max_rows )
1430 {
1431 /* calculate_similarity_matrix adds a row and a column, adjust matrix dimensions accordingly */
1432 int* new_matrix = realloc ( p_self -> matrix, (p_self->query_size + 1) * (p_buf_size + 1) * sizeof(*p_self->matrix) );
1433 if ( new_matrix == NULL )
1434 { /* p_self -> matrix is unchanged and can be reused */
1435 return RC ( rcText, rcString, rcSearching, rcMemory, rcExhausted );
1436 }
1437 p_self -> max_rows = p_buf_size;
1438 p_self -> matrix = new_matrix;
1439 }
1440 /*TODO: pass threshold into calculate_similarity_matrix, have it stop as soon as the score is sufficient */
1441 rc = calculate_similarity_matrix ( p_buf, p_buf_size, p_self -> query, p_self -> query_size, false, p_self -> matrix, false, &score, &max_row, &max_col );
1442 if ( rc == 0 )
1443 {
1444 if ( p_threshold > p_self->query_size * 2 )
1445 {
1446 p_threshold = p_self->query_size * 2;
1447 }
1448 if ( score >= p_threshold )
1449 {
1450 if ( p_match != NULL )
1451 {
1452 /* walk back from the max score row */
1453 const size_t Columns = p_self->query_size + 1;
1454 int row = max_row;
1455 int col = max_col;
1456 while ( row > 0 && col > 0 )
1457 {
1458 int curr = p_self -> matrix [ row*Columns + col ];
1459 if ( curr == 0 )
1460 {
1461 break;
1462 }
1463 else
1464 {
1465 int left = p_self -> matrix [ row * Columns + (col - 1) ];
1466 int up = p_self -> matrix [ (row - 1)*Columns + col ];
1467 int diag = p_self -> matrix [ (row - 1)*Columns + (col - 1) ];
1468 if ( diag >= left && diag >= up )
1469 {
1470 --row;
1471 --col;
1472 }
1473 else if ( diag < left )
1474 {
1475 --col;
1476 }
1477 else
1478 {
1479 --row;
1480 }
1481 }
1482 }
1483
1484 p_match -> position = row;
1485 p_match -> length = max_row - row;
1486 p_match -> score = score;
1487 }
1488 return 0;
1489 }
1490 rc = SILENT_RC ( rcText, rcString, rcSearching, rcQuery, rcNotFound );
1491 }
1492
1493 return rc;
1494 }
1495
1496
1497