1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include <search/ref-variation.h>
28 #include <search/smith-waterman.h>
29 
30 #include <klib/rc.h>
31 #include <klib/refcount.h>
32 #include <klib/text.h>
33 
34 #include <insdc/insdc.h>
35 
36 #include <stdlib.h>
37 #include <string.h>
38 #include <ctype.h>
39 
40 #if WINDOWS
41 #include <intrin.h>
42 #ifndef __builtin_popcount
43 #define __builtin_popcount __popcnt
44 #endif
45 #endif
46 
47 #include <sysalloc.h>
48 
49 #ifndef min
50 #define min(x,y) ((y) < (x) ? (y) : (x))
51 #endif
52 
53 #ifndef max
54 #define max(x,y) ((y) >= (x) ? (y) : (x))
55 #endif
56 
57 #define max4(x1, x2, x3, x4) (max( max((x1),(x2)), max((x3),(x4)) ))
58 
59 #define COMPARE_4NA 0
60 #define CACHE_MAX_ROWS 0 /* and columns as well */
61 #define GAP_SCORE_LINEAR 0
62 #define SIMILARITY_MATCH 2
63 #define SIMILARITY_MISMATCH -1
64 #define SW_DEBUG_PRINT 0
65 
66 struct RefVariation
67 {
68     KRefcount refcount;
69 
70     INSDC_dna_text* var_buffer; /* in the case of deletion
71         it contains <ref_base_before><allele><ref_base_after>
72         otherwise it contains allele only */
73     INSDC_dna_text const* allele; /* points to allele in the var_buffer */
74     size_t allele_start;
75     size_t var_buffer_size;
76     size_t allele_size;
77     size_t allele_len_on_ref;
78 };
79 
80 
81 #if COMPARE_4NA == 1
82 
83 unsigned char const map_char_to_4na [256] =
84 {
85     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
86     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
87     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
88     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
89     0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
90     0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
91     0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3,15, 0,
92     0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0,
93     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
94     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
98     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
99     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
101 };
102 
compare_4na(INSDC_dna_text ch2na,INSDC_dna_text ch4na)103 static int compare_4na ( INSDC_dna_text ch2na, INSDC_dna_text ch4na )
104 {
105     unsigned char bits4na = map_char_to_4na [(unsigned char)ch4na];
106     unsigned char bits2na = map_char_to_4na [(unsigned char)ch2na];
107 
108     /*return (bits2na & bits4na) != 0 ? 2 : -1;*/
109 
110     unsigned char popcnt4na = (unsigned char) __builtin_popcount ( bits4na );
111 
112     return (bits2na & bits4na) != 0 ? 12 / popcnt4na : -6;
113 }
114 #endif
115 
similarity_func(INSDC_dna_text ch2na,INSDC_dna_text ch4na)116 static int similarity_func (INSDC_dna_text ch2na, INSDC_dna_text ch4na)
117 {
118 #if COMPARE_4NA == 1
119     return compare_4na ( ch2na, ch4na );
120 #else
121     return tolower(ch2na) == tolower(ch4na) ? SIMILARITY_MATCH : SIMILARITY_MISMATCH;
122 #endif
123 }
124 
gap_score_const(size_t idx)125 static int gap_score_const ( size_t idx )
126 {
127     return -1;
128 }
129 
gap_score_linear(size_t idx)130 static int gap_score_linear ( size_t idx )
131 {
132 #if COMPARE_4NA == 1
133     return -6*(int)idx;
134 #else
135     return -(int)idx;
136 #endif
137 }
138 
139 static int (*gap_score_func) (size_t ) =
140 #if GAP_SCORE_LINEAR != 0
141     gap_score_linear
142 #else
143     gap_score_const
144 #endif
145 ;
146 
147 typedef struct ValueIndexPair
148 {
149     size_t index;
150     int value;
151 } ValueIndexPair;
152 
153 
get_char(INSDC_dna_text const * str,size_t size,size_t pos,bool reverse)154 static char get_char (INSDC_dna_text const* str, size_t size, size_t pos, bool reverse)
155 {
156     if ( !reverse )
157         return str [pos];
158     else
159         return str [size - pos - 1];
160 }
161 
calculate_similarity_matrix(INSDC_dna_text const * text,size_t size_text,INSDC_dna_text const * query,size_t size_query,bool gap_score_constant,int * matrix,bool reverse,int * max_score,size_t * max_row,size_t * max_col)162 rc_t calculate_similarity_matrix (
163     INSDC_dna_text const* text, size_t size_text,
164     INSDC_dna_text const* query, size_t size_query,
165     bool gap_score_constant,
166     int* matrix, bool reverse,
167     int* max_score, size_t* max_row, size_t* max_col )
168 {
169 
170     size_t ROWS = size_text + 1;
171     size_t COLUMNS = size_query + 1;
172     size_t i, j;
173 
174     /* arrays to store maximums for all previous rows and columns */
175 #if CACHE_MAX_ROWS != 0
176 
177     ValueIndexPair* vec_max_cols = NULL;
178     ValueIndexPair* vec_max_rows = NULL;
179 
180     vec_max_cols = calloc ( COLUMNS, sizeof vec_max_cols [ 0 ] );
181     if ( vec_max_cols == NULL)
182         return RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
183 
184     vec_max_rows = calloc ( ROWS, sizeof vec_max_rows [ 0 ] );
185     if ( vec_max_rows == NULL)
186     {
187         free (vec_max_cols);
188         return RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
189     }
190 
191 #endif
192     gap_score_func = gap_score_constant ? gap_score_const : gap_score_linear;
193 
194     if ( max_score != NULL )
195     {
196         *max_score = 0;
197     }
198     if ( max_row != NULL )
199     {
200         *max_row = 0;
201     }
202     if ( max_col != NULL )
203     {
204         *max_col = 0;
205     }
206     // init 1st row and column with zeros
207     memset ( matrix, 0, COLUMNS * sizeof(matrix[0]) );
208     for ( i = 1; i < ROWS; ++i )
209         matrix [i * COLUMNS] = 0;
210 
211     for ( i = 1; i < ROWS; ++i )
212     {
213         for ( j = 1; j < COLUMNS; ++j )
214         {
215 #if CACHE_MAX_ROWS == 0
216             size_t k, l;
217 #endif
218             int cur_score_del, cur_score_ins;
219             int sim = similarity_func (
220                             get_char (text, size_text, i-1, reverse),
221                             get_char (query, size_query, j-1, reverse) );
222 
223 #if CACHE_MAX_ROWS != 0
224             /* TODO: incorrect logic: we cache max{matrix[x,y]}
225                 instead of max{matrix[x,y] + gap_score_func(v)}
226                 when it's fixed we probably will need to make adjustments here
227             */
228             cur_score_del = vec_max_cols[j].value + gap_score_func(j - vec_max_cols[j].index);
229 #else
230             cur_score_del = -1;
231             for ( k = 1; k < i; ++k )
232             {
233                 int cur = matrix [ (i - k)*COLUMNS + j ] + gap_score_func(k);
234                 if ( cur > cur_score_del )
235                     cur_score_del = cur;
236             }
237 #endif
238 
239 #if CACHE_MAX_ROWS != 0
240             /* TODO: incorrect logic: we cache max{matrix[x,y]}
241                 instead of max{matrix[x,y] + gap_score_func(v)}
242                 when it's fixed we probably will need to make adjustments here
243             */
244             cur_score_ins = vec_max_rows[i].value + gap_score_func(i - vec_max_rows[i].index);;
245 #else
246 
247             cur_score_ins = -1;
248             for ( l = 1; l < j; ++l )
249             {
250                 int cur = matrix [ i*COLUMNS + (j - l) ] + gap_score_func(l);
251                 if ( cur > cur_score_ins )
252                     cur_score_ins = cur;
253             }
254 #endif
255             {
256                 int score = max4 ( 0,
257                                    matrix[(i-1)*COLUMNS + j - 1] + sim,
258                                    cur_score_del,
259                                    cur_score_ins);
260                 matrix[i*COLUMNS + j] = score;
261                 if ( max_score != NULL && score > *max_score )
262                 {
263                     *max_score = score;
264                     if ( max_row != NULL )
265                     {
266                         *max_row = i;
267                     }
268                     if ( max_col != NULL )
269                     {
270                         *max_col = j;
271                     }
272                 }
273             }
274 
275 #if CACHE_MAX_ROWS != 0
276             /* TODO: incorrect logic: we cache max{matrix[x,y]}
277                 instead of max{matrix[x,y] + gap_score_func(v)}
278             */
279             if ( matrix[i*COLUMNS + j] > vec_max_cols[j].value )
280             {
281                 vec_max_cols[j].value = matrix[i*COLUMNS + j];
282                 vec_max_cols[j].index = j;
283             }
284 #if GAP_SCORE_LINEAR != 0
285             vec_max_cols[j].value += gap_score_func(1);
286 #endif
287 
288 #endif
289 #if CACHE_MAX_ROWS != 0
290             /* TODO: incorrect logic: we cache max{matrix[x,y]}
291                 instead of max{matrix[x,y] + gap_score_func(v)}
292             */
293             if ( matrix[i*COLUMNS + j] > vec_max_rows[i].value )
294             {
295                 vec_max_rows[i].value = matrix[i*COLUMNS + j];
296                 vec_max_rows[i].index = i;
297             }
298 #if GAP_SCORE_LINEAR != 0
299             vec_max_rows[i].value += gap_score_func(1);
300 #endif
301 
302 #endif
303         }
304     }
305 
306 #if CACHE_MAX_ROWS != 0
307     free (vec_max_cols);
308     free (vec_max_rows);
309 #endif
310 
311     return 0;
312 }
313 
314 void
sw_find_indel_box(int * matrix,size_t ROWS,size_t COLUMNS,int * ret_row_start,int * ret_row_end,int * ret_col_start,int * ret_col_end)315 sw_find_indel_box ( int* matrix, size_t ROWS, size_t COLUMNS,
316     int* ret_row_start, int* ret_row_end,
317     int* ret_col_start, int* ret_col_end )
318 {
319     size_t max_row = 0, max_col = 0;
320     size_t max_i = ROWS*COLUMNS - 1;
321 
322     size_t i = max_i, j;
323     int prev_indel = 0;
324 
325     max_row = max_i / COLUMNS;
326     max_col = max_i % COLUMNS;
327 
328     *ret_row_start = *ret_row_end = *ret_col_start = *ret_col_end = -1;
329 
330     i = max_row;
331     j = max_col;
332 
333     /* traceback to (0,0)-th element of the matrix */
334     while (1)
335     {
336         if (i > 0 && j > 0)
337         {
338             /* TODO: ? strong '>' - because we want to prefer indels over matches/mismatches here
339             (expand the window of ambiguity as much as possible)*/
340             if ( matrix [(i - 1)*COLUMNS + (j - 1)] >= matrix [i*COLUMNS + (j - 1)] &&
341                 matrix [(i - 1)*COLUMNS + (j - 1)] >= matrix [(i - 1)*COLUMNS + j])
342             {
343                 int diag_diff = matrix [i*COLUMNS + j] - matrix [(i - 1)*COLUMNS + (j - 1)];
344                 int mismatch = diag_diff == SIMILARITY_MATCH ? 0 : 1;
345 
346                 if (mismatch && *ret_row_end == -1 )
347                 {
348                     *ret_row_end = (int)i;
349                     *ret_col_end = (int)j;
350                 }
351 
352                 --i;
353                 --j;
354 
355                 if (prev_indel || mismatch)
356                 {
357                     *ret_row_start = (int)i;
358                     *ret_col_start = (int)j;
359                 }
360 
361                 prev_indel = 0;
362             }
363             else if ( matrix [(i - 1)*COLUMNS + (j - 1)] < matrix [i*COLUMNS + (j - 1)] )
364             {
365                 if ( *ret_row_end == -1 )
366                 {
367                     *ret_row_end = (int)i;
368                     *ret_col_end = (int)j;
369                 }
370                 --j;
371                 prev_indel = 1;
372             }
373             else
374             {
375                 if ( *ret_row_end == -1 )
376                 {
377                     *ret_row_end = (int)i;
378                     *ret_col_end = (int)j;
379                 }
380                 --i;
381                 prev_indel = 1;
382             }
383         }
384         else if ( i > 0 )
385         {
386             if ( *ret_row_end == -1 )
387             {
388                 *ret_row_end = (int)i;
389                 *ret_col_end = 0;
390             }
391             *ret_row_start = 0;
392             *ret_col_start = 0;
393             break;
394         }
395         else if ( j > 0 )
396         {
397             if ( *ret_row_end == -1 )
398             {
399                 *ret_row_end = 0;
400                 *ret_col_end = (int)j;
401             }
402             *ret_row_start = 0;
403             *ret_col_start = 0;
404             break;
405         }
406         else
407         {
408             break;
409         }
410     }
411 }
412 
413 #if SW_DEBUG_PRINT != 0
414 #include <stdio.h>
print_matrix(int const * matrix,char const * ref_slice,size_t ref_slice_size,char const * query,size_t query_size,bool reverse)415 void print_matrix ( int const* matrix,
416                     char const* ref_slice, size_t ref_slice_size,
417                     char const* query, size_t query_size,
418                     bool reverse )
419 {
420     size_t COLUMNS = ref_slice_size + 1;
421     size_t ROWS = query_size + 1;
422 
423     int print_width = 2;
424     size_t i, j;
425 
426     printf ("  %*c ", print_width, '-');
427     for (j = 1; j < COLUMNS; ++j)
428         printf ("%*c ", print_width, get_char ( ref_slice, ref_slice_size, j-1, reverse ));
429     printf ("\n");
430 
431     for (i = 0; i < ROWS; ++i)
432     {
433         if ( i == 0 )
434             printf ("%c ", '-');
435         else
436             printf ("%c ", get_char (query, query_size, i-1, reverse ));
437 
438         for (j = 0; j < COLUMNS; ++j)
439         {
440             printf ("%*d ", print_width, matrix[i*COLUMNS + j]);
441         }
442         printf ("\n");
443     }
444 }
445 
446 #endif
447 
448 /*
449     FindRefVariationBounds uses Smith-Waterman algorithm
450     to find theoretical bounds of the variation for
451     the given reference slice and the query (properly preapared,
452     i.e. containing sequences of bases at the beginning and
453     the end matching the reference)
454 
455     ref_slice, ref_slice_size [IN] - the reference slice on which the
456                                      variation will be looked for
457     query, query_size [IN] - the query that represents the variation placed
458                              inside the reference slice
459     ref_start, ref_len [OUT, NULL OK] - the region of ambiguity on the reference
460     have_indel [OUT] - pointer to flag indication if there is an insertion or deletion
461                        (1 - there is an indel, 0 - there is match/mismatch only)
462 */
463 static
FindRefVariationBounds(INSDC_dna_text const * ref_slice,size_t ref_slice_size,INSDC_dna_text const * query,size_t query_size,size_t * ref_start,size_t * ref_len,bool * has_indel)464 rc_t FindRefVariationBounds (
465     INSDC_dna_text const* ref_slice, size_t ref_slice_size,
466     INSDC_dna_text const* query, size_t query_size,
467     size_t* ref_start, size_t* ref_len, bool * has_indel
468     )
469 {
470     /* building sw-matrix for chosen reference slice and sequence */
471 
472     size_t COLUMNS = ref_slice_size + 1;
473     size_t ROWS = query_size + 1;
474     rc_t rc = 0;
475 
476     bool gap_score_constant = ( GAP_SCORE_LINEAR == 0 );
477 
478     int row_start, col_start, row_end, col_end;
479     int row_start_rev, col_start_rev, row_end_rev, col_end_rev;
480     int* matrix = malloc( ROWS * COLUMNS * sizeof (int) );
481     if (matrix == NULL)
482         return RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
483     * has_indel = true;
484 
485 
486     /* forward scan */
487     rc = calculate_similarity_matrix ( query, query_size, ref_slice, ref_slice_size, gap_score_constant, matrix, false, NULL, NULL, NULL );
488     if ( rc != 0 )
489         goto free_resources;
490 #if SW_DEBUG_PRINT != 0
491     print_matrix ( matrix, ref_slice, ref_slice_size, query, query_size, false );
492 #endif
493 
494     sw_find_indel_box ( matrix, ROWS, COLUMNS, &row_start, &row_end, &col_start, &col_end );
495     if ( row_start == -1 && row_end == -1 && col_start == -1 && col_end == -1 )
496     {
497         * has_indel = 0;
498         goto free_resources;
499     }
500 #if SW_DEBUG_PRINT != 0
501     printf ("start=(%d, %d), end=(%d, %d)\n", row_start, col_start, row_end, col_end);
502 #endif
503 
504     /* reverse scan */
505     rc = calculate_similarity_matrix ( query, query_size, ref_slice, ref_slice_size, gap_score_constant, matrix, true, NULL, NULL, NULL );
506     if ( rc != 0 )
507         goto free_resources;
508 #if SW_DEBUG_PRINT != 0
509     print_matrix ( matrix, ref_slice, ref_slice_size, query, query_size, true );
510 #endif
511 
512     sw_find_indel_box ( matrix, ROWS, COLUMNS, &row_start_rev, &row_end_rev, &col_start_rev, &col_end_rev );
513 #if SW_DEBUG_PRINT != 0
514     printf ("start_rev=(%d, %d), end_rev=(%d, %d)\n", row_start_rev, col_start_rev, row_end_rev, col_end_rev);
515 #endif
516     if ( row_start_rev != -1 || row_end_rev != -1 || col_start_rev != -1 || col_end_rev != -1 )
517     {
518         row_start = min ( (int)query_size - row_end_rev - 1, row_start );
519         row_end   = max ( (int)query_size - row_start_rev - 1, row_end );
520         col_start = min ( (int)ref_slice_size - col_end_rev - 1, col_start );
521         col_end   = max ( (int)ref_slice_size - col_start_rev - 1, col_end );
522     }
523 #if SW_DEBUG_PRINT != 0
524     printf ("COMBINED: start=(%d, %d), end=(%d, %d)\n", row_start, col_start, row_end, col_end);
525 #endif
526 
527     if ( ref_start != NULL )
528         *ref_start = col_start + 1;
529     if ( ref_len != NULL )
530         *ref_len = col_end - col_start - 1;
531 
532 free_resources:
533     free (matrix);
534 
535     return rc;
536 }
537 
538 /****************************************************/
539 /* yet another string helper */
540 typedef struct c_string_const
541 {
542     char const* str;
543     size_t size;
544 } c_string_const;
545 
c_string_const_assign(c_string_const * self,char const * src,size_t size)546 static void c_string_const_assign ( c_string_const* self, char const* src, size_t size )
547 {
548     self -> str = src;
549     self -> size = size;
550 }
551 
552 typedef struct c_string
553 {
554     char* str;
555     size_t size;
556     size_t capacity;
557 } c_string;
558 
c_string_make(c_string * self,size_t capacity)559 static int c_string_make ( c_string* self, size_t capacity )
560 {
561     assert ( capacity > 0 );
562     self -> str = malloc (capacity + 1);
563     if ( self -> str != NULL )
564     {
565         self -> str [0] = '\0';
566         self -> size = 0;
567         self -> capacity = capacity;
568         return 1;
569     }
570     else
571         return 0;
572 }
573 
c_string_destruct(c_string * self)574 static void c_string_destruct ( c_string* self )
575 {
576     if ( self->str != NULL )
577     {
578         free ( self -> str );
579         self -> str = NULL;
580         self -> size = 0;
581         self -> capacity = 0;
582     }
583 }
584 
c_string_realloc_no_preserve(c_string * self,size_t new_capacity)585 static int c_string_realloc_no_preserve ( c_string* self, size_t new_capacity )
586 {
587     if ( self -> capacity < new_capacity )
588     {
589         c_string_destruct ( self );
590 
591         return c_string_make ( self, new_capacity );
592     }
593     else
594     {
595         self -> str [0] = '\0';
596         self -> size = 0;
597     }
598 
599     return 1;
600 }
601 
c_string_assign(c_string * self,char const * src,size_t src_size)602 static int c_string_assign ( c_string* self, char const* src, size_t src_size )
603 {
604     assert ( self->capacity >= src_size );
605     if ( self->capacity < src_size && !c_string_realloc_no_preserve (self, max(self->capacity * 2, src_size)) )
606         return 0;
607 
608     memmove ( self -> str, src, src_size );
609     self -> str [src_size] = '\0';
610     self -> size = src_size;
611 
612     return 1;
613 }
614 
c_string_append(c_string * self,char const * append,size_t append_size)615 static int c_string_append ( c_string* self, char const* append, size_t append_size)
616 {
617     if ( append_size != 0 )
618     {
619         size_t new_size = self->size + append_size;
620         if ( self->capacity >= new_size )
621         {
622             memmove ( self->str + self->size, append, append_size );
623             self->size = new_size;
624             self->str [new_size] = '\0';
625         }
626         else
627         {
628             size_t new_capacity = max (new_size + 1, self->capacity * 2);
629             char* new_str = malloc ( new_capacity );
630             if (new_str == NULL)
631                 return 0;
632 
633             memmove (new_str, self->str, self->size);
634             memmove (new_str + self->size, append, append_size );
635             new_str [ new_size ] = '\0';
636 
637             c_string_destruct ( self );
638 
639             self->str = new_str;
640             self->size = new_size;
641             self->capacity = new_capacity;
642         }
643     }
644 
645     return 1;
646 }
647 
648 #if 0
649 static int c_string_wrap ( c_string* self,
650     char const* prefix, size_t prefix_size,
651     char const* postfix, size_t postfix_size)
652 {
653     assert ( self -> str != NULL );
654     size_t new_size = self->size + prefix_size + postfix_size;
655 
656     if ( new_size > self->capacity )
657     {
658         size_t new_capacity = max (new_size + 1, self->capacity * 2);
659         char* new_str = malloc ( new_capacity );
660         if (new_str == NULL)
661             return 0;
662 
663         memmove ( new_str, prefix, prefix_size );
664         memmove ( new_str + prefix_size, self -> str, self -> size );
665         memmove ( new_str + prefix_size + self->size, postfix, postfix_size );
666         new_str [ new_size ] = '\0';
667 
668         c_string_destruct ( self );
669 
670         self->str = new_str;
671         self->size = new_size;
672         self->capacity = new_capacity;
673     }
674     else
675     {
676         memmove ( self->str + prefix_size, self->str, self->size );
677         memmove ( self->str, prefix, prefix_size );
678         memmove (self->str + prefix_size + self->size, postfix, postfix_size );
679         self->str [new_size] = '\0';
680     }
681     return 1;
682 }
683 #endif
684 /************************************************/
685 
686 
687 /*
688    returns true if a new ref_slice is selected
689    returns false if the new ref_slice is the same as the previous one passed in ref_slice
690 */
691 
get_ref_slice(INSDC_dna_text const * ref,size_t ref_size,size_t ref_pos_var,size_t var_len_on_ref,size_t slice_expand_left,size_t slice_expand_right,c_string_const * ref_slice)692 static bool get_ref_slice (
693             INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
694             size_t var_len_on_ref,
695             size_t slice_expand_left, size_t slice_expand_right,
696             c_string_const* ref_slice)
697 {
698     size_t ref_start, ref_xend;
699     if ( ref_pos_var < slice_expand_left )
700         ref_start = 0;
701     else
702         ref_start = ref_pos_var - slice_expand_left;
703 
704     if ( ref_pos_var + slice_expand_right + var_len_on_ref >= ref_size )
705         ref_xend = ref_size;
706     else
707         ref_xend = ref_pos_var + slice_expand_right + var_len_on_ref;
708 
709     if ( ref_slice->str == ref + ref_start && ref_slice->size == ref_xend - ref_start)
710         return false;
711 
712     c_string_const_assign ( ref_slice, ref + ref_start, ref_xend - ref_start );
713     return true;
714 }
715 
716 #if 1
make_query(c_string_const const * ref_slice,INSDC_dna_text const * variation,size_t variation_size,size_t var_len_on_ref,int64_t var_start_pos_adj,c_string * query)717 static bool make_query ( c_string_const const* ref_slice,
718         INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
719         int64_t var_start_pos_adj, /* ref_pos adjusted to the beginning of ref_slice (in the simplest case - the middle of ref_slice) */
720         c_string* query
721     )
722 {
723     if ( !c_string_realloc_no_preserve (query, variation_size + ref_slice->size - var_len_on_ref) )
724         return false;
725 
726     if ( !c_string_append (query, ref_slice->str, var_start_pos_adj) ||
727          !c_string_append (query, variation, variation_size) ||
728          !c_string_append (query, ref_slice->str + var_start_pos_adj + var_len_on_ref, ref_slice->size - var_start_pos_adj - var_len_on_ref) )
729     {
730          return false;
731     }
732 
733     return true;
734 }
735 
compose_variation(c_string_const const * ref,size_t ref_start,size_t ref_len,INSDC_dna_text const * query,size_t query_len,int64_t ref_pos_var,size_t var_len_on_ref,c_string * variation,char const ** pallele,size_t * pallele_size)736 static bool compose_variation ( c_string_const const* ref,
737         size_t ref_start, size_t ref_len,
738         INSDC_dna_text const* query, size_t query_len,
739         int64_t ref_pos_var, size_t var_len_on_ref,
740         c_string* variation, char const** pallele, size_t* pallele_size )
741 {
742     bool ret = true;
743 
744     size_t ref_end_orig = (size_t)ref_pos_var + var_len_on_ref;
745     size_t ref_end_new = ref_start + ref_len;
746 
747     size_t prefix_start = ref_start, prefix_len, query_trim_l;
748     size_t postfix_start = ref_end_orig, postfix_len, query_trim_r;
749 
750     size_t query_len_new, var_len;
751 
752     size_t allele_expanded_l = 0, allele_expanded_r = 0;
753 
754     if ((int64_t)ref_start <= ref_pos_var) /* left bound is expanded */
755     {
756         prefix_len = (size_t)ref_pos_var - ref_start;
757         query_trim_l = 0;
758 
759         assert ((int64_t)prefix_len >= 0);
760     }
761     else /* left bound is shrinked */
762     {
763         prefix_len = 0;
764         query_trim_l = ref_start - (size_t)ref_pos_var;
765 
766         assert ((int64_t)query_trim_l >= 0);
767     }
768 
769     if (ref_end_new >= ref_end_orig) /* right bound is expanded */
770     {
771         postfix_start = ref_end_orig;
772         postfix_len = ref_end_new - ref_end_orig;
773         query_trim_r = 0;
774     }
775     else /* right bound is shrinked */
776     {
777         postfix_start = ref_end_new;
778         postfix_len = 0;
779         query_trim_r = ref_end_orig - ref_end_new;
780     }
781 
782     /*
783     special case: pure match/mismatch
784     algorithm gives ref_len = 0, but in this case
785     we want to have variation = input query
786     */
787     if ( ref_len == 0 && query_len == var_len_on_ref )
788     {
789         assert ( prefix_len == 0 );
790         assert ( postfix_len == 0 );
791         assert ( query_trim_l == 0 );
792         assert ( query_trim_r > 0 );
793 
794         query_trim_r = 0;
795     }
796     /*
797     special case: deletion
798     we have to create a query whis is the allele
799     expanded one base to the left and to the right
800     on the reference if possible
801     */
802     else if ( var_len_on_ref > query_len )
803     {
804         if ( prefix_start > 0 )
805         {
806             allele_expanded_l = 1;
807             prefix_start -= 1;
808             ++prefix_len;
809         }
810 
811         if ( postfix_start + postfix_len + 1 < ref->size)
812         {
813             allele_expanded_r = 1;
814             ++postfix_len;
815         }
816     }
817 
818     query_len_new = query_len - query_trim_l - query_trim_r;
819     assert ((int64_t)query_len_new >= 0);
820     var_len = prefix_len + query_len_new + postfix_len;
821 
822     if ( var_len > 0 )
823     {
824         /* non-empty variation */
825         if ( !c_string_realloc_no_preserve( variation, var_len ) )
826             ret = false;
827 
828         if ( prefix_len > 0 )
829             ret = ret && c_string_assign (variation, ref->str + prefix_start, prefix_len);
830 
831         if ( query_len_new > 0 )
832             ret = ret && c_string_append (variation, query + query_trim_l, query_len_new);
833 
834         if ( postfix_len > 0 )
835             ret = ret && c_string_append (variation, ref->str + postfix_start, postfix_len);
836 
837         if ( ! ret )
838             c_string_destruct ( variation );
839 
840         *pallele = variation->str + allele_expanded_l;
841         *pallele_size = variation->size - allele_expanded_l - allele_expanded_r;
842     }
843     else
844     {
845         /* in this case there is no query - don't allocate anything */
846         ret = true;
847         assert ( 0 ); /* since we expand deletions,
848                       this code shall not be reached.
849                       theoretically it can be reached if
850                       reference has length == 0 only */
851     }
852 
853     return ret;
854 }
855 
856 #endif
857 
858 #if 0
859 static bool make_query_ (
860         INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
861         INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
862         size_t slice_expand_left, size_t slice_expand_right,
863         c_string* query
864     )
865 {
866     size_t ref_prefix_start, ref_prefix_len, ref_suffix_start, ref_suffix_len;
867     if ( !c_string_realloc_no_preserve (query, variation_size + slice_expand_left + slice_expand_right + var_len_on_ref) )
868         return false;
869 
870     if ( ref_pos_var < slice_expand_left )
871     {
872         ref_prefix_start = 0;
873         ref_prefix_len = slice_expand_left - (ref_pos_var - 1);
874     }
875     else
876     {
877         ref_prefix_start = ref_pos_var - slice_expand_left;
878         ref_prefix_len = slice_expand_left;
879     }
880 
881     ref_suffix_start = ref_pos_var + var_len_on_ref;
882 
883     if ( ref_suffix_start + slice_expand_right >= ref_size )
884         ref_suffix_len = ref_size - (slice_expand_right + 1);
885     else
886         ref_suffix_len = slice_expand_right;
887 
888     if ( !c_string_append (query, ref + ref_prefix_start, ref_prefix_len) ||
889          !c_string_append (query, variation, variation_size) ||
890          !c_string_append (query, ref + ref_suffix_start, ref_suffix_len) )
891     {
892          return false;
893     }
894 
895     return true;
896 }
897 #endif
898 
899 /*
900     FindRefVariationRegionIUPAC_SW uses Smith-Waterman algorithm
901     to find theoretical bounds of the variation for
902     the given reference, position on the reference
903     and the raw query, or variation to look for at the given
904     reference position
905 
906     ref, ref_size [IN]     - the reference on which the
907                              variation will be looked for
908     ref_pos_var [IN]       - the position on reference to look for the variation
909     variation, variation_size [IN] - the variation to look for at the ref_pos_var
910     var_len_on_ref [IN]    - the length of the variation on the reference, e.g.:
911                            - mismatch, 2 bases: variation = "XY", var_len_on_ref = 2
912                            - deletion, 3 bases: variation = "", var_len_on_ref = 3
913                            - insertion, 2 bases:  variation = "XY", var_len_on_ref = 0
914 
915     p_ref_start, p_ref_len [OUT, NULL OK] - the region of ambiguity on the reference
916                                             (return values)
917 */
918 
FindRefVariationRegionIUPAC_SW(INSDC_dna_text const * ref,size_t ref_size,size_t ref_pos_var,INSDC_dna_text const * variation,size_t variation_size,size_t var_len_on_ref,size_t * p_ref_start,size_t * p_ref_len)919 static rc_t CC FindRefVariationRegionIUPAC_SW (
920         INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
921         INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
922         size_t* p_ref_start, size_t* p_ref_len
923     )
924 {
925     rc_t rc = 0;
926 
927     size_t var_half_len = 1;/*variation_size / 2 + 1;*/
928 
929     size_t exp_l = var_half_len;
930     size_t exp_r = var_half_len;
931 
932     /* previous start and end for reference slice */
933     int64_t slice_start = -1, slice_end = -1;
934 
935     c_string_const ref_slice;
936     c_string query;
937 
938     size_t ref_start = 0, ref_len = 0;
939 
940     ref_slice.str = NULL;
941     ref_slice.size = 0;
942 
943     if ( !c_string_make ( & query, ( variation_size + 1 ) * 2 ) )
944         return RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
945 
946     while ( 1 )
947     {
948         int64_t new_slice_start, new_slice_end;
949         int64_t ref_pos_adj;
950         int cont = 0;
951         bool has_indel = false;
952 
953         /* get new expanded slice and check if it has not reached the bounds of ref */
954         bool slice_expanded = get_ref_slice ( ref, ref_size, ref_pos_var, var_len_on_ref, exp_l, exp_r, & ref_slice );
955         if ( !slice_expanded )
956             break;
957 
958         /* get ref_pos relative to ref_slice start and new slice_start and end */
959         ref_pos_adj = (int64_t)ref_pos_var - ( ref_slice.str - ref );
960         new_slice_start = ref_slice.str - ref;
961         new_slice_end = new_slice_start + ref_slice.size;
962 
963         /* compose a new query for newly extended ref slice */
964         /*if ( !make_query_( ref, ref_size, ref_pos_var, variation, variation_size, var_len_on_ref, exp_l, exp_r, & query ) )*/
965         if ( !make_query ( & ref_slice, variation, variation_size, var_len_on_ref, ref_pos_adj, & query ) )
966         {
967             rc = RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
968             goto free_resources;
969         }
970 
971         rc = FindRefVariationBounds ( ref_slice.str, ref_slice.size,
972                         query.str, query.size, & ref_start, & ref_len, & has_indel );
973 
974         /* if there are no indels report that there is no ambiguity
975            for the given ref_pos_var: region starting at ref_pos_var has length = 0
976            ambiguity
977         */
978         if ( !has_indel )
979         {
980             ref_start = ref_pos_adj;
981             ref_len = 0;
982         }
983 
984         if ( rc != 0 )
985             goto free_resources;
986 
987         /* if we've found the ambiguity window starting at the very
988            beginning of the ref slice and if we're still able to extend
989            ref slice to the left (haven't bumped into boundary) - extend to the left
990            and repeat the search
991         */
992         if ( ref_start == 0 && (slice_start == -1 || new_slice_start != slice_start ) )
993         {
994             exp_l *= 2;
995             cont = 1;
996         }
997 
998         /* if we've found the ambiguity window ending at the very
999            end of the ref slice and if we're still able to extend
1000            ref slice to the right (haven't bumped into boundary) - extend to the right
1001            and repeat the search
1002         */
1003         if ( ref_start + ref_len == ref_slice.size && (slice_end == -1 || new_slice_end != slice_end) )
1004         {
1005             exp_r *= 2;
1006             cont = 1;
1007         }
1008 
1009         if ( !cont )
1010             break;
1011 
1012         slice_start = new_slice_start;
1013         slice_end = new_slice_end;
1014     }
1015     if ( p_ref_start != NULL )
1016         *p_ref_start = ref_start + (ref_slice.str - ref);
1017     if ( p_ref_len != NULL )
1018         *p_ref_len = ref_len;
1019 
1020 free_resources:
1021     c_string_destruct ( &query );
1022 
1023     return rc;
1024 }
1025 
1026 /*
1027     FindRefVariationRegionIUPAC_RA uses Rolling-bulldozer algorithm
1028     to find theoretical bounds of the variation for
1029     the given reference, position on the reference
1030     and the raw query, or variation to look for at the given
1031     reference position
1032 
1033     ref, ref_size [IN]     - the reference on which the
1034                              variation will be looked for
1035     ref_pos_var [IN]       - the position on reference to look for the variation
1036     variation, variation_size [IN] - the variation to look for at the ref_pos_var
1037     var_len_on_ref [IN]    - the length of the variation on the reference, e.g.:
1038                            - mismatch, 2 bases: variation = "XY", var_len_on_ref = 2
1039                            - deletion, 3 bases: variation = "", var_len_on_ref = 3
1040                            - insertion, 2 bases:  variation = "XY", var_len_on_ref = 0
1041 
1042     p_ref_start, p_ref_len [OUT, NULL OK] - the region of ambiguity on the reference
1043                                             (return values)
1044 */
1045 
FindRefVariationRegionIUPAC_RA(INSDC_dna_text const * ref,size_t ref_size,size_t ref_pos_var,INSDC_dna_text const * variation,size_t variation_size,size_t var_len_on_ref,size_t * p_ref_start,size_t * p_ref_len)1046 static rc_t CC FindRefVariationRegionIUPAC_RA (
1047         INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
1048         INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
1049         size_t* p_ref_start, size_t* p_ref_len
1050     )
1051 {
1052     rc_t rc = 0;
1053     size_t del_pos_start, del_pos_xend;
1054     size_t ins_pos_start, ins_pos_xend;
1055 
1056     /* Stage 1: trying to expand deletion */
1057 
1058     /* expanding to the left */
1059     if (var_len_on_ref > 0)
1060     {
1061         for (del_pos_start = ref_pos_var;
1062             del_pos_start != 0 && ref[del_pos_start-1] == ref[del_pos_start-1 + var_len_on_ref];
1063             --del_pos_start);
1064 
1065         /* expanding to the right */
1066         for (del_pos_xend = ref_pos_var + var_len_on_ref;
1067             del_pos_xend < ref_size && ref[del_pos_xend] == ref[del_pos_xend - var_len_on_ref];
1068             ++del_pos_xend);
1069     }
1070     else
1071     {
1072         del_pos_start = ref_pos_var;
1073         del_pos_xend = ref_pos_var;
1074     }
1075 
1076     /* Stage 2: trying to expand insertion */
1077 
1078     /* expanding to the left */
1079     /* roll first repetition to the left (avoiding % operation) */
1080     if (variation_size > 0)
1081     {
1082         if (del_pos_start > 0)
1083         {
1084             for (ins_pos_start = ref_pos_var; ins_pos_start != 0; --ins_pos_start)
1085             {
1086                 size_t pos_in_var = ins_pos_start-1 - ref_pos_var + variation_size;
1087                 size_t pos_in_ref = ins_pos_start-1;
1088                 if ( (int64_t)pos_in_var == -1l || ref[pos_in_ref] != variation[pos_in_var] )
1089                     break;
1090             }
1091             /* roll beyond first repetition (still avoiding %) - now can compare with reference rather than with variation */
1092             for (; ins_pos_start != 0 && ref[ins_pos_start-1] == ref[ins_pos_start-1 + variation_size];
1093                 --ins_pos_start);
1094         }
1095         else
1096             ins_pos_start = 0;
1097 
1098         /* roll first repetition to the right (avoiding % operation) */
1099         if (del_pos_xend < ref_size)
1100         {
1101             for (ins_pos_xend = ref_pos_var + var_len_on_ref; ins_pos_xend < ref_size; ++ins_pos_xend)
1102             {
1103                 size_t pos_in_var = ins_pos_xend - ref_pos_var - var_len_on_ref;
1104                 if (pos_in_var == variation_size || ref[ins_pos_xend] != variation[pos_in_var])
1105                     break;
1106             }
1107             /* roll beyond first repetition (still avoiding %) - now can compare with reference rather than with variation */
1108             if (ins_pos_xend - ref_pos_var - var_len_on_ref == variation_size)
1109             {
1110                 for (; ins_pos_xend < ref_size && ref[ins_pos_xend] == ref[ins_pos_xend - variation_size];
1111                     ++ins_pos_xend);
1112             }
1113         }
1114         else
1115             ins_pos_xend = ref_size;
1116     }
1117     else
1118     {
1119         ins_pos_start = ref_pos_var;
1120         ins_pos_xend = ref_pos_var;
1121     }
1122 
1123 
1124     if (del_pos_start > ins_pos_start)
1125         del_pos_start = ins_pos_start;
1126     if (del_pos_xend < ins_pos_xend)
1127         del_pos_xend = ins_pos_xend;
1128 
1129     if ( p_ref_start != NULL )
1130         *p_ref_start = del_pos_start;
1131     if ( p_ref_len != NULL )
1132         *p_ref_len = del_pos_xend - del_pos_start;
1133 
1134     return rc;
1135 }
1136 
1137 /*
1138     FindRefVariationRegionIUPAC
1139     to find theoretical bounds of the variation for
1140     the given reference, position on the reference
1141     and the raw query, or variation to look for at the given
1142     reference position
1143 
1144     alg                    - algorithm to use for the search (one of RefVarAlg enum)
1145     ref, ref_size [IN]     - the reference on which the
1146                              variation will be looked for
1147     ref_pos_var [IN]       - the position on reference to look for the variation
1148     variation, variation_size [IN] - the variation to look for at the ref_pos_var
1149     var_len_on_ref [IN]    - the length of the variation on the reference, e.g.:
1150                            - mismatch, 2 bases: variation = "XY", var_len_on_ref = 2
1151                            - deletion, 3 bases: variation = "", var_len_on_ref = 3
1152                            - insertion, 2 bases:  variation = "XY", var_len_on_ref = 0
1153 
1154     p_ref_start, p_ref_len [OUT, NULL OK] - the region of ambiguity on the reference
1155                                             (return values)
1156 */
1157 
FindRefVariationRegionIUPAC(RefVarAlg alg,INSDC_dna_text const * ref,size_t ref_size,size_t ref_pos_var,INSDC_dna_text const * variation,size_t variation_size,size_t var_len_on_ref,size_t * p_ref_start,size_t * p_ref_len)1158 static rc_t CC FindRefVariationRegionIUPAC (
1159         RefVarAlg alg, INSDC_dna_text const* ref, size_t ref_size, size_t ref_pos_var,
1160         INSDC_dna_text const* variation, size_t variation_size, size_t var_len_on_ref,
1161         size_t* p_ref_start, size_t* p_ref_len
1162     )
1163 {
1164     switch (alg)
1165     {
1166     case refvarAlgSW:
1167         return FindRefVariationRegionIUPAC_SW ( ref, ref_size, ref_pos_var, variation, variation_size, var_len_on_ref, p_ref_start, p_ref_len );
1168     case refvarAlgRA:
1169         return FindRefVariationRegionIUPAC_RA ( ref, ref_size, ref_pos_var, variation, variation_size, var_len_on_ref, p_ref_start, p_ref_len );
1170     }
1171     return RC ( rcVDB, rcExpression, rcConstructing, rcParam, rcUnrecognized );
1172 }
1173 
RefVariationIUPACMake(RefVariation ** obj,INSDC_dna_text const * ref,size_t ref_len,size_t deletion_pos,size_t deletion_len,INSDC_dna_text const * insertion,size_t insertion_len,RefVarAlg alg)1174 rc_t CC RefVariationIUPACMake (RefVariation ** obj,
1175         INSDC_dna_text const* ref, size_t ref_len,
1176         size_t deletion_pos, size_t deletion_len,
1177         INSDC_dna_text const* insertion, size_t insertion_len
1178 #if REF_VAR_ALG
1179         , RefVarAlg alg
1180 #endif
1181     )
1182 {
1183     struct RefVariation* new_obj;
1184     rc_t rc = 0;
1185 
1186     if ( ( insertion_len == 0 && deletion_len == 0 )
1187         || ref_len == 0 )
1188     {
1189         return RC (rcText, rcString, rcSearching, rcParam, rcEmpty);
1190     }
1191 
1192     if ( (deletion_pos + deletion_len) > ref_len )
1193     {
1194         return RC (rcText, rcString, rcSearching, rcParam, rcOutofrange);
1195     }
1196 
1197     assert ( obj != NULL );
1198 
1199     new_obj = calloc ( 1, sizeof * new_obj );
1200 
1201     if ( new_obj == NULL )
1202     {
1203         rc = RC ( rcVDB, rcExpression, rcConstructing, rcMemory, rcExhausted );
1204     }
1205     else
1206     {
1207         size_t ref_window_start = 0, ref_window_len = 0;
1208         rc = FindRefVariationRegionIUPAC ( alg, ref, ref_len,
1209                                            deletion_pos,
1210                                            insertion, insertion_len, deletion_len,
1211                                            & ref_window_start, & ref_window_len );
1212         if ( rc != 0 )
1213         {
1214             free ( new_obj );
1215             new_obj = NULL;
1216         }
1217         else
1218         {
1219             size_t allele_size = 0;
1220             char const* allele = NULL;
1221 
1222             c_string_const ref_str;
1223 
1224             c_string var_str;
1225             var_str.capacity = var_str.size = 0;
1226             var_str.str = NULL;
1227 
1228             c_string_const_assign ( & ref_str, ref, ref_len );
1229 
1230             if ( ! compose_variation ( & ref_str,
1231                                        ref_window_start, ref_window_len,
1232                                        insertion, insertion_len,
1233                                        deletion_pos, deletion_len, & var_str,
1234                                        & allele, & allele_size ) )
1235             {
1236                 rc = RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
1237                 free ( new_obj );
1238                 new_obj = NULL;
1239             }
1240             else
1241             {
1242                 KRefcountInit ( & new_obj->refcount, 1, "RefVariation", "make", "ref-var" );
1243                 /* moving var_str to the object (so no need to destruct var_str */
1244 
1245                 new_obj->var_buffer = var_str.str;
1246                 new_obj->var_buffer_size = var_str.size;
1247 
1248                 new_obj->allele = allele;
1249                 new_obj->allele_size = allele_size;
1250 
1251                 new_obj->allele_start = ref_window_start;
1252                 new_obj->allele_len_on_ref = ref_window_len == 0 && insertion_len == deletion_len
1253                     ? deletion_len : ref_window_len;
1254             }
1255         }
1256     }
1257 
1258     * obj = new_obj;
1259 
1260     /* TODO: if Kurt insists, return non-zero rc if var_start == 0 or var_start + var_len == ref_size */
1261     return rc;
1262 }
1263 
1264 
RefVariationAddRef(RefVariation const * self)1265 rc_t CC RefVariationAddRef ( RefVariation const* self )
1266 {
1267     if ( self != NULL )
1268     {
1269         switch ( KRefcountAdd ( & self -> refcount, "RefVariation" ) )
1270         {
1271         case krefLimit:
1272             return RC ( rcVDB, rcExpression, rcAttaching, rcRange, rcExcessive );
1273         }
1274     }
1275     return 0;
1276 }
1277 
RefVariationIUPACWhack(RefVariation * self)1278 static rc_t CC RefVariationIUPACWhack ( RefVariation* self )
1279 {
1280     KRefcountWhack ( & self -> refcount, "RefVariation" );
1281 
1282     assert ( self->var_buffer != NULL || self->var_buffer_size == 0 );
1283     if ( self->var_buffer != NULL )
1284         free ( self->var_buffer );
1285 
1286     memset ( self, 0, sizeof * self );
1287 
1288     free ( self );
1289 
1290     return 0;
1291 }
1292 
RefVariationRelease(RefVariation const * self)1293 rc_t CC RefVariationRelease ( RefVariation const* self )
1294 {
1295     if ( self != NULL )
1296     {
1297         switch ( KRefcountDrop ( & self -> refcount, "RefVariation" ) )
1298         {
1299         case krefWhack:
1300             return RefVariationIUPACWhack ( ( RefVariation* ) self );
1301         case krefNegative:
1302             return RC ( rcVDB, rcExpression, rcReleasing, rcRange, rcExcessive );
1303         }
1304     }
1305     return 0;
1306 }
1307 
RefVariationGetIUPACSearchQuery(RefVariation const * self,INSDC_dna_text const ** query,size_t * query_len,size_t * query_start)1308 rc_t CC RefVariationGetIUPACSearchQuery ( RefVariation const* self,
1309     INSDC_dna_text const ** query, size_t * query_len, size_t * query_start )
1310 {
1311     if ( self == NULL )
1312         return RC ( rcVDB, rcExpression, rcAccessing, rcParam, rcNull );
1313 
1314     if ( query != NULL )
1315         * query = self->var_buffer;
1316     if ( query_len != NULL )
1317         * query_len = self->var_buffer_size;
1318     if ( query_start != NULL )
1319         * query_start = self->allele_start - (self->allele - self->var_buffer);
1320 
1321     return 0;
1322 }
1323 
RefVariationGetSearchQueryLenOnRef(RefVariation const * self,size_t * query_len_on_ref)1324 rc_t CC RefVariationGetSearchQueryLenOnRef ( RefVariation const* self, size_t * query_len_on_ref )
1325 {
1326     if ( self == NULL )
1327         return RC ( rcVDB, rcExpression, rcAccessing, rcParam, rcNull );
1328 
1329     if ( query_len_on_ref != NULL )
1330         * query_len_on_ref = self->allele_len_on_ref + self->var_buffer_size - self->allele_size;
1331 
1332     return 0;
1333 }
1334 
RefVariationGetAllele(RefVariation const * self,INSDC_dna_text const ** allele,size_t * allele_len,size_t * allele_start)1335 rc_t CC RefVariationGetAllele ( RefVariation const* self,
1336     INSDC_dna_text const ** allele, size_t * allele_len, size_t * allele_start )
1337 {
1338     if ( self == NULL )
1339         return RC ( rcVDB, rcExpression, rcAccessing, rcParam, rcNull );
1340 
1341     if ( allele != NULL )
1342         * allele = self->allele;
1343     if ( allele_len != NULL )
1344         * allele_len = self->allele_size;
1345     if ( allele_start != NULL )
1346         * allele_start = self->allele_start;
1347 
1348     return 0;
1349 }
1350 
RefVariationGetAlleleLenOnRef(RefVariation const * self,size_t * allele_len_on_ref)1351 rc_t CC RefVariationGetAlleleLenOnRef ( RefVariation const* self, size_t * allele_len_on_ref )
1352 {
1353     if ( self == NULL )
1354         return RC ( rcVDB, rcExpression, rcAccessing, rcParam, rcNull );
1355 
1356     if ( allele_len_on_ref != NULL )
1357         * allele_len_on_ref = self->allele_len_on_ref;
1358 
1359     return 0;
1360 }
1361 
1362 //////////////// Search-oriented SmithWaterman+
1363 
1364 struct SmithWaterman
1365 {
1366     char*   query;
1367     size_t  query_size;
1368     size_t  max_rows;
1369     int*    matrix; // originally NULL, grows as needed to hold enough memory for query_size * max_rows
1370 };
1371 
SmithWatermanMake(SmithWaterman ** p_self,const char * p_query)1372 LIB_EXPORT rc_t CC SmithWatermanMake( SmithWaterman** p_self, const char* p_query )
1373 {
1374     rc_t rc = 0;
1375 
1376     if( p_self != NULL && p_query != NULL )
1377     {
1378         SmithWaterman* ret = malloc ( sizeof ( SmithWaterman ) );
1379         if ( ret != NULL )
1380         {
1381             ret -> query = string_dup_measure ( p_query, & ret -> query_size );
1382             if ( ret -> query != NULL )
1383             {
1384                 ret -> max_rows = 0;
1385                 ret -> matrix = NULL;
1386                 *p_self = ret;
1387                 return 0;
1388             }
1389             else
1390             {
1391                 rc = RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
1392             }
1393             free ( ret );
1394         }
1395         else
1396         {
1397             rc = RC(rcText, rcString, rcSearching, rcMemory, rcExhausted);
1398         }
1399     }
1400     else
1401     {
1402         rc = RC(rcText, rcString, rcSearching, rcParam, rcNull);
1403     }
1404 
1405     return rc;
1406 }
1407 
1408 LIB_EXPORT void CC
SmithWatermanWhack(SmithWaterman * self)1409 SmithWatermanWhack( SmithWaterman* self )
1410 {
1411     free ( self -> matrix  );
1412     free ( self -> query );
1413     free ( self );
1414 }
1415 
1416 LIB_EXPORT rc_t CC
SmithWatermanFindFirst(SmithWaterman * p_self,uint32_t p_threshold,const char * p_buf,size_t p_buf_size,SmithWatermanMatch * p_match)1417 SmithWatermanFindFirst( SmithWaterman* p_self, uint32_t p_threshold, const char* p_buf, size_t p_buf_size, SmithWatermanMatch* p_match )
1418 {
1419     rc_t rc = 0;
1420     int score;
1421     size_t max_row;
1422     size_t max_col;
1423 
1424     if (p_buf_size == 0)
1425     {
1426         return SILENT_RC(rcText, rcString, rcSearching, rcQuery, rcNotFound);
1427     }
1428 
1429     if ( p_buf_size > p_self -> max_rows )
1430     {
1431         /* calculate_similarity_matrix adds a row and a column, adjust matrix dimensions accordingly */
1432         int* new_matrix = realloc ( p_self -> matrix, (p_self->query_size + 1) * (p_buf_size + 1) * sizeof(*p_self->matrix) );
1433         if ( new_matrix == NULL )
1434         {   /* p_self -> matrix is unchanged and can be reused */
1435             return RC ( rcText, rcString, rcSearching, rcMemory, rcExhausted );
1436         }
1437         p_self -> max_rows = p_buf_size;
1438         p_self -> matrix = new_matrix;
1439     }
1440     /*TODO: pass threshold into calculate_similarity_matrix, have it stop as soon as the score is sufficient */
1441     rc = calculate_similarity_matrix ( p_buf, p_buf_size, p_self -> query, p_self -> query_size, false, p_self -> matrix, false, &score, &max_row, &max_col );
1442     if ( rc == 0 )
1443     {
1444         if ( p_threshold > p_self->query_size * 2 )
1445         {
1446             p_threshold = p_self->query_size * 2;
1447         }
1448         if ( score >= p_threshold )
1449         {
1450             if ( p_match != NULL )
1451             {
1452                 /* walk back from the max score row */
1453                 const size_t Columns = p_self->query_size + 1;
1454                 int row = max_row;
1455                 int col = max_col;
1456                 while ( row > 0 && col > 0 )
1457                 {
1458                     int curr = p_self -> matrix [ row*Columns + col ];
1459                     if ( curr == 0 )
1460                     {
1461                         break;
1462                     }
1463 					else
1464 					{
1465 						int left = p_self -> matrix [ row * Columns + (col - 1) ];
1466 						int up   = p_self -> matrix [ (row - 1)*Columns + col ];
1467 						int diag = p_self -> matrix [ (row - 1)*Columns + (col - 1) ];
1468 						if ( diag >= left && diag >= up )
1469 						{
1470 							--row;
1471 							--col;
1472 						}
1473 						else if ( diag < left )
1474 						{
1475 							--col;
1476 						}
1477 						else
1478 						{
1479 							--row;
1480 						}
1481 					}
1482                 }
1483 
1484                 p_match -> position = row;
1485                 p_match -> length = max_row - row;
1486                 p_match -> score = score;
1487             }
1488             return 0;
1489         }
1490         rc = SILENT_RC ( rcText, rcString, rcSearching, rcQuery, rcNotFound );
1491     }
1492 
1493     return rc;
1494 }
1495 
1496 
1497