1 /*===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  */
26 #include <vdb/extern.h>
27 
28 #include <klib/defs.h>
29 #include <klib/rc.h>
30 #include <vdb/table.h>
31 #include <vdb/xform.h>
32 #include <vdb/schema.h>
33 #include <kdb/meta.h>
34 #include <klib/data-buffer.h>
35 #include <bitstr.h>
36 #include <sysalloc.h>
37 #include <klib/printf.h>
38 
39 #include <klib/out.h>
40 
41 #include <align/align.h>
42 
43 #include <stdint.h>
44 #include <stdlib.h>
45 #include <assert.h>
46 #include <string.h>
47 #include <stdio.h>
48 #include <ctype.h>
49 #include <os-native.h>
50 
51 #include <insdc/insdc.h>
52 
53 #define ARG_BASE(TYPE, N) (((TYPE const *)argv[(N)].u.data.base) + argv[(N)].u.data.first_elem)
54 #define ARG_ALIAS(TYPE, NAME, N) TYPE const *const NAME = ARG_BASE(TYPE, N)
55 #define ARG_ALIAS_COND(TYPE, NAME, N, ALT) TYPE const *const NAME = ((argc > (N)) ? ARG_BASE(TYPE, N) : ALT)
56 
57 typedef struct {
58     int version;
59 } self_t;
60 
61 static
op2b(KDataBuffer * dst,unsigned const offset,unsigned * const count,int const opcode,unsigned oplen)62 rc_t op2b( KDataBuffer *dst, unsigned const offset, unsigned *const count, int const opcode, unsigned oplen )
63 {
64     unsigned digits = 1;
65     unsigned scale = 10;
66 
67     if ( oplen == 0 )
68     {
69         *count = 0;
70         return 0;
71     }
72 
73     while ( scale < oplen )
74     {
75         scale *= 10;
76         ++digits;
77     }
78 
79     if ( scale == oplen ) /* oplen is whole power of 10 */
80         ++digits;
81 
82     *count = digits + 1;
83 
84     if ( dst != NULL )
85     {
86         unsigned const need = offset + digits + 1;
87 
88         if ( need > dst->elem_count )
89         {
90             rc_t rc = KDataBufferResize( dst, need );
91             if ( rc != 0 ) return rc;
92         }
93 
94         {
95             char *const base = &( ( char * )dst->base )[ offset ];
96 
97             base[ digits ] = opcode;
98             do
99             {
100                 unsigned const digit = oplen % 10;
101 
102                 oplen /= 10;
103                 base[ --digits ] = digit + '0';
104             } while ( digits );
105         }
106     }
107     return 0;
108 }
109 
110 
111 static
cigar_string(KDataBuffer * dst,size_t boff,INSDC_coord_len * const bsize,bool const full,bool const has_mismatch[],bool const has_ref_offset[],INSDC_coord_zero const read_start,INSDC_coord_zero const read_end,int32_t const ref_offset[],unsigned const ro_len,unsigned * ro_offset)112 rc_t cigar_string(KDataBuffer *dst, size_t boff,
113                   INSDC_coord_len *const bsize, bool const full,
114                   bool const has_mismatch[], bool const has_ref_offset[],
115                   INSDC_coord_zero const read_start, INSDC_coord_zero const read_end,
116                   int32_t const ref_offset[], unsigned const ro_len, unsigned *ro_offset)
117 {
118     size_t bsz = 0;
119     unsigned nwrit;
120     uint32_t i, m, mm;
121     rc_t rc;
122     unsigned cur_off = ro_offset ? *ro_offset : 0;
123 
124 #define BUF_WRITE(OP, LEN) if ((rc = op2b(dst, (unsigned const)(bsz + boff), &nwrit, (int const)(OP), (unsigned)(LEN))) != 0) return rc; bsz += nwrit;
125 
126 #define MACRO_FLUSH_MATCH    { BUF_WRITE('=', m); m = 0; }
127 #define MACRO_FLUSH_MISMATCH { BUF_WRITE(i == read_end ? 'S' : 'X', mm); mm = 0; }
128 
129 #define MACRO_FLUSH_BOTH \
130 if(m+mm > 0) { \
131     if(i==read_end && has_ref_offset[i]) { \
132         BUF_WRITE('M', m); BUF_WRITE('S', mm); \
133     } else { \
134         BUF_WRITE('M', m + mm) \
135     } \
136     m=mm=0; \
137 }
138 
139 #define MACRO_FLUSH \
140 if(full){ \
141     MACRO_FLUSH_MATCH; \
142     MACRO_FLUSH_MISMATCH; \
143 } else { \
144     MACRO_FLUSH_BOTH; \
145 }
146 
147     for( i = read_start, bsz = m = mm = 0; i < (uint32_t)read_end; i++ )
148     {
149         if( has_ref_offset[ i ] ) /*** No offset in the reference **/
150         {
151             int32_t offset;
152 
153             if ( cur_off >= ro_len ) /*** bad data ***/
154                 return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
155 
156             offset = ref_offset[ cur_off++ ];
157 
158             if( offset > 0 ) /*** insert in the reference, delete in sequence ***/
159             {
160                 if ( i == 0 ) /**** deletes in the beginning are disallowed, REF_START should have been moved and delete converted to insert **/
161                     return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
162                 MACRO_FLUSH;
163                 BUF_WRITE( 'D', offset );
164             }
165             else if ( offset < 0 ) /**** delete from the reference ***/
166             {
167                 offset = -offset;
168                 if ( i + offset > (uint32_t)read_end )
169                     return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
170                 if ( i > 0 ) /** normally indels are before the current base ***/
171                 {
172                     MACRO_FLUSH;
173                     BUF_WRITE( 'I', offset );
174                 }
175                 else
176                 { /***  this  is a soft clip at the beginning ***/
177                     BUF_WRITE( 'S', offset );
178                 }
179                 i += offset;
180             }
181             else
182             { /*** Not possible ??? ***/
183                 return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
184             }
185         }
186 
187         if ( i < (uint32_t)read_end )
188         {
189             if ( has_mismatch[ i ] )
190             {
191                 if ( full )
192                 {
193                     MACRO_FLUSH_MATCH;
194                 }
195                 mm++;
196             }
197             else
198             {
199                 if ( full )
200                 {
201                     MACRO_FLUSH_MISMATCH;
202                 }
203                 else
204                 {
205                     m += mm;
206                     mm = 0;
207                 }
208                 m++;
209             }
210         }
211     }
212     MACRO_FLUSH;
213     *bsize = (INSDC_coord_len)bsz;
214     if ( ro_offset )
215         *ro_offset = cur_off;
216     return 0;
217 
218 #undef BUF_WRITE
219 #undef MACRO_FLUSH_MATCH
220 #undef MACRO_FLUSH_MISMATCH
221 #undef MACRO_FLUSH_BOTH
222 #undef MACRO_FLUSH
223 }
224 
225 typedef struct {
226     int opcode_M, opcode_X, opcode_S;
227 } cigar_opcode_options_t;
228 
229 static
cigar_string_2_0(KDataBuffer * dst,size_t boff,INSDC_coord_len * const bsize,bool const has_mismatch[],bool const has_ref_offset[],INSDC_coord_zero const read_start,INSDC_coord_zero const read_end,int32_t const ref_offset[],unsigned const ro_len,unsigned ro_offset[],unsigned const reflen,cigar_opcode_options_t const * ops)230 rc_t cigar_string_2_0(KDataBuffer *dst,
231                       size_t boff,
232                       INSDC_coord_len *const bsize,
233                       bool const has_mismatch[],
234                       bool const has_ref_offset[],
235                       INSDC_coord_zero const read_start,
236                       INSDC_coord_zero const read_end,
237                       int32_t const ref_offset[],
238                       unsigned const ro_len,
239                       unsigned ro_offset[],
240                       unsigned const reflen,
241                       cigar_opcode_options_t const *ops)
242 {
243     int ri;
244     unsigned si;
245     unsigned di;
246     rc_t rc;
247     unsigned nwrit;
248     unsigned cur_off = ro_offset ? *ro_offset : 0;
249     unsigned op_len;
250     int opcode;
251 
252 #define BUF_WRITE(OP, LEN) { if ((rc = op2b(dst, (unsigned const)(di + boff), &nwrit, (int const)(OP), (unsigned)(LEN))) != 0) return rc; di += nwrit; }
253     assert ( read_start >= 0 );
254     si = read_start;
255     if ( /* !use_S && */ read_start == read_end && reflen > 0 ) /** full delete as a last ploidy ends up written nowhere  **/
256     {
257         di=0;
258         opcode = 'D';
259         op_len = reflen;
260     }
261     else for ( op_len = di = 0, opcode = ri = 0; si < ( unsigned )read_end && ri <= ( int )reflen; )
262     {
263         if ( has_ref_offset[ si ] )
264         {
265             int offs;
266 
267             if ( op_len > 0 )
268             {
269                 BUF_WRITE( opcode, op_len );
270                 op_len = 0;
271             }
272             if ( cur_off >= ro_len ) /*** bad data ***/
273                 return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
274 
275             offs = ref_offset[ cur_off++ ];
276             if ( offs < 0 )
277             {
278                 unsigned j;
279                 for ( j = 1; j < (unsigned)-offs && ( si + j ) < ( unsigned )read_end; )
280                 {
281                     if ( has_ref_offset[ si + j ] ) /*** structured insert **/
282                     {
283                         BUF_WRITE( si ? 'I' : ops->opcode_S, j );
284                         offs += j;
285                         si   += j;
286                         j = 1;
287                     }
288                     else
289                     {
290                         j++;
291                     }
292                 }
293                 if ( offs < 0 )
294                 {
295                     BUF_WRITE( si ? 'I' : ops->opcode_S, -offs );
296                     si -= offs;
297                 }
298                 continue;
299             }
300             else if ( offs > 0 )
301             {
302                 BUF_WRITE( 'D', offs );
303                 ri += offs;
304             }
305             else
306                 return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
307         }
308 
309         if ( ri < ( int )reflen )
310         {
311             int const op_nxt = ( has_mismatch[ si ] ? ops->opcode_X : ops->opcode_M );
312 
313             if ( op_len == 0 || opcode == op_nxt )
314                 ++op_len;
315             else
316             {
317                 BUF_WRITE( opcode, op_len );
318                 op_len = 1;
319             }
320             opcode = op_nxt;
321         }
322         else
323             break;
324         ++si;
325         ++ri;
326     }
327 
328     BUF_WRITE( opcode, op_len );
329     if ( si < (unsigned)read_end )
330     {
331         if ( cur_off + 1 < ro_len )
332         {
333             assert( read_end + ref_offset[ cur_off ] == si );
334             cur_off++;
335             BUF_WRITE( 'I', read_end - si );
336         }
337         else
338         {
339             BUF_WRITE( ops->opcode_S, read_end - si );
340         }
341     }
342     *bsize = di;
343     if ( ro_offset != NULL )
344         *ro_offset = cur_off;
345 
346     return 0;
347 #undef BUF_WRITE
348 }
349 
350 static
cigar_string_2(KDataBuffer * dst,size_t boff,INSDC_coord_len * bsize,const int version,bool const has_mismatch[],bool const has_ref_offset[],INSDC_coord_zero const read_start,INSDC_coord_zero const read_end,int32_t const ref_offset[],unsigned const ro_len,unsigned * ro_offset,unsigned const reflen,bool use_S)351 rc_t cigar_string_2(KDataBuffer *dst, size_t boff,
352                     INSDC_coord_len *bsize, const int version,
353                     bool const has_mismatch[], bool const has_ref_offset[],
354                     INSDC_coord_zero const read_start, INSDC_coord_zero const read_end,
355                     int32_t const ref_offset[], unsigned const ro_len, unsigned * ro_offset,
356                     unsigned const reflen, bool use_S)
357 {
358     cigar_opcode_options_t const ops = {
359         version == 1 ? '=' : 'M',
360         version == 1 ? 'X' : 'M',
361         use_S ? 'S' : 'I'
362     };
363     return cigar_string_2_0(dst, boff, bsize, has_mismatch, has_ref_offset, read_start, read_end, ref_offset, ro_len, ro_offset, reflen, &ops);
364 }
365 
366 static
cigar_string_2_1(KDataBuffer * dst,size_t boff,INSDC_coord_len * bsize,const int version,bool const has_mismatch[],bool const has_ref_offset[],INSDC_coord_zero const read_start,INSDC_coord_zero const read_end,int32_t const ref_offset[],unsigned const ro_len,unsigned ro_offset[],uint8_t const ref_offset_type[],unsigned const reflen)367 rc_t cigar_string_2_1(KDataBuffer *dst, size_t boff,
368                       INSDC_coord_len *bsize, const int version,
369                       bool const has_mismatch[], bool const has_ref_offset[],
370                       INSDC_coord_zero const read_start, INSDC_coord_zero const read_end,
371                       int32_t const ref_offset[], unsigned const ro_len, unsigned ro_offset[],
372                       uint8_t const ref_offset_type[],
373                       unsigned const reflen)
374 {
375     int ri;
376     unsigned si = read_start;
377     unsigned di;
378     rc_t rc;
379     unsigned nwrit;
380     unsigned cur_off = ro_offset ? *ro_offset : 0;
381     unsigned op_len;
382     int opcode;
383     int const opM = (version & 1) ? '=' : 'M';
384     int const opX = (version & 1) ? 'X' : 'M';
385 
386 #define BUF_WRITE(OP, LEN) { if ((rc = op2b(dst, (unsigned const)(di + boff), &nwrit, (int const)(OP), (unsigned)(LEN))) != 0) return rc; di += nwrit; }
387     if (read_start == read_end && reflen > 0) {
388         /** full delete as a last ploidy ends up written nowhere  **/
389         di = 0;
390         opcode = 'D';
391         op_len = reflen;
392     }
393     else for (op_len = di = 0, opcode = ri = 0; si < (unsigned)read_end && ri <= (int)reflen; ) {
394         if (has_ref_offset[si]) {
395             int offs;
396             int type;
397 
398             if (op_len > 0) {
399                 BUF_WRITE(opcode, op_len);
400                 op_len = 0;
401             }
402             if (cur_off >= ro_len) /*** bad data ***/
403                 return RC(rcXF, rcFunction, rcExecuting, rcData, rcInvalid);
404 
405             type = ref_offset_type[cur_off];
406             offs = ref_offset[cur_off];
407             ++cur_off;
408 
409             if (offs < 0) {
410 		unsigned const ins = -offs;
411 		if( (version & 1) /*CIGAR_LONG*/ && type == 5 /** complete genomics **/ && ri >= ins /** safety **/){
412 			BUF_WRITE('B', ins);
413 			ri -= ins;
414 		} else {
415 			int const opc = (type == 1) ? 'S' : 'I';
416 			BUF_WRITE(opc, ins);
417 			si += ins;
418 			continue;
419 		}
420             } else if (offs > 0) {
421                 int const opc = type == 0 ? 'D' : 'N';
422 
423                 BUF_WRITE(opc, offs);
424                 ri += offs;
425             }
426         }
427         if (ri < (int)reflen) {
428             int const op_nxt = (has_mismatch[si] ? opX : opM);
429 
430             if (op_len == 0 || opcode == op_nxt)
431                 ++op_len;
432             else {
433                 BUF_WRITE(opcode, op_len);
434                 op_len = 1;
435             }
436             opcode = op_nxt;
437         }
438         else
439             break;
440         ++si;
441         ++ri;
442     }
443 
444     BUF_WRITE(opcode, op_len);
445     *bsize = di;
446     if (ro_offset != NULL)
447         *ro_offset = cur_off;
448 
449     return 0;
450 #undef BUF_WRITE
451 }
452 
right_soft_clip(unsigned seq_len,unsigned ref_len,unsigned noffsets,int32_t const ref_offset[])453 static INSDC_coord_len right_soft_clip(unsigned seq_len, unsigned ref_len,
454                                        unsigned noffsets,
455                                        int32_t const ref_offset[])
456 {
457     INSDC_coord_len a = ref_len;
458     unsigned i;
459 
460     for ( i = 0; i < noffsets; ++i )
461         a -= ref_offset[ i ];
462     return a < seq_len ? seq_len - a : 0;
463 }
464 
465 static
cigar_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])466 rc_t CC cigar_impl ( void *data, const VXformInfo *info, int64_t row_id,
467     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
468 {
469     self_t const *self = data;
470     unsigned const rdln   = ( unsigned const )argv[ 0 ].u.data.elem_count;
471     unsigned const ro_len = ( unsigned const )argv[ 2 ].u.data.elem_count;
472     bool const *has_mismatch   = argv[ 0 ].u.data.base;
473     bool const *has_ref_offset = argv[ 1 ].u.data.base;
474     int32_t const *ref_offset  = argv[ 2 ].u.data.base;
475     rc_t rc;
476 
477     assert( argv[ 0 ].u.data.elem_bits == 8 );
478     assert( argv[ 1 ].u.data.elem_bits == 8 );
479     assert( argv[ 2 ].u.data.elem_bits == 32 );
480 
481     assert( rdln == argv[ 1 ].u.data.elem_count );
482 
483     has_mismatch   += argv[ 0 ].u.data.first_elem;
484     has_ref_offset += argv[ 1 ].u.data.first_elem;
485     ref_offset     += argv[ 2 ].u.data.first_elem;
486 
487     rslt->data->elem_bits = 8;
488     if ( argc == 3 )
489     {
490         INSDC_coord_len count;
491 
492         rc = cigar_string( rslt->data, 0, &count, self->version & 0x1,
493                            has_mismatch, has_ref_offset,
494                            0, rdln, ref_offset, ro_len, NULL);
495         rslt->elem_count = count;
496     }
497     else
498     {
499         int32_t const *const rfln = argv[ 3 ].u.data.base;
500         INSDC_coord_len count;
501 
502         rc = cigar_string_2( rslt->data, 0, &count, self->version & 0x1,
503                             has_mismatch, has_ref_offset,
504                             0, rdln, ref_offset, ro_len, NULL,
505                             rfln[ argv[ 3 ].u.data.first_elem ], true );
506         rslt->elem_count = count;
507     }
508     return rc;
509 }
510 
511 static
cigar_impl_2(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])512 rc_t CC cigar_impl_2 ( void *data, const VXformInfo *info, int64_t row_id,
513                         VRowResult *rslt, uint32_t argc, const VRowData argv [] )
514 {
515     self_t const *self = data;
516     bool const *has_mismatch        = argv[0].u.data.base;
517     bool const *has_ref_offset      = argv[1].u.data.base;
518     int32_t const *ref_offset       = argv[2].u.data.base;
519     INSDC_coord_len const *read_len = argv[3].u.data.base;
520     uint8_t const *const ref_offset_type = argc >= 6 ? ((uint8_t const *)argv[5].u.data.base) + argv[5].u.data.first_elem : NULL;
521     uint32_t const nreads = (uint32_t const)argv[ 3 ].u.data.elem_count;
522     uint32_t const ro_len = (uint32_t const)argv[ 2 ].u.data.elem_count;
523     uint32_t n;
524     uint32_t ro_offset = 0;
525     rc_t rc = 0;
526     INSDC_coord_zero start;
527     INSDC_coord_len *cigar_len = NULL;
528     KDataBuffer *buf = ( self->version & 0x04 ) ? NULL : rslt->data;
529 
530     assert( argv[ 0 ].u.data.elem_bits == 8 );
531     assert( argv[ 1 ].u.data.elem_bits == 8 );
532     assert( argv[ 2 ].u.data.elem_bits == 32 );
533     assert( argv[ 3 ].u.data.elem_bits == 32 );
534 
535     has_mismatch   += argv[ 0 ].u.data.first_elem;
536     has_ref_offset += argv[ 1 ].u.data.first_elem;
537     ref_offset     += argv[ 2 ].u.data.first_elem;
538     read_len       += argv[ 3 ].u.data.first_elem;
539 
540     if (self->version & 0x4) {
541         rslt->data->elem_bits = sizeof(cigar_len[0]) * 8;
542         rslt->elem_count = nreads;
543 
544         rc = KDataBufferResize(rslt->data, rslt->elem_count);
545         if (rc != 0)
546             return rc;
547 
548         cigar_len = rslt->data->base;
549         if (argv[0].u.data.elem_count == 0 || argv[1].u.data.elem_count == 0) {
550             memset(cigar_len, 0, sizeof(cigar_len[0]) * nreads);
551             return 0;
552         }
553     }
554     else {
555         rslt->data->elem_bits = 8;
556         rslt->elem_count = 0;
557     }
558 
559     for (n = 0, start = 0, ro_offset = 0; n < nreads; start += read_len[n], ++n) {
560         INSDC_coord_len cnt;
561         INSDC_coord_len *const count = (self->version & 0x04) ? cigar_len + n : &cnt;
562 
563         if (argc == 4) {
564             rc = cigar_string(buf, (size_t)rslt->elem_count, count, self->version & 0x1,
565                               has_mismatch, has_ref_offset,
566                               start, start + read_len[n],
567                               ref_offset, ro_len, &ro_offset);
568         }
569         else if (argc == 5) {
570             int32_t const *const reflen = argv[4].u.data.base;
571 
572             rc = cigar_string_2(buf, (size_t)rslt->elem_count, count, self->version & 0x1,
573                                 has_mismatch, has_ref_offset,
574                                 start, start + read_len[n],
575                                 ref_offset, ro_len, &ro_offset,
576                                 reflen[argv[4].u.data.first_elem], nreads == 1 ? true : false);
577         }
578         else { /* should be new function */
579             int32_t const *const reflen = argv[4].u.data.base;
580 
581             rc = cigar_string_2_1(buf, (size_t)rslt->elem_count, count, self->version & 0x1,
582                                   has_mismatch, has_ref_offset,
583                                   start, start + read_len[n],
584                                   ref_offset, ro_len, &ro_offset,
585                                   ref_offset_type,
586                                   reflen[argv[4].u.data.first_elem]);
587         }
588         if (rc != 0)
589             return rc;
590         if ((self->version & 0x04) == 0)
591             rslt->elem_count += cnt;
592     }
593     return 0;
594 }
595 
596 static
self_whack(void * ptr)597 void CC self_whack( void *ptr )
598 {
599     free( ptr );
600 }
601 
602 
603 /*
604  * function
605  * ascii ALIGN:cigar #1 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset);
606  */
607 VTRANSFACT_IMPL ( ALIGN_cigar, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
608     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
609 {
610     self_t self;
611 
612     self.version =cp -> argv [ 0 ] . data . u8 [ 0 ];
613     switch(self.version){
614     case 0:
615     case 1:
616         break;
617     default:
618         return RC ( rcXF, rcFunction, rcConstructing, rcParam, rcIncorrect );
619     }
620     rslt->u.rf = cigar_impl;
621     rslt->variant = vftRow;
622     rslt -> self = malloc ( sizeof self );
623     memmove(rslt -> self,&self,sizeof(self));
624     rslt -> whack = self_whack;
625 
626     return 0;
627 }
628 
629 VTRANSFACT_IMPL ( ALIGN_cigar_2, 2, 0, 0 ) ( const void *Self, const VXfactInfo *info,
630     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
631 {
632     self_t self;
633     VTypedesc const return_type = info->fdesc.desc;
634     int const version = cp->argv[0].data.u8[0];
635 
636     self.version = version;
637     switch (version) {
638     case 0:
639     case 1:
640         break;
641     default:
642         return RC( rcXF, rcFunction, rcConstructing, rcParam, rcIncorrect );
643     }
644 
645     if (return_type.domain == vtdAscii && return_type.intrinsic_bits == 8)
646     {
647         self.version |= 0x2;
648     }
649     else if (return_type.domain == vtdUint && return_type.intrinsic_bits == sizeof(INSDC_coord_len) * 8)
650     {
651         self.version |= 0x4;
652     }
653     else
654     {
655         return RC( rcXF, rcFunction, rcConstructing, rcParam, rcIncorrect );
656     }
657 
658     rslt->u.rf = cigar_impl_2;
659     rslt->variant = vftRow;
660     rslt->self = malloc(sizeof(self));
661     if (rslt->self == NULL)
662         return RC( rcXF, rcFunction, rcConstructing, rcMemory, rcExhausted );
663 
664     memmove(rslt->self, &self, sizeof(self));
665     rslt->whack = self_whack;
666     return 0;
667 }
668 
669 static
edit_distance_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])670 rc_t CC edit_distance_impl ( void *data, const VXformInfo *info, int64_t row_id,
671                              VRowResult *rslt, uint32_t argc, const VRowData argv [] )
672 {
673     rc_t rc;
674     uint32_t i, roi, mrun;
675     uint32_t len = ( uint32_t ) argv[ 0 ].u.data.elem_count;
676     uint32_t *dst;
677 
678     uint8_t const *has_mismatch   = argv [ 0 ] . u . data . base;
679     uint8_t const *has_ref_offset = argv [ 1 ] . u . data . base;
680     int32_t const *ref_offset     = argv [ 2 ] . u . data . base;
681 
682     assert( argv[ 0 ].u.data.elem_bits == 8 );
683     assert( argv[ 1 ].u.data.elem_bits == 8 );
684     assert( argv[ 2 ].u.data.elem_bits == 32 );
685 
686     assert( len == argv[ 1 ].u.data.elem_count );
687 
688     has_mismatch   += argv [ 0 ] . u . data . first_elem;
689     has_ref_offset += argv [ 1 ] . u . data . first_elem;
690     ref_offset     += argv [ 2 ] . u . data . first_elem;
691 
692     /* resize output row for the total number of reads */
693     rslt->data->elem_bits = rslt->elem_bits;
694     rc = KDataBufferResize( rslt -> data, 1 );
695     if ( rc != 0 )
696         return rc;
697 
698     rslt -> elem_count = 1;
699     dst = rslt -> data -> base;
700     dst[ 0 ] = 0;
701     if( len == 0 )
702         return 0; /** nothing to do **/
703 
704     if ( has_ref_offset[ 0 ] ) /** skip mismatches from the beginning == soft clip ***/
705     {
706         if ( ref_offset[ 0 ] > 0 )  /**** deletes in the beginning are disallowed, REF_START should have been moved and delete converted to insert **/
707             return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid  );
708 
709         i = -( ref_offset [ 0 ] );
710         roi = 1;
711         mrun = 0;
712     }
713     else
714     {
715         i = roi = 0;
716     }
717 
718     for ( mrun = 0; i < len; i++ )
719     {
720         if ( has_mismatch[ i ] )
721         {
722             mrun++;
723         }
724         else /*** intentionally skipping last run of mismatches **/
725         {
726             dst[ 0 ] += mrun;
727             mrun = 0;
728         }
729     }
730     return 0;
731 }
732 
733 /*
734  * function
735  * ascii NCBI:align:edit_distance #1 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset);
736  */
737 VTRANSFACT_IMPL ( NCBI_align_edit_distance, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
738     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
739 {
740     rslt->u.rf = edit_distance_impl;
741     rslt->variant = vftRow;
742     return 0;
743 }
744 
745 
746 /*
747  * edit distance = sum of lengths of inserts
748  *               + sum of lengths of deletes
749  *               + number of mismatches
750  * excluding soft clips
751  */
752 
edit_distance(bool const has_ref_offset[],bool const has_mismatch[],unsigned const readlen,unsigned const reflen,int32_t const ref_offset[],unsigned const offsets)753 unsigned edit_distance(bool const has_ref_offset[],
754                        bool const has_mismatch[],
755                        unsigned const readlen,
756                        unsigned const reflen,
757                        int32_t const ref_offset[],
758                        unsigned const offsets)
759 {
760     if ( readlen == 0 )
761     {
762         /* full delete */
763         return reflen;
764     }
765     else
766     {
767         INSDC_coord_len const rsc = right_soft_clip( readlen, reflen, offsets, ref_offset );
768         unsigned indels = 0;
769         unsigned misses = 0;
770         unsigned i = 0;
771         unsigned j = 0;
772 
773         if ( has_ref_offset[ 0 ] && ref_offset[ j ] < 0 )
774             j = i = 1;
775 
776         /* sum of insert lengths + sum of delete lengths excluding soft clips */
777         for ( ; i < readlen - rsc; ++i )
778         {
779             if ( has_ref_offset[ i ] )
780             {
781                 int const offset = ref_offset[ j++ ];
782 
783                 if ( offset < 0 )
784                     indels += -offset;
785                 else
786                     indels +=  offset;
787             }
788         }
789 
790         /* sum of mismatches not in inserts or soft clips */
791         for ( j = i = 0; i < readlen - rsc; )
792         {
793             if ( has_ref_offset[ i ] )
794             {
795                 int offset = ref_offset[ j++ ];
796 
797                 if ( offset < 0 )
798                 {
799                     i += -offset;
800                     continue;
801                 }
802             }
803             misses += has_mismatch[ i ] ? 1 : 0;
804             ++i;
805         }
806         return indels + misses;
807     }
808 }
809 
810 
811 static
edit_distance_2_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])812 rc_t CC edit_distance_2_impl ( void *data, const VXformInfo *info, int64_t row_id,
813     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
814 {
815     rc_t rc;
816     unsigned const nreads = argc > 4 ? ( unsigned const )argv[ 4 ].u.data.elem_count : 1;
817     unsigned const len = ( unsigned const )argv[ 0 ].u.data.elem_count;
818     unsigned const noffsets = ( unsigned const )argv[ 2 ].u.data.elem_count;
819     INSDC_coord_len const dummy_rl = len;
820 
821     ARG_ALIAS     ( bool           , has_mismatch  , 0 );
822     ARG_ALIAS     ( bool           , has_ref_offset, 1 );
823     ARG_ALIAS     ( int32_t        , ref_offset    , 2 );
824     ARG_ALIAS     ( INSDC_coord_len, ref_len       , 3 );
825     ARG_ALIAS_COND( INSDC_coord_len, readlen       , 4, &dummy_rl );
826 
827     assert( argv[ 0 ].u.data.elem_bits == sizeof( has_mismatch  [ 0 ] ) * 8 );
828     assert( argv[ 1 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
829     assert( argv[ 2 ].u.data.elem_bits == sizeof( ref_offset    [ 0 ] ) * 8 );
830     assert( argv[ 3 ].u.data.elem_bits == sizeof( ref_len       [ 0 ] ) * 8 );
831 
832     rslt->data->elem_bits = rslt->elem_bits;
833     if ( len == 0 )
834     {
835         return KDataBufferResize( rslt->data, rslt->elem_count = 0 );
836     }
837 
838     assert( len == argv[ 1 ].u.data.elem_count );
839 
840     rslt->elem_count = nreads;
841     rc = KDataBufferResize( rslt->data, nreads );
842     if ( rc == 0 )
843     {
844         unsigned i;
845         unsigned start = 0;
846         unsigned offset = 0;
847         uint32_t *const dst = rslt->data->base;
848 
849         for ( i = 0; i < nreads; ++i )
850         {
851             unsigned const rlen = readlen[ i ];
852             unsigned j;
853             unsigned offsets = 0;
854 
855             for ( j = 0; j < rlen; ++j )
856             {
857                 if ( has_ref_offset[ start + j ] )
858                     ++offsets;
859             }
860 
861             if ( offsets + offset > noffsets )
862                 return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
863 
864             dst[ i ] = edit_distance( has_ref_offset + start,
865                                    has_mismatch + start,
866                                    rlen,
867                                    ref_len[ 0 ],
868                                    ref_offset + offset,
869                                    offsets );
870             start += rlen;
871             offset += offsets;
872         }
873     }
874     return rc;
875 }
876 
877 
878 /*
879  * function
880  * U32 NCBI:align:edit_distance #2 ( bool has_mismatch, bool has_ref_offset,
881  *     I32 ref_offset, INSDC:coord:len ref_len );
882  */
883 VTRANSFACT_IMPL ( NCBI_align_edit_distance_2, 2, 0, 0 ) ( const void *Self, const VXfactInfo *info,
884     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
885 {
886     rslt->u.rf = edit_distance_2_impl;
887     rslt->variant = vftRow;
888     return 0;
889 }
890 
891 
892 /*
893  * function
894  * U32 NCBI:align:edit_distance #3 ( bool has_mismatch, bool has_ref_offset,
895  *     I32 ref_offset, NCBI:align:ro_type ref_offset_type, INSDC:coord:len read_len );
896  */
897 static
edit_distance_3_impl(void * const data,VXformInfo const info[],int64_t const row_id,VRowResult rslt[],uint32_t const argc,VRowData const argv[])898 rc_t CC edit_distance_3_impl(void *const data, VXformInfo const info[],
899                              int64_t const row_id,
900                              VRowResult rslt[],
901                              uint32_t const argc,
902                              VRowData const argv[/* argc */])
903 {
904     rc_t rc;
905     unsigned const nreads = (unsigned const)argv[4].u.data.elem_count;
906     ARG_ALIAS(              bool, has_mismatch   , 0);
907     ARG_ALIAS(              bool, has_ref_offset , 1);
908     ARG_ALIAS(           int32_t, ref_offset     , 2);
909     ARG_ALIAS(NCBI_align_ro_type, ref_offset_type, 3);
910     ARG_ALIAS(   INSDC_coord_len, read_len       , 4);
911 
912     rslt->data->elem_bits = rslt->elem_bits;
913     rslt->elem_count = nreads;
914     rc = KDataBufferResize(rslt->data, nreads);
915     if (rc == 0) {
916         uint32_t *const result = rslt->data->base;
917         unsigned cur = 0;
918         unsigned cur_ro = 0;
919         unsigned n;
920 
921         for (n = 0; n < nreads; ++n) {
922             unsigned const len = read_len[n];
923             unsigned j;
924             unsigned miss = 0;
925             unsigned indel = 0;
926             unsigned cur_ro2 = cur_ro;
927 
928             for (j = 0; j < len; ) {
929                 if (has_ref_offset[cur] != 0) {
930                     int const offset = ref_offset[cur_ro];
931 
932                     ++cur_ro;
933                     if (offset < 0) {
934                         unsigned const dist = -offset;
935 
936                         cur += dist;
937                         j += dist;
938                         continue;
939                     }
940                 }
941                 if (has_mismatch[cur])
942                     ++miss;
943                 ++cur;
944                 ++j;
945             }
946             while (cur_ro2 < cur_ro) {
947                 int const type = ref_offset_type[cur_ro2];
948 
949                 if (type == 0) {
950                     int const offset = ref_offset[cur_ro2];
951 
952                     if (offset < 0)
953                         indel += -offset;
954                     else
955                         indel += +offset;
956                 }
957                 ++cur_ro2;
958             }
959             result[n] = miss + indel;
960         }
961     }
962     return rc;
963 }
964 
965 VTRANSFACT_IMPL ( NCBI_align_edit_distance_3, 3, 0, 0 ) ( const void *Self, const VXfactInfo *info,
966      VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
967 {
968     rslt->u.rf = edit_distance_3_impl;
969     rslt->variant = vftRow;
970     return 0;
971 }
972 
973 
974 /*
975  * function bool ALIGN:generate_has_mismatch #1 (INSDC:4na:bin reference,
976  *     INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset);
977  */
978 static
generate_has_mismatch_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])979 rc_t CC generate_has_mismatch_impl ( void *data, const VXformInfo *info, int64_t row_id,
980     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
981 {
982     rc_t rc;
983     int32_t si, ri, roi;
984     uint32_t ref_len = ( uint32_t )argv[ 0 ].u.data.elem_count;
985     uint32_t sbj_len = ( uint32_t )argv[ 1 ].u.data.elem_count;
986     uint32_t hro_len = ( uint32_t )argv[ 2 ].u.data.elem_count;
987     uint32_t ro_len  = ( uint32_t )argv[ 3 ].u.data.elem_count;
988     const uint8_t *ref  = argv [ 0 ] . u . data . base;
989     const uint8_t *sbj  = argv [ 1 ] . u . data . base;
990     const uint8_t *has_ref_offset     = argv [ 2 ] . u . data . base;
991     const int32_t *ref_offset = argv [ 3 ] . u . data . base;
992 
993     uint8_t * dst;
994     uint32_t  len = 0;
995 
996     rslt -> data -> elem_bits = 8;
997     if ( sbj_len == 0 )
998     {
999         rc = KDataBufferResize ( rslt -> data, 0 );
1000         if ( rc != 0 )
1001             return rc;
1002         rslt -> elem_count = 0;
1003         return 0;
1004     }
1005     assert( sbj_len == hro_len );
1006     len = sbj_len;
1007 
1008     ref            += argv [ 0 ] . u . data . first_elem;
1009     sbj            += argv [ 1 ] . u . data . first_elem;
1010     has_ref_offset += argv [ 2 ] . u . data . first_elem;
1011     ref_offset     += argv [ 3 ] . u . data . first_elem;
1012 
1013     /* resize output row for the total number of reads */
1014     rslt -> data -> elem_bits = 8;
1015     rc = KDataBufferResize ( rslt -> data, len );
1016     if ( rc != 0 )
1017         return rc;
1018     rslt -> elem_count = len;
1019     dst = rslt -> data->base;
1020     for ( si = ri = roi = 0; si < ( int32_t )len; si++, ri++ )
1021     {
1022         if ( has_ref_offset[ si ] != 0 ) /*** need to offset the reference ***/
1023         {
1024             if ( roi >= ( int32_t )ro_len )
1025             {
1026                 return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
1027             }
1028             ri += ref_offset[ roi++ ];
1029         }
1030 
1031         if ( ri >= 0 && ri < ( int32_t )ref_len && sbj[ si ] == ref[ ri ] )
1032             dst[ si ]=0;
1033         else
1034             dst[ si ]=1;
1035     }
1036     return 0;
1037 }
1038 
1039 
1040 VTRANSFACT_IMPL ( ALIGN_generate_has_mismatch, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1041     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1042 {
1043     rslt->u.rf = generate_has_mismatch_impl;
1044     rslt->variant = vftRow;
1045     rslt -> self = NULL;
1046     rslt -> whack = NULL;
1047     return 0;
1048 }
1049 
1050 /*
1051  * function bool ALIGN:generate_mismatch #1 (INSDC:4na:bin reference,INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset);
1052  */
1053 static
generate_mismatch_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1054 rc_t CC generate_mismatch_impl ( void *data, const VXformInfo *info, int64_t row_id,
1055     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1056 {
1057     rc_t rc;
1058     int32_t si, ri, roi;
1059     uint32_t ref_len = ( uint32_t )argv[ 0 ].u.data.elem_count;
1060     uint32_t sbj_len = ( uint32_t )argv[ 1 ].u.data.elem_count;
1061     uint32_t hro_len = ( uint32_t )argv[ 2 ].u.data.elem_count;
1062     uint32_t ro_len  = ( uint32_t )argv[ 3 ].u.data.elem_count;
1063     const uint8_t *ref  = argv [ 0 ] . u . data . base;
1064     const uint8_t *sbj  = argv [ 1 ] . u . data . base;
1065     const uint8_t *has_ref_offset     = argv [ 2 ] . u . data . base;
1066     const int32_t *ref_offset = argv [ 3 ] . u . data . base;
1067     uint8_t	buf[ 5 * 1024 ];
1068     uint32_t  len;
1069 
1070     rslt -> data -> elem_bits = 8;
1071     if ( sbj_len == 0 )
1072     {
1073         return KDataBufferResize( rslt->data, rslt->elem_count = 0 );
1074     }
1075     assert( sbj_len == hro_len );
1076 
1077     ref            += argv [ 0 ] . u . data . first_elem;
1078     sbj            += argv [ 1 ] . u . data . first_elem;
1079     has_ref_offset += argv [ 2 ] . u . data . first_elem;
1080     ref_offset     += argv [ 3 ] . u . data . first_elem;
1081 
1082     for ( si = ri = roi = 0, len = 0; si < ( int32_t )sbj_len; si++, ri++ )
1083     {
1084         if ( has_ref_offset[ si ] != 0 )/*** need to offset the reference ***/
1085         {
1086             if ( roi >= ( int32_t )ro_len )
1087             {
1088                 return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
1089             }
1090             ri += ref_offset[ roi++ ];
1091         }
1092 
1093         if ( ri >=0 && ri < ( int32_t )ref_len && sbj[ si ] == ref[ ri ] )
1094         {
1095             /*noop*/
1096         }
1097         else
1098         {
1099             if ( len > sizeof( buf ) )
1100                 return RC( rcXF, rcFunction, rcExecuting, rcBuffer, rcInsufficient );
1101 
1102             buf[ len++ ] = sbj[ si ];
1103         }
1104     }
1105 
1106     /* resize output row for the total number of reads */
1107     rc = KDataBufferResize ( rslt -> data, len );
1108     if ( rc != 0 )
1109         return rc;
1110     rslt -> elem_count = len;
1111     memmove( rslt -> data->base, buf, len );
1112     return 0;
1113 }
1114 
1115 
1116 VTRANSFACT_IMPL ( ALIGN_generate_mismatch, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1117     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1118 {
1119     rslt->u.rf = generate_mismatch_impl;
1120     rslt->variant = vftRow;
1121     rslt -> self = NULL;
1122     rslt -> whack = NULL;
1123     return 0;
1124 }
1125 
1126 
1127 
1128 /*
1129  * function INSDC:quality:phred NCBI:align:generate_mismatch_qual #1 (INSDC:quality:phred qual,bool has_mismatch)
1130  */
1131 static
generate_mismatch_qual_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1132 rc_t CC generate_mismatch_qual_impl ( void *data, const VXformInfo *info, int64_t row_id,
1133     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1134 {
1135     rc_t rc;
1136     const uint8_t *q    = argv[ 0 ].u.data.base;
1137     const uint8_t *h_mm = argv[ 1 ].u.data.base;
1138     uint8_t	buf[ 5 * 1024 ];
1139     uint32_t mm_cnt, i;
1140 
1141     q    += argv[ 0 ].u.data.first_elem;
1142     h_mm += argv[ 1 ].u.data.first_elem;
1143 
1144     for ( mm_cnt = 0, i = 0; i < argv[ 0 ].u.data.elem_count; i++ )
1145     {
1146         if( h_mm[ i ] )
1147         {
1148             buf[ mm_cnt++ ] = q[ i ];
1149         }
1150     }
1151 
1152     /* resize output row for the total number of reads */
1153     rslt -> data -> elem_bits = 8;
1154     rc = KDataBufferResize ( rslt -> data, mm_cnt );
1155     if ( rc != 0 )
1156         return rc;
1157     rslt -> elem_count = mm_cnt;
1158     if ( mm_cnt > 0 )
1159         memmove( rslt -> data->base, buf, mm_cnt );
1160 
1161     return 0;
1162 }
1163 
1164 
1165 VTRANSFACT_IMPL ( ALIGN_generate_mismatch_qual, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1166     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1167 {
1168     rslt->u.rf = generate_mismatch_qual_impl;
1169     rslt->variant = vftRow;
1170     rslt -> self = NULL;
1171     rslt -> whack = NULL;
1172     return 0;
1173 }
1174 
1175 
1176 
1177 /*
1178  * function ascii NCBI:align:get_mismatch_read #1
1179  *    ( bool has_mismatch, INSDC:dna:text mismatch )
1180  */
1181 static
get_mismatch_read_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1182 rc_t CC get_mismatch_read_impl ( void *data, const VXformInfo *info, int64_t row_id,
1183     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1184 {
1185     rc_t rc;
1186     char *result;
1187     unsigned const readlen = ( unsigned const )argv[ 0 ].u.data.elem_count;
1188 
1189     rslt->data->elem_bits = sizeof( result[ 0 ] ) * 8;
1190     rslt->elem_count = readlen;
1191     rc = KDataBufferResize( rslt->data, rslt->elem_count );
1192     if ( rc == 0 )
1193     {
1194         unsigned i;
1195         unsigned j;
1196         bool const *has_mismatch = argv[ 0 ].u.data.base;
1197         char const *mismatch = argv[ 1 ].u.data.base;
1198 
1199         assert( argv[ 0 ].u.data.elem_bits == sizeof( has_mismatch[ 0 ] ) * 8 );
1200         has_mismatch += argv[ 0 ].u.data.first_elem;
1201 
1202         assert( argv[ 1 ].u.data.elem_bits == sizeof( mismatch[ 0 ] ) * 8 );
1203         mismatch += argv[ 1 ].u.data.first_elem;
1204 
1205         result = rslt->data->base;
1206         for ( i = j = 0; i != readlen; ++i )
1207         {
1208             result[ i ] = has_mismatch[ i ] ? mismatch[ j++ ] : '=';
1209         }
1210     }
1211 
1212     return rc;
1213 }
1214 
1215 
1216 VTRANSFACT_IMPL ( NCBI_align_get_mismatch_read, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1217     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1218 {
1219     rslt->u.rf = get_mismatch_read_impl;
1220     rslt->variant = vftRow;
1221     rslt -> self = NULL;
1222     rslt -> whack = NULL;
1223     return 0;
1224 }
1225 
1226 
1227 
1228 /*
1229  * function INSDC:coord:len NCBI:align:get_left_soft_clip #1 (
1230  *     bool has_ref_offset, INSDC:coord:zero ref_offset )
1231  */
1232 static
left_soft_clip_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1233 rc_t CC left_soft_clip_impl ( void *data, const VXformInfo *info, int64_t row_id,
1234     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1235 {
1236     rc_t rc;
1237     INSDC_coord_len result;
1238     unsigned const n_offsets = ( unsigned const )argv[ 1 ].u.data.elem_count;
1239 
1240     result = 0;
1241 
1242     if ( n_offsets > 0 )
1243     {
1244         bool const *has_ref_offset = argv[ 0 ].u.data.base;
1245         int32_t const *ref_offset = argv[ 1 ].u.data.base;
1246 
1247         assert( argv[ 0 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
1248         assert( argv[ 1 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
1249 
1250         has_ref_offset += argv[ 0 ].u.data.first_elem;
1251         ref_offset += argv[ 1 ].u.data.first_elem;
1252 
1253         if ( has_ref_offset[ 0 ] && ref_offset[ 0 ] < 0 )
1254         {
1255             result = -ref_offset[ 0 ];
1256         }
1257     }
1258 
1259     rslt->data->elem_bits = sizeof( result ) * 8;
1260     rslt->elem_count = 1;
1261     rc = KDataBufferResize( rslt->data, 1 );
1262     if ( rc == 0 )
1263         memmove( rslt->data->base, &result, sizeof( result ) );
1264 
1265     return rc;
1266 }
1267 
1268 
1269 static
left_soft_clip_impl_2(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1270 rc_t CC left_soft_clip_impl_2 ( void *data, const VXformInfo *info, int64_t row_id,
1271     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1272 {
1273     rc_t rc;
1274     INSDC_coord_len* result;
1275     const bool* has_ref_offset = argv[ 0 ].u.data.base;
1276     const int32_t* ref_offset = argv[ 1 ].u.data.base;
1277     const uint64_t n_offsets = argv[ 1 ].u.data.elem_count;
1278     const INSDC_coord_len* read_len = argv[ 2 ].u.data.base;
1279     const uint32_t nreads = ( const uint32_t )argv[ 2 ].u.data.elem_count;
1280     uint32_t n, roi;
1281     INSDC_coord_zero start;
1282 
1283     assert( argv[ 0 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
1284     assert( argv[ 1 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
1285     assert( argv[ 2 ].u.data.elem_bits == sizeof( read_len[ 0 ] ) * 8 );
1286 
1287     has_ref_offset += argv[ 0 ].u.data.first_elem;
1288     ref_offset += argv[ 1 ].u.data.first_elem;
1289     read_len += argv[ 2 ].u.data.first_elem;
1290 
1291     rslt->data->elem_bits = sizeof( *result ) * 8;
1292     rslt->elem_count = nreads;
1293     rc = KDataBufferResize( rslt->data, rslt->elem_count );
1294     result = rslt->data->base;
1295 
1296     for ( n = roi = start = 0; rc == 0 && n < nreads; start += read_len[ n++ ] )
1297     {
1298         if ( has_ref_offset[ start ] && ref_offset[ roi ] < 0 )
1299         {
1300             if ( roi >= n_offsets )
1301             {
1302                 rc = RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
1303                 break;
1304             }
1305             result[ n ] = -ref_offset[ roi ];
1306         }
1307         else
1308         {
1309             result[ n ] = 0;
1310         }
1311 
1312         if ( n < nreads - 1 && roi < n_offsets )
1313         {
1314             /* scroll through has_ref_offset and consume ref_offsets for this read
1315                only if there is next read or more offsets */
1316             uint32_t k;
1317             for ( k = start; k < start + read_len[ n ]; k++ )
1318             {
1319                 if ( has_ref_offset[ k ] )
1320                 {
1321                     if ( roi >= n_offsets )
1322                     {
1323                         rc = RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
1324                         break;
1325                     }
1326                     roi++;
1327                 }
1328             }
1329         }
1330     }
1331 
1332     return rc;
1333 }
1334 
1335 
1336 VTRANSFACT_IMPL ( NCBI_align_get_left_soft_clip, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1337     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1338 {
1339     rslt->u.rf = left_soft_clip_impl;
1340     rslt->variant = vftRow;
1341     rslt -> self = NULL;
1342     rslt -> whack = NULL;
1343     return 0;
1344 }
1345 
1346 
1347 VTRANSFACT_IMPL ( NCBI_align_get_left_soft_clip_2, 2, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1348     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1349 {
1350     rslt->u.rf = left_soft_clip_impl_2;
1351     rslt->variant = vftRow;
1352     rslt -> self = NULL;
1353     rslt -> whack = NULL;
1354     return 0;
1355 }
1356 
1357 
1358 /*
1359  * function INSDC:coord:len NCBI:align:get_right_soft_clip #1 (
1360  *     bool has_mismatch, INSDC:coord:len left_clip )
1361  */
1362 static
right_soft_clip_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1363 rc_t CC right_soft_clip_impl ( void *data, const VXformInfo *info, int64_t row_id,
1364     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1365 {
1366     rc_t rc;
1367     INSDC_coord_len result = 0;
1368     INSDC_coord_len left = 0;
1369     uint32_t right = ( uint32_t )argv[ 0 ].u.data.elem_count;
1370     bool const *has_mismatch = argv[ 0 ].u.data.base;
1371     bool const *has_ref_offset = NULL;
1372     int32_t    last_ref_offset = 0;
1373 
1374     assert( argv[ 0 ].u.data.elem_bits == sizeof( has_mismatch[ 0 ] ) * 8 );
1375     has_mismatch += argv[ 0 ].u.data.first_elem;
1376 
1377     if ( argc > 2 )
1378     {
1379         has_ref_offset = argv[ 2 ].u.data.base;
1380         has_ref_offset += argv[ 2 ].u.data.first_elem;
1381         if ( argc > 3 )
1382         {
1383             const int32_t *ro = argv[ 3 ].u.data.base;
1384             ro += argv[ 3 ].u.data.first_elem;
1385             if ( argv[ 3 ].u.data.elem_count > 0 )
1386             {
1387                 last_ref_offset = ro[ argv[ 3 ].u.data.elem_count - 1 ];
1388             }
1389         }
1390     }
1391 
1392     assert( argv[ 1 ].u.data.elem_bits == sizeof( left ) * 8 );
1393     left = ( ( INSDC_coord_len const * )argv[ 1 ].u.data.base )[ argv[ 1 ].u.data.first_elem ];
1394 
1395     while ( right != left && has_mismatch[ right - 1 ] &&
1396             ( has_ref_offset == NULL || has_ref_offset[ right - 1 ] == 0 ) )
1397     {
1398         ++result;
1399         --right;
1400     }
1401 
1402     while ( right > 0 && last_ref_offset < 0 && has_ref_offset[ right - 1 ] == 0 ) /*** some mismatches from left needs to be recovered to cover for inserts **/
1403     {
1404         last_ref_offset++;
1405         right--;
1406     }
1407 
1408     if ( last_ref_offset < -1 )
1409     {
1410         last_ref_offset ++;
1411         if ( result < (INSDC_coord_len)-last_ref_offset )
1412             result=0;
1413         else
1414             result += last_ref_offset;
1415     }
1416     else if ( last_ref_offset > 0 )
1417     {
1418         result += last_ref_offset;
1419     }
1420 
1421     rslt->data->elem_bits = sizeof( result ) * 8;
1422     rslt->elem_count = 1;
1423     rc = KDataBufferResize( rslt->data, 1 );
1424     if ( rc == 0 )
1425         memmove( rslt->data->base, &result, sizeof( result ) );
1426 
1427     return rc;
1428 }
1429 
1430 
1431 VTRANSFACT_IMPL ( NCBI_align_get_right_soft_clip, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1432     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1433 {
1434     rslt->u.rf = right_soft_clip_impl;
1435     rslt->variant = vftRow;
1436     rslt -> self = NULL;
1437     rslt -> whack = NULL;
1438     return 0;
1439 }
1440 
1441 
1442 VTRANSFACT_IMPL ( NCBI_align_get_right_soft_clip_2, 2, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1443     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1444 {
1445     rslt->u.rf = right_soft_clip_impl;
1446     rslt->variant = vftRow;
1447     rslt -> self = NULL;
1448     rslt -> whack = NULL;
1449     return 0;
1450 }
1451 
1452 
1453 /*
1454  * function INSDC:coord:len NCBI:align:get_right_soft_clip #3
1455  *     ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len ref_len )
1456  */
1457 static
right_soft_clip_3_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1458 rc_t CC right_soft_clip_3_impl ( void *data, const VXformInfo *info, int64_t row_id,
1459                               VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1460 {
1461     rc_t rc;
1462     INSDC_coord_len const ref_len = ( ( INSDC_coord_len const * )argv[ 2 ].u.data.base )[ argv[ 2 ].u.data.first_elem ];
1463     unsigned const seq_len = ( unsigned const )argv[ 0 ].u.data.elem_count;
1464     int32_t const *ref_offset  = &( ( int32_t const * )argv[ 1 ].u.data.base )[ argv[ 1 ].u.data.first_elem ];
1465     unsigned const n = ( unsigned const )argv[ 1 ].u.data.elem_count;
1466 
1467     assert( argv[ 1 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
1468     assert( argv[ 2 ].u.data.elem_bits == sizeof( ref_len         ) * 8 );
1469 
1470     rslt->data->elem_bits = rslt->elem_bits;
1471     rslt->elem_count = 1;
1472     rc = KDataBufferResize( rslt->data, 1 );
1473     if ( rc == 0 )
1474     {
1475         ( ( INSDC_coord_len * )rslt->data->base )[ 0 ] = right_soft_clip( seq_len, ref_len, n, ref_offset );
1476     }
1477 
1478     return rc;
1479 }
1480 
1481 
1482 VTRANSFACT_IMPL ( NCBI_align_get_right_soft_clip_3, 3, 0, 0 )
1483     ( const void *Self, const VXfactInfo *info, VFuncDesc *rslt,
1484       const VFactoryParams *cp, const VFunctionParams *dp )
1485 {
1486     rslt->u.rf = right_soft_clip_3_impl;
1487     rslt->variant = vftRow;
1488     rslt -> self = NULL;
1489     rslt -> whack = NULL;
1490     return 0;
1491 }
1492 
1493 
1494 static
right_soft_clip_4_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1495 rc_t CC right_soft_clip_4_impl ( void *data, const VXformInfo *info, int64_t row_id,
1496                                  VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1497 {
1498     rc_t rc;
1499     INSDC_coord_len* result;
1500     const bool* has_ref_offset = argv[ 0 ].u.data.base;
1501     const int32_t* ref_offset = argv[ 1 ].u.data.base;
1502 #if 0
1503     const uint64_t n_offsets = argv[ 1 ].u.data.elem_count;
1504 #endif
1505     const INSDC_coord_len* read_len = argv[ 2 ].u.data.base;
1506     const uint32_t nreads = ( const uint32_t )argv[ 2 ].u.data.elem_count;
1507     const INSDC_coord_len* ref_len = argv[ 3 ].u.data.base;
1508     uint32_t n, roi;
1509     INSDC_coord_zero start;
1510 
1511     assert( argv[ 0 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
1512     assert( argv[ 1 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
1513     assert( argv[ 2 ].u.data.elem_bits == sizeof( read_len[ 0 ] ) * 8 );
1514     assert( argv[ 3 ].u.data.elem_bits == sizeof( ref_len[ 0 ] ) * 8 );
1515     assert( argv[ 3 ].u.data.elem_count == 1);
1516 
1517     has_ref_offset += argv[ 0 ].u.data.first_elem;
1518     ref_offset += argv[ 1 ].u.data.first_elem;
1519     read_len += argv[ 2 ].u.data.first_elem;
1520     ref_len += argv[ 3 ].u.data.first_elem;
1521 
1522     rslt->data->elem_bits = sizeof( *result ) * 8;
1523     rslt->elem_count = nreads;
1524     rc = KDataBufferResize( rslt->data, rslt->elem_count );
1525     result = rslt->data->base;
1526 
1527     for( n = roi = start = 0; rc == 0 && n < nreads; ++n )
1528     {
1529         INSDC_coord_len const rlen = read_len[ n ];
1530         unsigned k;
1531         unsigned offset_count;
1532 
1533         for ( k = offset_count = 0; k < rlen; ++k )
1534         {
1535             if ( has_ref_offset[ k + start ] )
1536                 ++offset_count;
1537         }
1538 
1539         result[ n ] = right_soft_clip( rlen, ref_len[ 0 ], offset_count, ref_offset );
1540         ref_offset += offset_count;
1541         start += rlen;
1542 #if 0
1543         result[ n ] = read_len[ n ] - ref_len[ 0 ];
1544         for ( k = start; k < start + read_len[ n ]; k++ )
1545         {
1546             if ( has_ref_offset[ k ] )
1547             {
1548                 if ( roi >= n_offsets )
1549                 {
1550                     rc = RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
1551                     break;
1552                 }
1553                 result[ n ] += ref_offset[ roi++ ];
1554             }
1555         }
1556 #endif
1557     }
1558 
1559     return rc;
1560 }
1561 
1562 VTRANSFACT_IMPL ( NCBI_align_get_right_soft_clip_4, 4, 0, 0 )
1563     ( const void *Self, const VXfactInfo *info, VFuncDesc *rslt,
1564       const VFactoryParams *cp, const VFunctionParams *dp )
1565 {
1566     rslt->u.rf = right_soft_clip_4_impl;
1567     rslt->variant = vftRow;
1568     rslt -> self = NULL;
1569     rslt -> whack = NULL;
1570     return 0;
1571 }
1572 
1573 /* extern function INSDC:coord:len NCBI:align:get_right_soft_clip #5
1574  *     ( bool has_ref_offset, I32 ref_offset, NCBI:align:ro_type ref_offset_type,
1575  *		 INSDC:coord:len read_len )
1576  */
1577 static
right_soft_clip_5_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1578 rc_t CC right_soft_clip_5_impl ( void *data, const VXformInfo *info, int64_t row_id,
1579                                 VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1580 {
1581     rc_t rc;
1582     unsigned const nreads = argv[3].u.data.elem_count;
1583     ARG_ALIAS(              bool,  has_ref_offset, 0);
1584     ARG_ALIAS(           int32_t,      ref_offset, 1);
1585     ARG_ALIAS(NCBI_align_ro_type, ref_offset_type, 2);
1586     ARG_ALIAS(   INSDC_coord_len,        read_len, 3);
1587 
1588     rslt->data->elem_bits = sizeof(INSDC_coord_len) * 8;
1589     rslt->elem_count = nreads;
1590     rc = KDataBufferResize(rslt->data, nreads);
1591     if (rc == 0) {
1592         INSDC_coord_len *const result = rslt->data->base;
1593         unsigned cur = 0;
1594         unsigned cur_ro = 0;
1595         unsigned n;
1596 
1597         for (n = 0; n < nreads; ++n) {
1598             unsigned const len = read_len[n];
1599             unsigned last = 0;
1600             unsigned prev = 0;
1601             unsigned j;
1602 
1603             for (j = 0; j < len; ++j, ++cur) {
1604                 if (has_ref_offset[cur] != 0) {
1605                     int const offset = ref_offset[cur_ro];
1606                     int const type = ref_offset_type[cur_ro];
1607 
1608                     ++cur_ro;
1609                     if (j > 0 && offset < 0 && type == 1) {
1610                         prev = last;
1611                         last = -offset;
1612                     }
1613                 }
1614             }
1615             result[n] = prev == 0 ? last : prev;
1616         }
1617     }
1618 
1619     return rc;
1620 }
1621 
1622 VTRANSFACT_IMPL(NCBI_align_get_right_soft_clip_5, 5, 0, 0 )
1623     (const void *Self, const VXfactInfo *info, VFuncDesc *rslt,
1624      const VFactoryParams *cp, const VFunctionParams *dp )
1625 {
1626     rslt->u.rf = right_soft_clip_5_impl;
1627     rslt->variant = vftRow;
1628     rslt -> self = NULL;
1629     rslt -> whack = NULL;
1630     return 0;
1631 }
1632 
1633 /*
1634  * function ascii NCBI:align:get_clipped_cigar #1 ( ascii cigar )
1635  */
1636 static
clipped_cigar_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1637 rc_t CC clipped_cigar_impl ( void *data, const VXformInfo *info, int64_t row_id,
1638                               VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1639 {
1640     rc_t rc;
1641     char const *cigar = argv[ 0 ].u.data.base;
1642     unsigned const ciglen = ( unsigned const )argv[ 0 ].u.data.elem_count;
1643     int n;
1644     unsigned start = 0;
1645     unsigned end = ciglen;
1646 
1647     assert( argv[ 0 ].u.data.elem_bits == sizeof( cigar[ 0 ] ) * 8 );
1648     cigar += argv[ 0 ].u.data.first_elem;
1649 
1650     for ( n = 0; n != ciglen; ++n )
1651     {
1652         if ( !isdigit( cigar[ n ] ) )
1653             break;
1654     }
1655 
1656     if ( cigar[ n ] == 'S' )
1657         start = n + 1;
1658 
1659     if ( cigar[ end - 1 ] == 'S' )
1660     {
1661         --end;
1662         while ( end > start && isdigit( cigar[ end - 1 ] ) )
1663             --end;
1664     }
1665 
1666     rslt->data->elem_bits = sizeof( cigar[ 0 ] ) * 8;
1667     rslt->elem_count = ( end > start ) ? end - start : 0;
1668     rc = KDataBufferResize( rslt->data, rslt->elem_count );
1669     if ( rc == 0 && rslt->elem_count > 0 )
1670         memmove( rslt->data->base, &cigar[ start ], ( size_t )rslt->elem_count );
1671 
1672     return rc;
1673 }
1674 
1675 
remove_left_soft_clip(int const length,char const cigar[])1676 static int remove_left_soft_clip(int const length, char const cigar[ /* length */ ])
1677 {
1678     int i;
1679     for (i = 0; i < length; ++i) {
1680         int const ch = cigar[i];
1681         if (ch < '0' || ch > '9')
1682             break;
1683     }
1684     if (i < length && cigar[i] == 'S')
1685         return i + 1;
1686     else
1687         return 0;
1688 }
1689 
remove_right_soft_clip(int const length,char const cigar[])1690 static int remove_right_soft_clip(int const length, char const cigar[ /* length */ ])
1691 {
1692     if (length > 0 && cigar[length - 1] == 'S') {
1693         int i = length - 1;
1694         while (i > 0) {
1695             int const ch = cigar[i - 1];
1696             if (ch < '0' || ch > '9')
1697                 break;
1698             --i;
1699         }
1700         return i;
1701     }
1702     return length;
1703 }
1704 
1705 static
clipped_cigar_impl_v2(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1706 rc_t CC clipped_cigar_impl_v2 ( void *data, const VXformInfo *info, int64_t row_id,
1707                                 VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1708 {
1709     VRowData const *const argCigar = &argv[0];
1710     VRowData const *const argCigLen = &argv[1];
1711     char const* cigar = argCigar->u.data.base;
1712     INSDC_coord_len const *cigLen = argCigLen->u.data.base;
1713     int const N = (int)argCigLen->u.data.elem_count;
1714     rc_t rc = 0;
1715 
1716     assert(argCigar->u.data.elem_bits == sizeof(cigar[0]) * 8);
1717     assert(argCigLen->u.data.elem_bits == sizeof(cigLen[0]) * 8);
1718 
1719     if (argCigar->u.data.elem_count == 0) {
1720         rslt->elem_count = 0;
1721         return KDataBufferResize(rslt->data, 0);
1722     }
1723 
1724     cigar += argCigar->u.data.first_elem;
1725     cigLen += argCigLen->u.data.first_elem;
1726 
1727     if (data != NULL)
1728     { /* outputting the lengths */
1729         rslt->data->elem_bits = sizeof(cigLen[0]) * 8;
1730         rslt->elem_count = N;
1731         rc = KDataBufferResize(rslt->data, rslt->elem_count);
1732         if (rc == 0) {
1733             INSDC_coord_len *const out = rslt->data->base;
1734             int i;
1735 
1736             for (i = 0; i < N; ++i) {
1737                 int const len = cigLen[i];
1738                 int const rlsc = remove_left_soft_clip(len, cigar);
1739                 int const rrsc = remove_right_soft_clip(len, cigar);
1740 
1741                 out[i] = (INSDC_coord_len)(rrsc > rlsc ? (rrsc - rlsc) : 0);
1742                 cigar += len;
1743             }
1744         }
1745     }
1746     else
1747     { /* outputting the strings */
1748         int i;
1749 
1750         rslt->data->elem_bits = sizeof(cigar[0]) * 8;
1751         rslt->elem_count = 0;
1752 
1753         for (i = 0; i < N; ++i) {
1754             int const len = cigLen[i];
1755             int const rlsc = remove_left_soft_clip(len, cigar);
1756             int const rrsc = remove_right_soft_clip(len, cigar);
1757             int const newLen = rrsc > rlsc ? (rrsc - rlsc) : 0;
1758             char *out;
1759 
1760             rc = KDataBufferResize(rslt->data, rslt->elem_count + newLen);
1761             if (rc) return rc;
1762             out = rslt->data->base;
1763             memmove(&out[rslt->elem_count], &cigar[rlsc], (size_t)newLen);
1764             rslt->elem_count += newLen;
1765             cigar += len;
1766         }
1767     }
1768     return rc;
1769 }
1770 
1771 
1772 VTRANSFACT_IMPL ( NCBI_align_get_clipped_cigar, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1773     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1774 {
1775     rslt->u.rf = clipped_cigar_impl;
1776     rslt->variant = vftRow;
1777     rslt -> self = NULL;
1778     rslt -> whack = NULL;
1779     return 0;
1780 }
1781 
1782 
1783 VTRANSFACT_IMPL ( NCBI_align_get_clipped_cigar_2, 2, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1784     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1785 {
1786     const VTypedesc* tp = &info->fdesc.desc;
1787 
1788     if ( tp->domain == vtdAscii && tp->intrinsic_bits == 8 )
1789     {
1790         rslt->self = NULL;
1791     }
1792     else if ( tp->domain == vtdUint && tp->intrinsic_bits == sizeof( INSDC_coord_len ) * 8 )
1793     {
1794         rslt->self = rslt; /* something different from NULL :) */
1795     }
1796     else
1797     {
1798         return RC( rcXF, rcFunction, rcConstructing, rcParam, rcIncorrect );
1799     }
1800     rslt->u.rf = clipped_cigar_impl_v2;
1801     rslt->variant = vftRow;
1802     rslt->whack = NULL;
1803     return 0;
1804 }
1805 
1806 
1807 
1808 /*
1809  * function I32 NCBI:align:get_clipped_ref_offset #1 (
1810  *     bool has_ref_offset, I32 ref_offset )
1811  */
1812 static
clipped_ref_offset_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1813 rc_t CC clipped_ref_offset_impl ( void *data, const VXformInfo *info, int64_t row_id,
1814     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1815 {
1816     rc_t rc;
1817     unsigned start = 0;
1818     unsigned const n_offsets = ( unsigned const )argv[ 1 ].u.data.elem_count;
1819     int32_t const *ref_offset = argv[ 1 ].u.data.base;
1820 
1821     assert( argv[ 1 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
1822     ref_offset += argv[ 1 ].u.data.first_elem;
1823 
1824     if ( n_offsets > 0 )
1825     {
1826         bool const *has_ref_offset = argv[ 0 ].u.data.base;
1827 
1828         assert( argv[ 0 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
1829         has_ref_offset += argv[ 0 ].u.data.first_elem;
1830 
1831         if ( has_ref_offset[ 0 ] && ref_offset[ 0 ] < 0 )
1832         {
1833             start = 1;
1834         }
1835     }
1836 
1837     rslt->data->elem_bits = sizeof( ref_offset[ 0 ] ) * 8;
1838     rslt->elem_count = n_offsets - start;
1839     rc = KDataBufferResize( rslt->data, rslt->elem_count );
1840     if ( rc == 0 )
1841         memmove( rslt->data->base,
1842                 &ref_offset[ start ],
1843                 ( size_t )( sizeof( ref_offset[ 0 ] ) * rslt->elem_count ) );
1844 
1845     return rc;
1846 }
1847 
1848 
1849 VTRANSFACT_IMPL ( NCBI_align_get_clipped_ref_offset, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1850     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1851 {
1852     rslt->u.rf = clipped_ref_offset_impl;
1853     rslt->variant = vftRow;
1854     rslt -> self = NULL;
1855     rslt -> whack = NULL;
1856     return 0;
1857 }
1858 
1859 
1860 /*
1861  * function INSDC:coord:len NCBI:align:get_ref_len #1 (
1862  *     bool has_ref_offset, I32 ref_offset, INSDC:coord:len right_clip )
1863  */
1864 static
get_ref_len_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1865 rc_t CC get_ref_len_impl ( void *data, const VXformInfo *info, int64_t row_id,
1866     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1867 {
1868     rc_t rc;
1869     INSDC_coord_len result,right=0;
1870 
1871     unsigned const read_len = ( unsigned const )argv[ 0 ].u.data.elem_count;
1872     unsigned const n_offsets = ( unsigned const )argv[ 1 ].u.data.elem_count;
1873     bool const *hro = argv[ 0 ].u.data.base;
1874     int32_t const *ref_offset = argv[ 1 ].u.data.base;
1875 
1876     assert( argv[ 1 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
1877 
1878     hro += argv[ 0 ].u.data.first_elem;
1879     ref_offset += argv[ 1 ].u.data.first_elem;
1880 
1881     if ( argc > 2 )
1882     {
1883         right = ( ( INSDC_coord_len* )argv[ 2 ].u.data.base )[ argv[ 2 ].u.data.first_elem ];
1884 
1885         assert( argv[ 2 ].u.data.elem_bits == sizeof( right ) * 8 );
1886         assert( read_len >= right );
1887     }
1888 
1889     if ( n_offsets == 0 )
1890     {
1891         if ( read_len < right )
1892             return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
1893         result = read_len - right;
1894     }
1895     else
1896     {
1897         int32_t  ires, rov;
1898         unsigned i;
1899 
1900         if ( argc > 2 ) /*** clipping ***/
1901         {
1902             for ( i = 0, ires = read_len - right; i < n_offsets; i++ )
1903             {
1904                 memmove( &rov, ref_offset + i, sizeof rov );
1905                 ires += rov;
1906             }
1907         }
1908         else
1909         {
1910             int32_t sum_pos, sum_neg;
1911             for ( i = 0, sum_pos = sum_neg = 0; i < n_offsets; i++ )
1912             {
1913                 memmove( &rov, ref_offset + i, sizeof rov );
1914                 if ( rov > 0 )
1915                     sum_pos += rov;
1916                 else
1917                     sum_neg += rov;
1918             }
1919 
1920             if ( sum_pos + sum_neg >= 0 ) /** all offsets may not over-shorten needed reference ***/
1921             {
1922                 ires = read_len + sum_pos + sum_neg;
1923             }
1924             else /** inefficient case - exact reach into the reference is needed **/
1925             {
1926                 unsigned j;
1927                 for ( i = 0, j = 0, sum_pos = 0, ires = 0; j < read_len; j++ )
1928                 {
1929                     if ( hro[ i ] )
1930                     {
1931                         if ( i >= n_offsets )
1932                             return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
1933                         memmove( &rov, ref_offset + i, sizeof rov );
1934                         ires += rov;
1935                         i++;
1936                     }
1937                     sum_pos ++;
1938                     if ( sum_pos > ires )
1939                         ires=sum_pos;
1940                 }
1941             }
1942         }
1943 
1944         if ( ires < 0 )
1945             return RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
1946         result = ires;
1947     }
1948 
1949     rslt->data->elem_bits = sizeof( result ) * 8;
1950     rslt->elem_count = 1;
1951     rc = KDataBufferResize( rslt->data, rslt->elem_count );
1952     if ( rc == 0 )
1953         memmove( rslt->data->base, &result, sizeof( result ) );
1954 
1955     return rc;
1956 }
1957 
1958 
1959 VTRANSFACT_IMPL ( NCBI_align_get_ref_len, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
1960     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
1961 {
1962     rslt->u.rf = get_ref_len_impl;
1963     rslt->variant = vftRow;
1964     rslt -> self = NULL;
1965     rslt -> whack = NULL;
1966     return 0;
1967 }
1968 
1969 
1970 /*
1971  * function INSDC:coord:len NCBI:align:get_ref_len #2 (
1972  *											bool has_ref_offset,
1973  *											I32 ref_offset)
1974  */
1975 static
get_ref_len_2_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])1976 rc_t CC get_ref_len_2_impl ( void *data, const VXformInfo *info, int64_t row_id,
1977                           VRowResult *rslt, uint32_t argc, const VRowData argv [] )
1978 {
1979     rc_t rc;
1980     ARG_ALIAS(int32_t, refOffset, 1);
1981     unsigned const n = (unsigned)argv[1].u.data.elem_count;
1982     int result = (unsigned)argv[0].u.data.elem_count;
1983     unsigned i;
1984 
1985     for (i = 0; i < n; ++i)
1986         result += (int)refOffset[i];
1987 
1988     rslt->data->elem_bits = sizeof(INSDC_coord_len) * 8;
1989     rc = KDataBufferResize(rslt->data, rslt->elem_count = 1);
1990     if (rc == 0)
1991         ((INSDC_coord_len *)rslt->data->base)[0] = result;
1992 
1993     return rc;
1994 }
1995 
1996 
1997 VTRANSFACT_IMPL(NCBI_align_get_ref_len_2, 2, 0, 0)(const void *Self, const VXfactInfo *info,
1998                      VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp)
1999 {
2000     rslt->u.rf = get_ref_len_2_impl;
2001     rslt->variant = vftRow;
2002     rslt -> self = NULL;
2003     rslt -> whack = NULL;
2004     return 0;
2005 }
2006 
2007 
2008 /*
2009  * function < type T > T NCBI:align:clip #1 ( T value,
2010  *     INSDC:coord:len left, INSDC:coord:len right )
2011  */
2012 static
clip_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])2013 rc_t CC clip_impl ( void *data, const VXformInfo *info, int64_t row_id,
2014     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
2015 {
2016     rc_t rc;
2017     INSDC_coord_len left  = ( ( INSDC_coord_len const * )argv[ 1 ].u.data.base )[ argv[ 1 ].u.data.first_elem ];
2018     INSDC_coord_len right = ( ( INSDC_coord_len const * )argv[ 2 ].u.data.base )[ argv[ 2 ].u.data.first_elem ];
2019 
2020     rslt->data->elem_bits = argv[ 0 ].u.data.elem_bits;
2021     if ( argv[ 0 ].u.data.elem_count >= left + right )
2022     {
2023         rslt->elem_count = argv[ 0 ].u.data.elem_count - left - right;
2024     }
2025     else
2026     {
2027         rslt->elem_count = 0;
2028     }
2029 
2030     rc = KDataBufferResize( rslt->data, rslt->elem_count );
2031     if ( rc == 0 )
2032     {
2033         if ( ( rslt->data->elem_bits & 7 ) == 0 )
2034         {
2035             memmove( rslt->data->base,
2036                     &( ( char const * )argv[ 0 ].u.data.base )[ ( ( left + argv[ 0 ].u.data.first_elem ) * argv[ 0 ].u.data.elem_bits ) >> 3 ],
2037                     ( size_t )( ( rslt->elem_count * rslt->data->elem_bits ) >> 3 ) );
2038         }
2039         else
2040         {
2041             bitcpy( rslt->data->base,
2042                     0,
2043                     argv[ 0 ].u.data.base,
2044                     ( left + argv[ 0 ].u.data.first_elem ) * argv[ 0 ].u.data.elem_bits,
2045                     rslt->elem_count * rslt->data->elem_bits );
2046         }
2047     }
2048     return rc;
2049 }
2050 
2051 
2052 VTRANSFACT_IMPL ( NCBI_align_clip, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
2053     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
2054 {
2055     rslt->u.rf = clip_impl;
2056     rslt->variant = vftRow;
2057     rslt -> self = NULL;
2058     rslt -> whack = NULL;
2059     return 0;
2060 }
2061 
2062 
2063 static
clip_impl_2(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])2064 rc_t CC clip_impl_2 ( void *data, const VXformInfo *info, int64_t row_id,
2065                       VRowResult *rslt, uint32_t argc, const VRowData argv [] )
2066 {
2067     rc_t rc;
2068     const INSDC_coord_len* read_len = argv[ 1 ].u.data.base;
2069     const uint32_t nreads = ( const uint32_t )argv[ 1 ].u.data.elem_count;
2070     const INSDC_coord_len* left =  argv[ 2 ].u.data.base;
2071     const INSDC_coord_len* right = argv[ 3 ].u.data.base;
2072     uint32_t n;
2073     INSDC_coord_zero start;
2074 
2075     assert( argv[ 1 ].u.data.elem_count == argv[ 2 ].u.data.elem_count );
2076     assert( argv[ 1 ].u.data.elem_count == argv[ 3 ].u.data.elem_count );
2077     assert( argv[ 1 ].u.data.elem_bits == sizeof( read_len[ 0 ] ) * 8 );
2078     assert( argv[ 2 ].u.data.elem_bits == sizeof( left[ 0 ] ) * 8 );
2079     assert( argv[ 2 ].u.data.elem_bits == sizeof( right[ 0 ] ) * 8 );
2080 
2081     read_len += argv[ 1 ].u.data.first_elem;
2082     left += argv[ 2 ].u.data.first_elem;
2083     right += argv[ 3 ].u.data.first_elem;
2084 
2085     rslt->data->elem_bits = argv[ 0 ].u.data.elem_bits;
2086     rslt->elem_count = 0;
2087     rc = KDataBufferResize( rslt->data, rslt->elem_count );
2088     if ( rc != 0 )
2089         return rc;
2090     if ( argv[ 0 ].u.data.elem_count == 0 )
2091         return 0;
2092 
2093     for ( n = start = 0; rc == 0 && n < nreads; start += read_len[ n++ ] )
2094     {
2095         uint64_t x = left[n] + right[n];
2096 
2097         if ( argv[ 0 ].u.data.elem_count < start ||
2098              argv[ 0 ].u.data.elem_count < start + read_len[ n ] )
2099         {
2100             rc = RC( rcXF, rcFunction, rcExecuting, rcData, rcInvalid );
2101         }
2102 /*
2103         assert(argv[0].u.data.elem_count >= start);
2104         assert(argv[0].u.data.elem_count >= start + read_len[n]);
2105 */
2106         else if ( read_len[ n ] > x )
2107         {
2108             x = read_len[ n ] - x;
2109             if ( x > 0 )
2110             {
2111                 rc = KDataBufferResize( rslt->data, rslt->elem_count + x );
2112                 if ( rc == 0 )
2113                 {
2114                     bitcpy( rslt->data->base,
2115                             rslt->elem_count * argv[ 0 ].u.data.elem_bits,
2116                             argv[ 0 ].u.data.base,
2117                             ( argv[ 0 ].u.data.first_elem + start + left[ n ] ) * argv[ 0 ].u.data.elem_bits,
2118                             x * rslt->data->elem_bits );
2119                     rslt->elem_count += x;
2120                 }
2121             }
2122         }
2123     }
2124 
2125     return rc;
2126 }
2127 
2128 
2129 VTRANSFACT_IMPL ( NCBI_align_clip_2, 2, 0, 0 ) ( const void *Self, const VXfactInfo *info,
2130     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
2131 {
2132     rslt->u.rf = clip_impl_2;
2133     rslt->variant = vftRow;
2134     rslt -> self = NULL;
2135     rslt -> whack = NULL;
2136     return 0;
2137 }
2138 
2139 
2140 /*
2141  * function bool NCBI:align:get_ref_mismatch #1
2142  *     ( bool has_mismatch, bool has_ref_offset, I32 ref_offset,
2143  *       INSDC:coord:len ref_len )
2144  */
2145 static
get_ref_mismatch_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])2146 rc_t CC get_ref_mismatch_impl ( void *data, const VXformInfo *info, int64_t row_id,
2147                    VRowResult *rslt, uint32_t argc, const VRowData argv [] )
2148 {
2149     rc_t rc;
2150     bool const *has_mismatch       = argv [ 0 ] . u . data . base;
2151     bool const *has_ref_offset     = argv [ 1 ] . u . data . base;
2152     int32_t const *ref_offset      = argv [ 2 ] . u . data . base;
2153     INSDC_coord_len const *ref_len = argv [ 3 ] . u . data . base;
2154 
2155     assert( argv[ 0 ].u.data.elem_bits == sizeof( has_mismatch[ 0 ] ) * 8 );
2156     assert( argv[ 1 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
2157     assert( argv[ 2 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
2158     assert( argv[ 3 ].u.data.elem_bits == sizeof( ref_len[ 0 ] ) * 8 );
2159 
2160     has_mismatch   += argv[ 0 ].u.data.first_elem;
2161     has_ref_offset += argv[ 1 ].u.data.first_elem;
2162     ref_offset     += argv[ 2 ].u.data.first_elem;
2163 
2164     rslt->data->elem_bits = sizeof( bool ) * 8;
2165     rslt->elem_count = ref_len[ argv[ 3 ].u.data.first_elem ];
2166     rc = KDataBufferResize( rslt->data, rslt->elem_count );
2167     if ( rc == 0 )
2168     {
2169         bool *result = ( bool* )rslt->data->base;
2170         unsigned j;
2171         unsigned ri;
2172         unsigned si;
2173 
2174         memset( result, 0, ( size_t )( sizeof( result[ 0 ] ) * rslt->elem_count ) );
2175         for ( j = ri = si = 0; si < argv[ 0 ].u.data.elem_count; )
2176         {
2177             if ( has_ref_offset[ si ] )
2178             {
2179                 int offset = ref_offset[ j++ ];
2180 
2181                 if ( offset > 0 )
2182                     ri += offset;
2183                 else
2184                 {
2185                     si -= offset;
2186                     continue;
2187                 }
2188             }
2189             if ( ri >= rslt->elem_count )
2190                 break;
2191             if ( has_mismatch[ si ] )
2192             {
2193                 result[ ri ] = 1;
2194             }
2195             ++si;
2196             ++ri;
2197         }
2198     }
2199 
2200     return rc;
2201 }
2202 
2203 
2204 VTRANSFACT_IMPL ( NCBI_align_get_ref_mismatch, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
2205     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
2206 {
2207     rslt->u.rf = get_ref_mismatch_impl;
2208     rslt->variant = vftRow;
2209     rslt -> self = NULL;
2210     rslt -> whack = NULL;
2211     return 0;
2212 }
2213 
2214 
2215 
2216 /*
2217  * function bool NCBI:align:get_ref_insert #1
2218  *     ( bool has_mismatch, bool has_ref_offset, I32 ref_offset,
2219  *       INSDC:coord:len ref_len )
2220  */
2221 static
get_ref_insert_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])2222 rc_t CC get_ref_insert_impl ( void *data, const VXformInfo *info, int64_t row_id,
2223                                VRowResult *rslt, uint32_t argc, const VRowData argv [] )
2224 {
2225     rc_t rc;
2226     bool const *has_mismatch       = argv [ 0 ] . u . data . base;
2227     bool const *has_ref_offset     = argv [ 1 ] . u . data . base;
2228     int32_t const *ref_offset      = argv [ 2 ] . u . data . base;
2229     INSDC_coord_len const *ref_len = argv [ 3 ] . u . data . base;
2230 
2231     assert( argv[ 0 ].u.data.elem_bits == sizeof( has_mismatch[ 0 ] ) * 8 );
2232     assert( argv[ 1 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
2233     assert( argv[ 2 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
2234     assert( argv[ 3 ].u.data.elem_bits == sizeof( ref_len[ 0 ] ) * 8 );
2235 
2236     has_mismatch   += argv[ 0 ].u.data.first_elem;
2237     has_ref_offset += argv[ 1 ].u.data.first_elem;
2238     ref_offset     += argv[ 2 ].u.data.first_elem;
2239 
2240     rslt->data->elem_bits = sizeof( bool ) * 8;
2241     rslt->elem_count = ref_len[ argv[ 3 ].u.data.first_elem ];
2242     rc = KDataBufferResize( rslt->data, rslt->elem_count );
2243     if (rc == 0 && rslt->elem_count > 0) {
2244         bool *result = ( bool* )rslt->data->base;
2245         unsigned j;
2246         unsigned ri;
2247         unsigned si;
2248 
2249         memset( result, 0, ( size_t )( sizeof( result[ 0 ] ) * rslt->elem_count ) );
2250         for ( j = ri = si = 0; si < argv[ 0 ].u.data.elem_count; )
2251         {
2252             if ( has_ref_offset[ si ] )
2253             {
2254                 int offset = ref_offset[ j++ ];
2255 
2256                 if ( offset > 0 )
2257                 {
2258                     ri += offset;
2259                 }
2260                 else
2261                 {
2262                     if ( si > 0 )
2263                     {
2264                         if ( ri >= 1 )
2265                             result[ ri - 1 ] = 1;
2266                         result[ ri ] = 1;
2267                     }
2268                     si -= offset;
2269                     continue;
2270                 }
2271             }
2272             ++si;
2273             ++ri;
2274         }
2275     }
2276     return rc;
2277 }
2278 
2279 
2280 VTRANSFACT_IMPL ( NCBI_align_get_ref_insert, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
2281     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
2282 {
2283     rslt->u.rf = get_ref_insert_impl;
2284     rslt->variant = vftRow;
2285     rslt -> self = NULL;
2286     rslt -> whack = NULL;
2287     return 0;
2288 }
2289 
2290 
2291 
2292 /*
2293  * function bool NCBI:align:get_ref_delete #1
2294  *     ( bool has_mismatch, bool has_ref_offset, I32 ref_offset,
2295  *       INSDC:coord:len ref_len )
2296  */
2297 static
get_ref_delete_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])2298 rc_t CC get_ref_delete_impl ( void *data, const VXformInfo *info, int64_t row_id,
2299     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
2300 {
2301     rc_t rc;
2302     bool const *has_mismatch       = argv [ 0 ] . u . data . base;
2303     bool const *has_ref_offset     = argv [ 1 ] . u . data . base;
2304     int32_t const *ref_offset      = argv [ 2 ] . u . data . base;
2305     INSDC_coord_len const *ref_len = argv [ 3 ] . u . data . base;
2306 
2307     assert( argv[ 0 ].u.data.elem_bits == sizeof( has_mismatch[ 0 ] ) * 8 );
2308     assert( argv[ 1 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
2309     assert( argv[ 2 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
2310     assert( argv[ 3 ].u.data.elem_bits == sizeof( ref_len[ 0 ] ) * 8 );
2311 
2312     has_mismatch   += argv[ 0 ].u.data.first_elem;
2313     has_ref_offset += argv[ 1 ].u.data.first_elem;
2314     ref_offset     += argv[ 2 ].u.data.first_elem;
2315 
2316     rslt->data->elem_bits = sizeof( bool ) * 8;
2317     rslt->elem_count = ref_len[ argv[ 3 ].u.data.first_elem ];
2318     rc = KDataBufferResize( rslt->data, rslt->elem_count );
2319     if ( rc == 0 )
2320     {
2321         bool *result = ( bool* )rslt->data->base;
2322         unsigned j;
2323         unsigned ri;
2324         unsigned si;
2325 
2326         memset( result, 0, ( size_t )( sizeof( result[ 0 ] ) * rslt->elem_count ) );
2327         for ( j = ri = si = 0; si < argv[ 0 ].u.data.elem_count; )
2328         {
2329             if ( has_ref_offset[ si ] )
2330             {
2331                 int offset = ref_offset[ j++ ];
2332 
2333                 if ( offset > 0 )
2334                 {
2335                     memset( &result[ ri ], 1, offset );
2336                     ri += offset;
2337                 }
2338                 else
2339                 {
2340                     si -= offset;
2341                     continue;
2342                 }
2343             }
2344             ++si;
2345             ++ri;
2346         }
2347     }
2348 
2349     return rc;
2350 }
2351 
2352 
2353 VTRANSFACT_IMPL ( NCBI_align_get_ref_delete, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
2354     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
2355 {
2356     rslt->u.rf = get_ref_delete_impl;
2357     rslt->variant = vftRow;
2358     rslt -> self = NULL;
2359     rslt -> whack = NULL;
2360     return 0;
2361 }
2362 
2363 
2364 
2365 #define USE_BIGGER_PRESERVE_BORDER 1
2366 /*
2367  * function bool NCBI:align:get_preserve_qual #1
2368  *     ( bool has_mismatch, bool has_ref_offset, I32 ref_offset,
2369  *       INSDC:coord:len ref_len )
2370  */
2371 static
get_ref_preserve_qual_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])2372 rc_t CC get_ref_preserve_qual_impl ( void *data, const VXformInfo *info, int64_t row_id,
2373     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
2374 {
2375     rc_t rc;
2376     bool const *has_mismatch       = argv [ 0 ] . u . data . base;
2377     bool const *has_ref_offset     = argv [ 1 ] . u . data . base;
2378     int32_t const *ref_offset      = argv [ 2 ] . u . data . base;
2379     INSDC_coord_len const *ref_len = argv [ 3 ] . u . data . base;
2380 
2381     assert( argv[ 0 ].u.data.elem_bits == sizeof( has_mismatch[ 0 ] ) * 8 );
2382     assert( argv[ 1 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
2383     assert( argv[ 2 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
2384     assert( argv[ 3 ].u.data.elem_bits == sizeof( ref_len[ 0 ] ) * 8 );
2385 
2386     has_mismatch   += argv[ 0 ].u.data.first_elem;
2387     has_ref_offset += argv[ 1 ].u.data.first_elem;
2388     ref_offset     += argv[ 2 ].u.data.first_elem;
2389 
2390     rslt->data->elem_bits = sizeof( bool ) * 8;
2391     rslt->elem_count = ref_len[ argv[ 3 ].u.data.first_elem ];
2392     rc = KDataBufferResize( rslt->data, rslt->elem_count );
2393     if ( rc == 0 )
2394     {
2395         bool *result = ( bool* )rslt->data->base;
2396         unsigned j;
2397         unsigned ri;
2398         unsigned si;
2399 
2400         memset( result, 0, ( size_t )( sizeof( result[ 0 ] ) * rslt->elem_count ) );
2401         for ( j = ri = si = 0; si < argv[ 0 ].u.data.elem_count; )
2402         {
2403             if ( has_ref_offset[ si ] )
2404             {
2405                 int offset = ref_offset[ j++ ];
2406 
2407                 if ( offset > 0 )
2408                 {
2409                     /* Preserve the qualities for deleted bases + plus the border */
2410 #if USE_BIGGER_PRESERVE_BORDER
2411                     if ( ri >= 2 ) result[ ri - 2 ] = 1;
2412 #endif
2413                     if ( ri >= 1 ) result[ ri - 1 ] = 1;
2414                     memset( &result[ ri ], 1, offset );
2415                     ri += offset;
2416                     result[ ri ] = 1;
2417 #if USE_BIGGER_PRESERVE_BORDER
2418                     if ( ri + 1 < rslt->elem_count ) result[ ri + 1 ] = 1;
2419 #endif
2420                 }
2421                 else
2422                 {
2423                     if ( si > 0 )
2424                     {
2425                         /* Preserve the qualites for the bases bordering the insert */
2426 #if USE_BIGGER_PRESERVE_BORDER
2427                         if ( ri >= 2 ) result[ ri - 2 ] = 1;
2428 #endif
2429                         if ( ri >= 1 ) result[ ri - 1 ] = 1;
2430                         result[ ri ] = 1;
2431 #if USE_BIGGER_PRESERVE_BORDER
2432                         if ( ri + 1 < rslt->elem_count ) result[ ri + 1 ] = 1;
2433 #endif
2434                     }
2435                     si -= offset;
2436                     continue;
2437                 }
2438             }
2439             if ( ri >= rslt->elem_count ) break;
2440             if ( has_mismatch[ si ] )
2441             {
2442 #if USE_BIGGER_PRESERVE_BORDER
2443                 if ( ri >= 1 ) result[ ri - 1 ] = 1;
2444 #endif
2445                 result[ ri ] = 1;
2446 #if USE_BIGGER_PRESERVE_BORDER
2447                 if ( ri + 1 < rslt->elem_count ) result[ ri + 1 ] = 1;
2448 #endif
2449             }
2450             ++si;
2451             ++ri;
2452         }
2453     }
2454     return rc;
2455 }
2456 
2457 
2458 VTRANSFACT_IMPL ( NCBI_align_get_ref_preserve_qual, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
2459     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
2460 {
2461     rslt->u.rf = get_ref_preserve_qual_impl;
2462     rslt->variant = vftRow;
2463     rslt -> self = NULL;
2464     rslt -> whack = NULL;
2465     return 0;
2466 }
2467 
2468 
2469 
2470 /*
2471  * function bool NCBI:align:get_seq_preserve_qual #1
2472  *    ( bool ref_preserve_qual, bool has_ref_offset, I32 ref_offset );
2473  */
2474 static
get_seq_preserve_qual_impl(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])2475 rc_t CC get_seq_preserve_qual_impl ( void *data, const VXformInfo *info, int64_t row_id,
2476                                     VRowResult *rslt, uint32_t argc, const VRowData argv [] )
2477 {
2478     rc_t rc;
2479     bool const *ref_pq             = argv [ 0 ] . u . data . base;
2480     bool const *has_ref_offset     = argv [ 1 ] . u . data . base;
2481     int32_t const *ref_offset      = argv [ 2 ] . u . data . base;
2482 
2483     assert( argv[ 0 ].u.data.elem_bits == sizeof( ref_pq[ 0 ] ) * 8 );
2484     assert( argv[ 1 ].u.data.elem_bits == sizeof( has_ref_offset[ 0 ] ) * 8 );
2485     assert( argv[ 2 ].u.data.elem_bits == sizeof( ref_offset[ 0 ] ) * 8 );
2486 
2487     ref_pq         += argv[ 0 ].u.data.first_elem;
2488     has_ref_offset += argv[ 1 ].u.data.first_elem;
2489     ref_offset     += argv[ 2 ].u.data.first_elem;
2490 
2491     rslt->data->elem_bits = sizeof( bool ) * 8;
2492     rslt->elem_count = argv[ 1 ].u.data.elem_count;
2493     rc = KDataBufferResize( rslt->data, rslt->elem_count );
2494     if ( rc == 0 )
2495     {
2496         bool *result = ( bool* )rslt->data->base;
2497         unsigned j;
2498         unsigned ri;
2499         unsigned si;
2500 
2501         memset( result, 1, ( size_t )( sizeof( result[ 0 ] ) * rslt->elem_count ) );
2502         for ( j = ri = si = 0; si < argv[ 1 ].u.data.elem_count; )
2503         {
2504             if ( has_ref_offset[ si ] )
2505             {
2506                 int offset = ref_offset[ j++ ];
2507 
2508                 if ( offset > 0 )
2509                 {
2510                     ri += offset;
2511                 }
2512                 else
2513                 {
2514                     si -= offset;
2515                     continue;
2516                 }
2517             }
2518             if ( ri >= argv[ 0 ].u.data.elem_count ) break;
2519             result[ si ] = ref_pq[ ri ];
2520             ++si;
2521             ++ri;
2522         }
2523     }
2524 
2525     return rc;
2526 }
2527 
2528 VTRANSFACT_IMPL ( NCBI_align_get_seq_preserve_qual, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
2529                                                                VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
2530 {
2531     rslt->u.rf = get_seq_preserve_qual_impl;
2532     rslt->variant = vftRow;
2533     rslt -> self = NULL;
2534     rslt -> whack = NULL;
2535     return 0;
2536 }
2537 
2538 
2539 
2540 /*
2541  * function ascii NCBI:align:rna_orientation #1 ( NCBI:align:ro_type *ref_offset_type );
2542  */
2543 static
get_rna_orientation(void * data,const VXformInfo * info,int64_t row_id,VRowResult * rslt,uint32_t argc,const VRowData argv[])2544 rc_t CC get_rna_orientation(void *data, const VXformInfo *info, int64_t row_id,
2545                         VRowResult *rslt, uint32_t argc, const VRowData argv [])
2546 {
2547     rc_t rc;
2548     uint8_t const *const offset_type = argc == 1 ? ((uint8_t const *)argv[0].u.data.base) + argv[0].u.data.first_elem : NULL;
2549     unsigned const count = argc == 1 ? (unsigned)argv[0].u.data.elem_count : 0;
2550 
2551     assert(argv[ 0 ].u.data.elem_bits == sizeof(offset_type[0]) * 8);
2552 
2553     rslt->data->elem_bits = sizeof(char) * 8;
2554     rslt->elem_count = 1;
2555     rc = KDataBufferResize(rslt->data, 1);
2556     if ( rc == 0 )
2557     {
2558         unsigned p_count = 0;
2559         unsigned m_count = 0;
2560     	unsigned i;
2561         char *orient = rslt->data->base;
2562 
2563         for (i = 0; i < count; ++i) {
2564             if (offset_type[i] == NCBI_align_ro_intron_plus)
2565                 ++p_count;
2566             else if (offset_type[i] == NCBI_align_ro_intron_minus)
2567                 ++m_count;
2568         }
2569         if (p_count > 0 && m_count == 0)
2570             *orient = '+';
2571         else if (m_count > 0 && p_count == 0)
2572             *orient = '-';
2573         else
2574             rslt->elem_count = 0;
2575     }
2576 
2577     return rc;
2578 }
2579 
2580 VTRANSFACT_IMPL ( NCBI_align_rna_orientation, 1, 0, 0 ) ( const void *Self, const VXfactInfo *info,
2581     VFuncDesc *rslt, const VFactoryParams *cp, const VFunctionParams *dp )
2582 {
2583     rslt->u.rf = get_rna_orientation;
2584     rslt->variant = vftRow;
2585     rslt -> self = NULL;
2586     rslt -> whack = NULL;
2587     return 0;
2588 }
2589