1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include "cg_tools.h"
28 #include "debug.h"
29 #include <klib/out.h>
30 
31 #include <klib/printf.h>
32 #include <sysalloc.h>
33 #include <ctype.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37 
38 
SetCigOp(CigOps * dst,char op,uint32_t oplen)39 static void SetCigOp( CigOps * dst, char op, uint32_t oplen )
40 {
41     dst->op    = op;
42     dst->oplen = oplen;
43     switch( op )
44     { /*MX= DN B IS PH*/
45 
46     case 'M' :
47     case 'X' :
48     case '=' :  dst->ref_sign = +1;
49                 dst->seq_sign = +1;
50                 break;
51 
52     case 'D' :
53     case 'N' :  dst->ref_sign = +1;
54                 dst->seq_sign = 0;
55                 break;
56 
57     case 'B' :  dst->ref_sign = -1;
58                 dst->seq_sign = 0;
59                 break;
60 
61     case 'S' :
62     case 'I' :  dst->ref_sign = 0;
63                 dst->seq_sign = +1;
64                 break;
65 
66     case 'P' :
67     case 'H' :
68     case 0   :  dst->ref_sign= 0;   /* terminating op */
69                 dst->seq_sign= 0;
70                 break;
71 
72     default :   assert( 0 );
73                 break;
74     }
75 }
76 
77 
ExplodeCIGAR(CigOps dst[],uint32_t len,char const cigar[],uint32_t ciglen)78 int32_t ExplodeCIGAR( CigOps dst[], uint32_t len, char const cigar[], uint32_t ciglen )
79 {
80     uint32_t i;
81     uint32_t j;
82     int32_t opLen;
83 
84     for ( i = j = opLen = 0; i < ciglen; ++i )
85     {
86         if ( isdigit( cigar[ i ] ) )
87         {
88             opLen = opLen * 10 + cigar[ i ] - '0';
89         }
90         else
91         {
92             assert( isalpha( cigar[ i ] ) );
93             SetCigOp( dst + j, cigar[ i ], opLen );
94             opLen = 0;
95             j++;
96         }
97     }
98     SetCigOp( dst + j, 0, 0 );
99     j++;
100     return j;
101 }
102 
103 
104 typedef struct cgOp_s
105 {
106     uint16_t length;
107     uint8_t type; /* 0: match, 1: insert, 2: delete */
108     char code;
109 } cgOp;
110 
111 
print_CG_cigar(int line,cgOp const op[],unsigned const ops,unsigned const gap[3])112 static void print_CG_cigar( int line, cgOp const op[], unsigned const ops, unsigned const gap[ 3 ] )
113 {
114 #if _DEBUGGING
115     unsigned i;
116 
117     SAM_DUMP_DBG( 3, ( "%5i: ", line ) );
118     for ( i = 0; i < ops; ++i )
119     {
120         if ( gap && ( i == gap[ 0 ] || i == gap[ 1 ] || i == gap[ 2 ] ) )
121         {
122             SAM_DUMP_DBG( 3, ( "%u%c", op[ i ].length, tolower( op[ i ].code ) ) );
123         }
124         else
125         {
126             SAM_DUMP_DBG( 3, ( "%u%c", op[ i ].length, toupper( op[ i ].code ) ) );
127         }
128     }
129     SAM_DUMP_DBG( 3, ( "\n" ) );
130 #endif
131 }
132 
133 
134 /* gap contains the indices of the wobbles in op
135  * gap[0] is between read 1 and 2; it is the 'B'
136  * gap[1] is between read 2 and 3; it is an 'N'
137  * gap[2] is between read 3 and 4; it is an 'N'
138  */
139 
140 typedef struct cg_cigar_temp
141 {
142     unsigned gap[ 3 ];
143     cgOp cigOp[ MAX_READ_LEN ];
144     unsigned opCnt;
145     unsigned S_adjust;
146     unsigned CG_adjust;
147     bool     has_ref_offset_type;
148 } cg_cigar_temp;
149 
150 
CIGAR_to_CG_Ops(const cg_cigar_input * input,cg_cigar_temp * tmp)151 static rc_t CIGAR_to_CG_Ops( const cg_cigar_input * input,
152                              cg_cigar_temp * tmp )
153 {
154     unsigned i;
155     unsigned ops = 0;
156     unsigned gapno;
157 
158     tmp->opCnt = 0;
159     tmp->S_adjust = 0;
160     tmp->CG_adjust = 0;
161     tmp->has_ref_offset_type = false;
162     for ( i = 0; i < input->p_cigar.len; ++ops )
163     {
164         char opChar = 0;
165         int opLen = 0;
166         int n;
167 
168         for ( n = 0; ( ( n + i ) < input->p_cigar.len ) && ( isdigit( input->p_cigar.ptr[ n + i ] ) ); n++ )
169         {
170             opLen = opLen * 10 + input->p_cigar.ptr[ n + i ] - '0';
171         }
172 
173         if ( ( n + i ) < input->p_cigar.len )
174         {
175             opChar = input->p_cigar.ptr[ n + i ];
176             n++;
177         }
178 
179         if ( ( ops + 1 ) >= MAX_READ_LEN )
180             return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
181 
182         i += n;
183 
184         tmp->cigOp[ ops ].length = opLen;
185         tmp->cigOp[ ops ].code = opChar;
186         switch ( opChar )
187         {
188             case 'M' :
189             case '=' :
190             case 'X' :  tmp->cigOp[ ops ].type = 0;
191                         break;
192 
193             case 'S' :  tmp->S_adjust += opLen;
194             case 'I' :  tmp->cigOp[ ops ].type = 1;
195                         tmp->cigOp[ ops ].code = 'I';
196                         break;
197 	    case 'N':   tmp->has_ref_offset_type = true;
198             case 'D':   tmp->cigOp[ ops ].type = 2;
199                         break;
200 
201             default :   return RC( rcExe, rcData, rcReading, rcConstraint, rcViolated );
202         }
203     }
204 
205     tmp->opCnt = ops;
206     tmp->gap[ 0 ] = tmp->gap[ 1 ] = tmp->gap[ 2 ] = ops;
207     print_CG_cigar( __LINE__, tmp->cigOp, ops, NULL );
208 
209 
210     if ( ops < 3 )
211         return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
212 
213     {
214         unsigned fwd = 0; /* 5 10 10 10 */
215         unsigned rev = 0; /* 10 10 10 5 */
216         unsigned acc; /* accumulated length */
217 
218         if ( ( input->seq_req_id == 1 && !input->orientation ) ||
219              ( input->seq_req_id == 2 && input->orientation ) )
220         {
221             for ( i = 0, acc = 0; i < ops  && acc <= 5; ++i )
222             {
223                 if ( tmp->cigOp[ i ].type != 2 )
224                 {
225                     acc += tmp->cigOp[ i ].length;
226                     if ( acc == 5 && tmp->cigOp[ i + 1 ].type == 1 )
227                     {
228                         fwd = i + 1;
229                         break;
230                     }
231                     else if ( acc > 5 )
232                     {
233                         unsigned const right = acc - 5;
234                         unsigned const left = tmp->cigOp[ i ].length - right;
235 
236                         memmove( &tmp->cigOp[ i + 2 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
237                         ops += 2;
238                         tmp->cigOp[ i ].length = left;
239                         tmp->cigOp[ i + 2 ].length = right;
240 
241                         tmp->cigOp[ i + 1 ].type = 1;
242                         tmp->cigOp[ i + 1 ].code = 'B';
243                         tmp->cigOp[ i + 1 ].length = 0;
244 
245                         fwd = i + 1;
246                         break;
247                     }
248                 }
249             }
250         }
251         else if ( ( input->seq_req_id == 2 && !input->orientation ) ||
252                   ( input->seq_req_id == 1 && input->orientation ) )
253         {
254             for ( i = ops, acc = 0; i > 0 && acc <= 5; )
255             {
256                 --i;
257                 if ( tmp->cigOp[ i ].type != 2 )
258                 {
259                     acc += tmp->cigOp[ i ].length;
260                     if ( acc == 5 && tmp->cigOp[ i ].type == 1 )
261                     {
262                         rev = i;
263                         break;
264                     }
265                     else if ( acc > 5 )
266                     {
267                         unsigned const left = acc - 5;
268                         unsigned const right = tmp->cigOp[ i ].length - left;
269 
270                         memmove( &tmp->cigOp[ i + 2 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
271                         ops += 2;
272                         tmp->cigOp[ i ].length = left;
273                         tmp->cigOp[ i + 2 ].length = right;
274 
275                         tmp->cigOp[ i + 1 ].type = 1;
276                         tmp->cigOp[ i + 1 ].code = 'B';
277                         tmp->cigOp[ i + 1 ].length = 0;
278 
279                         rev = i + 1;
280                         break;
281                     }
282                 }
283             }
284         }
285         else
286         {
287             /* fprintf(stderr, "guessing layout\n"); */
288             for ( i = 0, acc = 0; i < ops  && acc <= 5; ++i )
289             {
290                 if ( tmp->cigOp[ i ].type != 2 )
291                 {
292                     acc += tmp->cigOp[ i ].length;
293                     if ( acc == 5 && tmp->cigOp[ i + 1 ].type == 1 )
294                     {
295                         fwd = i + 1;
296                     }
297                 }
298             }
299             for ( i = ops, acc = 0; i > 0 && acc <= 5; )
300             {
301                 --i;
302                 if ( tmp->cigOp[ i ].type != 2 )
303                 {
304                     acc += tmp->cigOp[ i ].length;
305                     if ( acc == 5 && tmp->cigOp[i].type == 1 )
306                     {
307                         rev = i;
308                     }
309                 }
310             }
311             if ( !tmp->has_ref_offset_type && (( fwd == 0 && rev == 0 ) || ( fwd != 0 && rev != 0 ) ))
312             {
313                 for ( i = 0; i < ops; ++i )
314                 {
315                     if ( tmp->cigOp[ i ].type == 2 )
316                     {
317                         tmp->cigOp[ i ].code = 'N';
318                         tmp->CG_adjust += tmp->cigOp[ i ].length;
319                     }
320                 }
321                 return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
322             }
323         }
324         if ( fwd && tmp->cigOp[ fwd ].type == 1 )
325         {
326             for ( i = ops, acc = 0; i > fwd + 1; )
327             {
328                 --i;
329                 if ( tmp->cigOp[ i ].type != 2 )
330                 {
331                     acc += tmp->cigOp[ i ].length;
332                     if ( acc >= 10 )
333                     {
334                         if ( acc > 10 )
335                         {
336                             unsigned const r = 10 + tmp->cigOp[ i ].length - acc;
337                             unsigned const l = tmp->cigOp[ i ].length - r;
338 
339                             if ( ops + 2 >= MAX_READ_LEN )
340                                 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
341                             memmove( &tmp->cigOp[ i + 2 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
342                             ops += 2;
343                             tmp->cigOp[ i + 2 ].length = r;
344                             tmp->cigOp[ i ].length = l;
345 
346                             tmp->cigOp[ i + 1 ].length = 0;
347                             tmp->cigOp[ i + 1 ].type = 2;
348                             tmp->cigOp[ i + 1 ].code = 'N';
349                             i += 2;
350                         }
351                         else if ( i - 1 > fwd )
352                         {
353                             if ( tmp->cigOp[ i - 1 ].type == 2 )
354                                  tmp->cigOp[ i - 1 ].code = 'N';
355                             else
356                             {
357                                 if ( ops + 1 >= MAX_READ_LEN )
358                                     return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
359                                 memmove( &tmp->cigOp[ i + 1 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
360                                 ops += 1;
361                                 tmp->cigOp[ i ].length = 0;
362                                 tmp->cigOp[ i ].type = 2;
363                                 tmp->cigOp[ i ].code = 'N';
364                                 i += 1;
365                             }
366                         }
367                         acc = 0;
368                     }
369                 }
370             }
371             /** change I to B+M **/
372             tmp->cigOp[ fwd ].code = 'B';
373             memmove( &tmp->cigOp[ fwd + 1 ], &tmp->cigOp[ fwd ], ( ops - fwd ) * sizeof( tmp->cigOp[ 0 ] ) );
374             ops += 1;
375             tmp->cigOp[ fwd + 1 ].code = 'M';
376             tmp->opCnt = ops;
377             /** set the gaps now **/
378             for ( gapno = 3, i = ops; gapno > 1 && i > 0; )
379             {
380                 --i;
381                 if ( tmp->cigOp[ i ].code == 'N' )
382                     tmp->gap[ --gapno ] = i;
383             }
384             tmp->gap[ 0 ] = fwd;
385             print_CG_cigar( __LINE__, tmp->cigOp, ops, tmp->gap );
386             return 0;
387         }
388         if ( rev && tmp->cigOp[ rev ].type == 1 )
389         {
390             for ( acc = i = 0; i < rev; ++i )
391             {
392                 if ( tmp->cigOp[ i ].type != 2 )
393                 {
394                     acc += tmp->cigOp[ i ].length;
395                     if ( acc >= 10 )
396                     {
397                         if ( acc > 10 )
398                         {
399                             unsigned const l = 10 + tmp->cigOp[ i ].length - acc;
400                             unsigned const r = tmp->cigOp[ i ].length - l;
401 
402                             if ( ops + 2 >= MAX_READ_LEN )
403                                 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
404                             memmove( &tmp->cigOp[ i + 2 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
405                             ops += 2;
406                             tmp->cigOp[ i + 2 ].length = r;
407                             tmp->cigOp[ i ].length = l;
408 
409                             tmp->cigOp[ i + 1 ].length = 0;
410                             tmp->cigOp[ i + 1 ].type = 2;
411                             tmp->cigOp[ i + 1 ].code = 'N';
412                             rev += 2;
413                             i += 2;
414                         }
415                         else if ( i + 1 < rev )
416                         {
417                             if ( tmp->cigOp[ i + 1 ].type == 2 )
418                                  tmp->cigOp[ i + 1 ].code = 'N';
419                             else
420                             {
421                                 if ( ops + 1 >= MAX_READ_LEN )
422                                     return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
423                                 memmove( &tmp->cigOp[ i + 1 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
424                                 ops += 1;
425                                 tmp->cigOp[ i + 1 ].length = 0;
426                                 tmp->cigOp[ i + 1 ].type = 2;
427                                 tmp->cigOp[ i + 1 ].code = 'N';
428                                 rev += 1;
429                                 i += 1;
430                             }
431                         }
432                         acc = 0;
433                     }
434                 }
435             }
436             for ( gapno = 3, i = 0; gapno > 1 && i < ops; ++i )
437             {
438                 if ( tmp->cigOp[ i ].code == 'N' )
439                     tmp->gap[ --gapno ] = i;
440             }
441             tmp->gap[ 0 ] = rev;
442             tmp->cigOp[ rev ].code = 'B';
443             memmove( &tmp->cigOp[ rev + 1 ], &tmp->cigOp[ rev ], ( ops - rev ) * sizeof( tmp->cigOp[ 0 ] ) );
444             ops += 1;
445             tmp->cigOp[ rev + 1 ].code = 'M';
446             tmp->opCnt = ops;
447             print_CG_cigar( __LINE__, tmp->cigOp, ops, tmp->gap );
448             return 0;
449         }
450     }
451     return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
452 }
453 
454 
adjust_cigar(const cg_cigar_input * input,cg_cigar_temp * tmp,cg_cigar_output * output)455 static rc_t adjust_cigar( const cg_cigar_input * input, cg_cigar_temp * tmp, cg_cigar_output * output )
456 {
457     rc_t rc = 0;
458     unsigned i, j;
459 
460     print_CG_cigar( __LINE__, tmp->cigOp, tmp->opCnt, NULL );
461 
462     /* remove zero length ops */
463     for ( j = i = 0; i < tmp->opCnt; )
464     {
465         if ( tmp->cigOp[ j ].length == 0 )
466         {
467             ++j;
468             --(tmp->opCnt);
469             continue;
470         }
471         tmp->cigOp[ i++ ] = tmp->cigOp[ j++ ];
472     }
473 
474     print_CG_cigar( __LINE__, tmp->cigOp, tmp->opCnt, NULL );
475 
476     if ( input->edit_dist_available )
477     {
478         int const adjusted = input->edit_dist + tmp->S_adjust - tmp->CG_adjust;
479         output->edit_dist = adjusted > 0 ? adjusted : 0;
480         SAM_DUMP_DBG( 4, ( "NM: before: %u, after: %u(+%u-%u)\n", input->edit_dist, output->edit_dist, tmp->S_adjust, tmp->CG_adjust ) );
481     }
482     else
483     {
484         output->edit_dist = input->edit_dist;
485     }
486 
487     /* merge adjacent ops */
488     for ( i = tmp->opCnt; i > 1; )
489     {
490         --i;
491         if ( tmp->cigOp[ i - 1 ].code == tmp->cigOp[ i ].code )
492         {
493             tmp->cigOp[ i - 1 ].length += tmp->cigOp[ i ].length;
494             memmove( &tmp->cigOp[ i ], &tmp->cigOp[ i + 1 ], ( tmp->opCnt - 1 - i ) * sizeof( tmp->cigOp[ 0 ] ) );
495             --(tmp->opCnt);
496         }
497     }
498     print_CG_cigar( __LINE__, tmp->cigOp, tmp->opCnt, NULL );
499     for ( i = j = 0; i < tmp->opCnt && rc == 0; ++i )
500     {
501         size_t sz;
502         rc = string_printf( &output->cigar[ j ], sizeof( output->cigar ) - j, &sz, "%u%c", tmp->cigOp[ i ].length, tmp->cigOp[ i ].code );
503         j += sz;
504     }
505     output->cigar_len = j;
506     return rc;
507 }
508 
509 
make_cg_cigar(const cg_cigar_input * input,cg_cigar_output * output)510 rc_t make_cg_cigar( const cg_cigar_input * input, cg_cigar_output * output )
511 {
512     rc_t rc;
513     cg_cigar_temp tmp;
514     memset( &tmp, 0, sizeof tmp );
515 
516     rc = CIGAR_to_CG_Ops( input, &tmp );
517     if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcFormat )
518     {
519         memmove( &output->cigar[ 0 ], &input->p_cigar.ptr[ 0 ], input->p_cigar.len );
520         output->cigar_len = input->p_cigar.len;
521         output->cigar[ output->cigar_len ] = 0;
522         output->edit_dist = input->edit_dist;
523         output->p_cigar.ptr = output->cigar;
524         output->p_cigar.len = output->cigar_len;
525         rc = 0;
526     }
527     else if ( rc == 0 )
528     {
529         if ( tmp.CG_adjust == 0 && !tmp.has_ref_offset_type)
530         {
531             if ( tmp.gap[ 0 ] < tmp.opCnt )
532                 tmp.CG_adjust = tmp.cigOp[ tmp.gap[ 0 ] ].length;
533 
534             if ( tmp.gap[ 1 ] < tmp.opCnt )
535                 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 1 ] ].length;
536 
537             if ( tmp.gap[ 2 ] < tmp.opCnt )
538                 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 2 ] ].length;
539         }
540 
541         rc = adjust_cigar( input, &tmp, output );
542     }
543     return rc;
544 }
545 
546 
merge_M_type_ops(char a,char b)547 static char merge_M_type_ops( char a, char b )
548 { /*MX=*/
549     char c = 0;
550     switch( b )
551     {
552         case 'X' :  switch( a )
553                     {
554                         case '=' : c = 'X'; break;
555                         case 'X' : c = 'M'; break; /**we don't know - 2X may create '=' **/
556                         case 'M' : c = 'M'; break;
557                     }
558                     break;
559         case 'M' :  c = 'M'; break;
560         case '=' :  c = a; break;
561     }
562     assert( c != 0 );
563     return c;
564 }
565 
566 
fmt_cigar_elem(char * dst,uint32_t cig_oplen,char cig_op)567 static size_t fmt_cigar_elem( char * dst, uint32_t cig_oplen, char cig_op )
568 {
569     size_t num_writ;
570     rc_t rc = string_printf ( dst, 11, &num_writ, "%d%c", cig_oplen, cig_op );
571     assert ( rc == 0 && num_writ > 1 );
572     return num_writ;
573 }
574 
CombineCIGAR(char dst[],CigOps const seqOp[],uint32_t seq_len,uint32_t refPos,CigOps const refOp[],uint32_t ref_len)575 uint32_t CombineCIGAR( char dst[], CigOps const seqOp[], uint32_t seq_len,
576                        uint32_t refPos, CigOps const refOp[], uint32_t ref_len )
577 {
578     bool done = false;
579     uint32_t ciglen = 0, last_ciglen = 0, last_cig_oplen = 0;
580     int32_t si = 0, ri = 0;
581     char last_cig_op = '0'; /* never used operation, forces a mismatch in MACRO_BUILD_CIGAR */
582     CigOps seq_cop = { 0, 0, 0, 0 }, ref_cop = { 0, 0, 0, 0 };
583     int32_t seq_pos = 0;        /** seq_pos is tracked roughly - with every extraction from seqOp **/
584     int32_t ref_pos = 0;        /** ref_pos is tracked precisely - with every delta and consumption in cigar **/
585     int32_t delta = refPos;     /*** delta in relative positions of seq and ref **/
586                                 /*** when delta < 0 - rewind or extend the reference ***/
587                                 /*** wher delta > 0 - skip reference  ***/
588 #define MACRO_BUILD_CIGAR(OP,OPLEN) \
589 	if( last_cig_oplen > 0 && last_cig_op == OP){							\
590                 last_cig_oplen += OPLEN;								\
591                 ciglen = last_ciglen + fmt_cigar_elem( dst + last_ciglen, last_cig_oplen,last_cig_op );	\
592         } else {											\
593                 last_ciglen = ciglen;									\
594                 last_cig_oplen = OPLEN;									\
595                 last_cig_op    = OP;									\
596                 ciglen = ciglen      + fmt_cigar_elem( dst + last_ciglen, last_cig_oplen, last_cig_op );	\
597         }
598     while( !done )
599     {
600         while ( delta < 0 )
601         {
602             ref_pos += delta; /** we will make it to back up this way **/
603             if( ri > 0 )
604             { /** backing up on ref if possible ***/
605                 int avail_oplen = refOp[ ri - 1 ].oplen - ref_cop.oplen;
606                 if ( avail_oplen > 0 )
607                 {
608                     if ( ( -delta ) <= avail_oplen * ref_cop.ref_sign )
609                     { /*** rewind within last operation **/
610                         ref_cop.oplen -= delta;
611                         delta -= delta * ref_cop.ref_sign;
612                     }
613                     else
614                     { /*** rewind the whole ***/
615                         ref_cop.oplen += avail_oplen;
616                         delta += avail_oplen * ref_cop.ref_sign;
617                     }
618                 }
619                 else
620                 {
621                     ri--;
622                     /** pick the previous as used up **/
623                     ref_cop = refOp[ri-1];
624                     ref_cop.oplen =0;
625                 }
626             }
627             else
628             { /** extending the reference **/
629                 SetCigOp( &ref_cop, '=', ref_cop.oplen - delta );
630                 delta = 0;
631             }
632             ref_pos -= delta;
633         }
634         if ( ref_cop.oplen == 0 )
635         { /*** advance the reference ***/
636             ref_cop = refOp[ri++];
637             if ( ref_cop.oplen == 0 )
638             { /** extending beyond the reference **/
639                 SetCigOp( &ref_cop,'=', 1000 );
640             }
641             assert( ref_cop.oplen > 0 );
642         }
643         if ( delta > 0 )
644         { /***  skip refOps ***/
645             ref_pos += delta; /** may need to back up **/
646             if ( delta >=  ref_cop.oplen )
647             { /** full **/
648                 delta -= ref_cop.oplen * ref_cop.ref_sign;
649                 ref_cop.oplen = 0;
650             }
651             else
652             { /** partial **/
653                 ref_cop.oplen -= delta;
654                 delta -= delta * ref_cop.ref_sign;
655             }
656             ref_pos -= delta; /** if something left - restore ***/
657             continue;
658         }
659 
660         /*** seq and ref should be synchronized here **/
661         assert( delta == 0 );
662         if ( seq_cop.oplen == 0 )
663         { /*** advance sequence ***/
664             if ( seq_pos < seq_len )
665             {
666                 seq_cop = seqOp[ si++ ];
667                 assert( seq_cop.oplen > 0 );
668                 seq_pos += seq_cop.oplen * seq_cop.seq_sign;
669             }
670             else
671             {
672                 done=true;
673             }
674         }
675 
676         if( !done )
677         {
678             int seq_seq_step = seq_cop.oplen * seq_cop.seq_sign; /** sequence movement**/
679             int seq_ref_step = seq_cop.oplen * seq_cop.ref_sign; /** influence of sequence movement on intermediate reference **/
680             int ref_seq_step = ref_cop.oplen * ref_cop.seq_sign; /** movement of the intermediate reference ***/
681             int ref_ref_step = ref_cop.oplen * ref_cop.ref_sign; /** influence of the intermediate reference movement on final reference ***/
682             assert( ref_ref_step >= 0 ); /** no B in the reference **/
683             if ( seq_ref_step <= 0 )
684             { /** BSIPH in the sequence against anything ***/
685                 MACRO_BUILD_CIGAR( seq_cop.op, seq_cop.oplen );
686                 seq_cop.oplen = 0;
687                 delta = seq_ref_step; /** if negative - will force rewind next cycle **/
688             }
689             else if ( ref_ref_step <= 0 )
690             { /** MX=DN against SIPH in the reference***/
691                 if ( ref_seq_step == 0 )
692                 { /** MX=DN against PH **/
693                     MACRO_BUILD_CIGAR( ref_cop.op,ref_cop.oplen);
694                     ref_cop.oplen = 0;
695                 }
696                 else
697                 {
698                     int min_len = ( seq_cop.oplen < ref_cop.oplen ) ? seq_cop.oplen : ref_cop.oplen;
699                     if( seq_seq_step == 0 )
700                     { /** DN agains SI **/
701                         MACRO_BUILD_CIGAR( 'P', min_len );
702                     }
703                     else
704                     { /** MX= agains SI ***/
705                         MACRO_BUILD_CIGAR( ref_cop.op,min_len );
706                     }
707                     seq_cop.oplen -= min_len;
708                     ref_cop.oplen -= min_len;
709                 }
710             }
711             else
712             {
713                 /*MX=DN  against MX=DN*/
714                 int min_len = ( seq_cop.oplen < ref_cop.oplen ) ? seq_cop.oplen : ref_cop.oplen;
715                 if ( seq_seq_step == 0 )
716                 { /* DN against MX=DN */
717                     if ( ref_seq_step == 0 )
718                     { /** padding DN against DN **/
719                         MACRO_BUILD_CIGAR( 'P', min_len );
720                         ref_cop.oplen -= min_len;
721                         seq_cop.oplen -= min_len;
722                     }
723                     else
724                     { /* DN against MX= **/
725                         MACRO_BUILD_CIGAR( seq_cop.op, min_len );
726                         seq_cop.oplen -= min_len;
727                     }
728                 }
729                 else if ( ref_cop.seq_sign == 0 )
730                 { /* MX= against DN - always wins */
731                     MACRO_BUILD_CIGAR( ref_cop.op, min_len );
732                     ref_cop.oplen -= min_len;
733                 }
734                 else
735                 { /** MX= against MX= ***/
736                     char curr_op = merge_M_type_ops( seq_cop.op, ref_cop.op );
737                     /* or otherwise merge_M_type_ops() will be called twice by the macro! */
738                     MACRO_BUILD_CIGAR( curr_op, min_len );
739                     ref_cop.oplen -= min_len;
740                     seq_cop.oplen -= min_len;
741                 }
742                 ref_pos += min_len;
743             }
744         }
745     }
746     return ciglen;
747 }
748 
749 
750 typedef struct cg_merger
751 {
752     char newSeq[ MAX_READ_LEN ];
753     char newQual[ MAX_READ_LEN ];
754     char tags[ MAX_CG_CIGAR_LEN * 2 ];
755 } cg_merger;
756 
757 
merge_cg_cigar(const cg_cigar_input * input,cg_cigar_output * output)758 rc_t merge_cg_cigar( const cg_cigar_input * input, cg_cigar_output * output )
759 {
760     rc_t rc;
761     cg_cigar_temp tmp;
762     memset( &tmp, 0, sizeof tmp );
763 
764     rc = CIGAR_to_CG_Ops( input, &tmp );
765     if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcFormat )
766     {
767         memmove( &output->cigar[ 0 ], &input->p_cigar.ptr[ 0 ], input->p_cigar.len );
768         output->cigar_len = input->p_cigar.len;
769         output->cigar[ output->cigar_len ] = 0;
770         output->edit_dist = input->edit_dist;
771         rc = 0;
772     }
773     else if ( rc == 0 )
774     {
775 
776         if ( tmp.CG_adjust == 0  && !tmp.has_ref_offset_type )
777         {
778             if ( tmp.gap[ 0 ] < tmp.opCnt )
779                 tmp.CG_adjust = tmp.cigOp[ tmp.gap[ 0 ] ].length;
780 
781             if ( tmp.gap[ 1 ] < tmp.opCnt )
782                 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 1 ] ].length;
783 
784             if ( tmp.gap[ 2 ] < tmp.opCnt )
785                 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 2 ] ].length;
786         }
787 
788         rc = adjust_cigar( input, &tmp, output );
789 
790     }
791     return rc;
792 }
793 
794 
make_cg_merge(const cg_cigar_input * input,cg_cigar_output * output)795 rc_t make_cg_merge( const cg_cigar_input * input, cg_cigar_output * output )
796 {
797     rc_t rc;
798     cg_cigar_temp tmp;
799     memset( &tmp, 0, sizeof tmp );
800 
801     rc = CIGAR_to_CG_Ops( input, &tmp );
802     if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcFormat )
803     {
804         memmove( &output->cigar[ 0 ], &input->p_cigar.ptr[ 0 ], input->p_cigar.len );
805         output->cigar_len = input->p_cigar.len;
806         output->cigar[ output->cigar_len ] = 0;
807         output->edit_dist = input->edit_dist;
808         rc = 0;
809     }
810     else if ( rc == 0 )
811     {
812 
813         if ( tmp.CG_adjust == 0  && !tmp.has_ref_offset_type )
814         {
815             if ( tmp.gap[ 0 ] < tmp.opCnt )
816                 tmp.CG_adjust = tmp.cigOp[ tmp.gap[ 0 ] ].length;
817 
818             if ( tmp.gap[ 1 ] < tmp.opCnt )
819                 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 1 ] ].length;
820 
821             if ( tmp.gap[ 2 ] < tmp.opCnt )
822                 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 2 ] ].length;
823         }
824     }
825 
826     if ( rc == 0 )
827     {
828         uint32_t const B_len = tmp.cigOp[ tmp.gap[ 0 ] ].length;
829         uint32_t const B_at = tmp.gap[ 0 ] < tmp.gap[ 2 ] ? 5 : 30;
830 
831         if ( 0 < B_len && B_len < 5 )
832         {
833             memmove( output->newSeq,  input->p_read.ptr, MAX_READ_LEN );
834             memmove( output->newQual, input->p_quality.ptr, MAX_READ_LEN );
835 
836             output->p_read.ptr = output->newSeq;
837             output->p_read.len = ( MAX_READ_LEN - B_len );
838 
839             output->p_quality.ptr = output->newQual;
840             output->p_quality.len = ( MAX_READ_LEN - B_len );
841 
842             output->p_tags.ptr = output->tags;
843             output->p_tags.len = 0;
844 
845             /* nBnM -> nB0M */
846             tmp.cigOp[ tmp.gap[ 0 ] + 1 ].length -= B_len;
847             if ( tmp.gap[ 0 ] < tmp.gap[ 2 ] )
848             {
849                 size_t written;
850                 rc = string_printf( output->tags, sizeof( output->tags ), &written,
851                                     "GC:Z:%uS%uG%uS\tGS:Z:%.*s\tGQ:Z:%.*s",
852                                     5 - B_len, B_len, 30 - B_len, 2 * B_len, &output->newSeq[ 5 - B_len ], 2 * B_len, &output->newQual[ 5 - B_len ] );
853                 if ( rc == 0 )
854                     output->p_tags.len = written;
855                 memmove( &tmp.cigOp[ tmp.gap[ 0 ] ],
856                          &tmp.cigOp[ tmp.gap[ 0 ] + 1 ],
857                          ( tmp.opCnt - ( tmp.gap[ 0 ] + 1 ) ) * sizeof( tmp.cigOp[ 0 ] ) );
858                 --tmp.opCnt;
859             }
860             else
861             {
862                 size_t written;
863                 rc = string_printf( output->tags, sizeof( output->tags ), &written,
864                                     "GC:Z:%uS%uG%uS\tGS:Z:%.*s\tGQ:Z:%.*s",
865                                     30 - B_len, B_len, 5 - B_len, 2 * B_len, &output->newSeq[ 30 - B_len ], 2 * B_len, &output->newQual[ 30 - B_len ] );
866                 if ( rc == 0 )
867                     output->p_tags.len = written;
868                 memmove( &tmp.cigOp[ tmp.gap[ 0 ] ],
869                          &tmp.cigOp[ tmp.gap[ 0 ] + 1 ],
870                          ( tmp.opCnt - ( tmp.gap[ 0 ] + 1 ) ) * sizeof( tmp.cigOp[ 0 ] ) );
871                 --tmp.opCnt;
872             }
873             if ( rc == 0 )
874             {
875                 uint32_t i;
876                 for ( i = B_at; i < B_at + B_len; ++i )
877                 {
878                     uint32_t const Lq = output->newQual[ i - B_len ];
879                     uint32_t const Rq = output->newQual[ i ];
880 
881                     if ( Lq <= Rq )
882                     {
883                         output->newSeq[ i - B_len ] = output->newSeq[ i ];
884                         output->newQual[ i - B_len ] = Rq;
885                     }
886                     else
887                     {
888                         output->newSeq[ i ] = output->newSeq[ i - B_len ];
889                         output->newQual[ i ] = Lq;
890                     }
891                 }
892                 memmove( &output->newSeq [ B_at ], &output->newSeq [ B_at + B_len ], MAX_READ_LEN - B_at - B_len );
893                 memmove( &output->newQual[ B_at ], &output->newQual[ B_at + B_len ], MAX_READ_LEN - B_at - B_len );
894             }
895         }
896         else
897         {
898             uint32_t i, len = tmp.cigOp[ tmp.gap[ 0 ] ].length;
899 
900             tmp.cigOp[ tmp.gap[ 0 ] ].code = 'I';
901             for ( i = tmp.gap[ 0 ] + 1; i < tmp.opCnt && len > 0; ++i )
902             {
903                 if ( tmp.cigOp[ i ].length <= len )
904                 {
905                     len -= tmp.cigOp[ i ].length;
906                     tmp.cigOp[ i ].length = 0;
907                 }
908                 else
909                 {
910                     tmp.cigOp[ i ].length -= len;
911                     len = 0;
912                 }
913             }
914             tmp.CG_adjust -= tmp.cigOp[ tmp.gap[ 0 ] ].length;
915         }
916     }
917 
918     if ( rc == 0 )
919         rc = adjust_cigar( input, &tmp, output );
920 
921     return rc;
922 }
923 
924 
925 /* A-->0x00, C-->0x01 G-->0x02 T-->0x03 */
926 
927 static const uint8_t ASCII_to_2na[] = {
928 /*      00  01  02  03  04  05  06  07  08  09  0A  0B  0C  0D  0E  0F  */
929         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
930 
931 /*      10  11  12  13  14  15  16  17  18  19  1A  1B  1C  1D  1E  1F  */
932         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
933 
934 /*      20  21  22  23  24  25  26  27  28  29  2A  2B  2C  2D  2E  2F  */
935         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
936 
937 /*      30  31  32  33  34  35  36  37  38  39  3A  3B  3C  3D  3E  3F  */
938         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
939 
940 /*      40  41  42  43  44  45  46  47  48  49  4A  4B  4C  4D  4E  4F  */
941         0,  0,  0,  1,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,
942 
943 /*      50  51  52  53  54  55  56  57  58  59  5A  5B  5C  5D  5E  5F  */
944         0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
945 
946 /*      60  61  62  63  64  65  66  67  68  69  6A  6B  6C  6D  6E  6F  */
947         0,  0,  0,  1,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  0,
948 
949 /*      70  71  72  73  74  75  76  77  78  79  7A  7B  7C  7D  7E  7F  */
950         0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
951 };
952 
compress_4_bases_to_byte(uint8_t * bases)953 static uint8_t compress_4_bases_to_byte( uint8_t * bases )
954 {
955     uint8_t res = ASCII_to_2na[ bases[ 0 ] & 0x7F ];
956     res <<= 2;
957     res |= ASCII_to_2na[ bases[ 1 ] & 0x7F ];
958     res <<= 2;
959     res |= ASCII_to_2na[ bases[ 2 ] & 0x7F ];
960     res <<= 2;
961     res |= ASCII_to_2na[ bases[ 3 ] & 0x7F ];
962     return res;
963 }
964 
965 
966 static const uint8_t compressed_to_fwd_reverse_0[] = {
967 
968     /* 0000.0000    0x00 ... AAAA */ ( RNA_SPLICE_FWD | 0x20 ),
969     /* 0000.0001    0x01 ... AAAC */ ( RNA_SPLICE_FWD | 0x10 ),
970     /* 0000.0010    0x02 ... AAAG */ ( RNA_SPLICE_FWD | 0x20 ),
971     /* 0000.0011    0x03 ... AAAT */ ( RNA_SPLICE_FWD | 0x20 ),
972     /* 0000.0100    0x04 ... AACA */ RNA_SPLICE_UNKNOWN,
973     /* 0000.0101    0x05 ... AACC */ ( RNA_SPLICE_FWD | 0x20 ),
974     /* 0000.0110    0x06 ... AACG */ RNA_SPLICE_UNKNOWN,
975     /* 0000.0111    0x07 ... AACT */ RNA_SPLICE_UNKNOWN,
976     /* 0000.1000    0x08 ... AAGA */ RNA_SPLICE_UNKNOWN,
977     /* 0000.1001    0x09 ... AAGC */ ( RNA_SPLICE_FWD | 0x20 ),
978     /* 0000.1010    0x0A ... AAGG */ RNA_SPLICE_UNKNOWN,
979     /* 0000.1011    0x0B ... AAGT */ RNA_SPLICE_UNKNOWN,
980     /* 0000.1100    0x0C ... AATA */ RNA_SPLICE_UNKNOWN,
981     /* 0000.1101    0x0D ... AATC */ ( RNA_SPLICE_FWD | 0x20 ),
982     /* 0000.1110    0x0E ... AATG */ RNA_SPLICE_UNKNOWN,
983     /* 0000.1111    0x0F ... AATT */ RNA_SPLICE_UNKNOWN,
984 
985     /* 0001.0000    0x10 ... ACAA */ ( RNA_SPLICE_FWD | 0x20 ),
986     /* 0001.0001    0x11 ... ACAC */ ( RNA_SPLICE_FWD | 0x10 ),
987     /* 0001.0010    0x12 ... ACAG */ ( RNA_SPLICE_FWD | 0x20 ),
988     /* 0001.0011    0x13 ... ACAT */ ( RNA_SPLICE_FWD | 0x20 ),
989     /* 0001.0100    0x14 ... ACCA */ RNA_SPLICE_UNKNOWN,
990     /* 0001.0101    0x15 ... ACCC */ ( RNA_SPLICE_FWD | 0x20 ),
991     /* 0001.0110    0x16 ... ACCG */ RNA_SPLICE_UNKNOWN,
992     /* 0001.0111    0x17 ... ACCT */ RNA_SPLICE_UNKNOWN,
993     /* 0001.1000    0x18 ... ACGA */ RNA_SPLICE_UNKNOWN,
994     /* 0001.1001    0x19 ... ACGC */ ( RNA_SPLICE_FWD | 0x20 ),
995     /* 0001.1010    0x1A ... ACGG */ RNA_SPLICE_UNKNOWN,
996     /* 0001.1011    0x1B ... ACGT */ RNA_SPLICE_UNKNOWN,
997     /* 0001.1100    0x1C ... ACTA */ RNA_SPLICE_UNKNOWN,
998     /* 0001.1101    0x1D ... ACTC */ ( RNA_SPLICE_FWD | 0x20 ),
999     /* 0001.1110    0x1E ... ACTG */ RNA_SPLICE_UNKNOWN,
1000     /* 0001.1111    0x1F ... ACTT */ RNA_SPLICE_UNKNOWN,
1001 
1002     /* 0010.0000    0x20 ... AGAA */ ( RNA_SPLICE_FWD | 0x20 ),
1003     /* 0010.0001    0x21 ... AGAC */ ( RNA_SPLICE_FWD | 0x10 ),
1004     /* 0010.0010    0x22 ... AGAG */ ( RNA_SPLICE_FWD | 0x20 ),
1005     /* 0010.0011    0x23 ... AGAT */ ( RNA_SPLICE_FWD | 0x20 ),
1006     /* 0010.0100    0x24 ... AGCA */ RNA_SPLICE_UNKNOWN,
1007     /* 0010.0101    0x25 ... AGCC */ ( RNA_SPLICE_FWD | 0x20 ),
1008     /* 0010.0110    0x26 ... AGCG */ RNA_SPLICE_UNKNOWN,
1009     /* 0010.0111    0x27 ... AGCT */ RNA_SPLICE_UNKNOWN,
1010     /* 0010.1000    0x28 ... AGGA */ RNA_SPLICE_UNKNOWN,
1011     /* 0010.1001    0x29 ... AGGC */ ( RNA_SPLICE_FWD | 0x20 ),
1012     /* 0010.1010    0x2A ... AGGG */ RNA_SPLICE_UNKNOWN,
1013     /* 0010.1011    0x2B ... AGGT */ RNA_SPLICE_UNKNOWN,
1014     /* 0010.1100    0x2C ... AGTA */ RNA_SPLICE_UNKNOWN,
1015     /* 0010.1101    0x2D ... AGTC */ ( RNA_SPLICE_FWD | 0x20 ),
1016     /* 0010.1110    0x2E ... AGTG */ RNA_SPLICE_UNKNOWN,
1017     /* 0010.1111    0x2F ... AGTT */ RNA_SPLICE_UNKNOWN,
1018 
1019     /* 0011.0000    0x30 ... ATAA */ ( RNA_SPLICE_FWD | 0x10 ),
1020     /* 0011.0001    0x31 ... ATAC */ RNA_SPLICE_FWD,            /* MINOR forward */
1021     /* 0011.0010    0x32 ... ATAG */ ( RNA_SPLICE_FWD | 0x10 ),
1022     /* 0011.0011    0x33 ... ATAT */ ( RNA_SPLICE_FWD | 0x10 ),
1023     /* 0011.0100    0x34 ... ATCA */ RNA_SPLICE_UNKNOWN,
1024     /* 0011.0101    0x35 ... ATCC */ ( RNA_SPLICE_FWD | 0x10 ),
1025     /* 0011.0110    0x36 ... ATCG */ ( RNA_SPLICE_FWD | 0x20 ),
1026     /* 0011.0111    0x37 ... ATCT */ ( RNA_SPLICE_REV | 0x20 ),
1027     /* 0011.1000    0x38 ... ATGA */ RNA_SPLICE_UNKNOWN,
1028     /* 0011.1001    0x39 ... ATGC */ ( RNA_SPLICE_FWD | 0x10 ),
1029     /* 0011.1010    0x3A ... ATGG */ ( RNA_SPLICE_FWD | 0x20 ),
1030     /* 0011.1011    0x3B ... ATGT */ ( RNA_SPLICE_REV | 0x20 ),
1031     /* 0011.1100    0x3C ... ATTA */ RNA_SPLICE_UNKNOWN,
1032     /* 0011.1101    0x3D ... ATTC */ ( RNA_SPLICE_FWD | 0x10 ),
1033     /* 0011.1110    0x3E ... ATTG */ ( RNA_SPLICE_FWD | 0x20 ),
1034     /* 0011.1111    0x3F ... ATTT */ ( RNA_SPLICE_REV | 0x20 ),
1035 
1036     /* 0100.0000    0x40 ... CAAA */ ( RNA_SPLICE_REV | 0x20 ),
1037     /* 0100.0001    0x41 ... CAAC */ ( RNA_SPLICE_REV | 0x10 ),
1038     /* 0100.0010    0x42 ... CAAG */ ( RNA_SPLICE_REV | 0x20 ),
1039     /* 0100.0011    0x43 ... CAAT */ ( RNA_SPLICE_REV | 0x20 ),
1040     /* 0100.0100    0x44 ... CACA */ RNA_SPLICE_UNKNOWN,
1041     /* 0100.0101    0x45 ... CACC */ ( RNA_SPLICE_REV | 0x20 ),
1042     /* 0100.0110    0x46 ... CACG */ RNA_SPLICE_UNKNOWN,
1043     /* 0100.0111    0x47 ... CACT */ RNA_SPLICE_UNKNOWN,
1044     /* 0100.1000    0x48 ... CAGA */ RNA_SPLICE_UNKNOWN,
1045     /* 0100.1001    0x49 ... CAGC */ ( RNA_SPLICE_REV | 0x20 ),
1046     /* 0100.1010    0x4A ... CAGG */ RNA_SPLICE_UNKNOWN,
1047     /* 0100.1011    0x4B ... CAGT */ RNA_SPLICE_UNKNOWN,
1048     /* 0100.1100    0x4C ... CATA */ RNA_SPLICE_UNKNOWN,
1049     /* 0100.1101    0x4D ... CATC */ ( RNA_SPLICE_REV | 0x20 ),
1050     /* 0100.1110    0x4E ... CATG */ RNA_SPLICE_UNKNOWN,
1051     /* 0100.1111    0x4F ... CATT */ RNA_SPLICE_UNKNOWN,
1052 
1053     /* 0101.0000    0x50 ... CCAA */ ( RNA_SPLICE_REV | 0x20 ),
1054     /* 0101.0001    0x51 ... CCAC */ ( RNA_SPLICE_REV | 0x10 ),
1055     /* 0101.0010    0x52 ... CCAG */ ( RNA_SPLICE_REV | 0x20 ),
1056     /* 0101.0011    0x53 ... CCAT */ ( RNA_SPLICE_REV | 0x20 ),
1057     /* 0101.0100    0x54 ... CCCA */ RNA_SPLICE_UNKNOWN,
1058     /* 0101.0101    0x55 ... CCCC */ ( RNA_SPLICE_REV | 0x20 ),
1059     /* 0101.0110    0x56 ... CCCG */ RNA_SPLICE_UNKNOWN,
1060     /* 0101.0111    0x57 ... CCCT */ RNA_SPLICE_UNKNOWN,
1061     /* 0101.1000    0x58 ... CCGA */ RNA_SPLICE_UNKNOWN,
1062     /* 0101.1001    0x59 ... CCGC */ ( RNA_SPLICE_REV | 0x20 ),
1063     /* 0101.1010    0x5A ... CCGG */ RNA_SPLICE_UNKNOWN,
1064     /* 0101.1011    0x5B ... CCGT */ RNA_SPLICE_UNKNOWN,
1065     /* 0101.1100    0x5C ... CCTA */ RNA_SPLICE_UNKNOWN,
1066     /* 0101.1101    0x5D ... CCTC */ ( RNA_SPLICE_REV | 0x20 ),
1067     /* 0101.1110    0x5E ... CCTG */ RNA_SPLICE_UNKNOWN,
1068     /* 0101.1111    0x5F ... CCTT */ RNA_SPLICE_UNKNOWN,
1069 
1070     /* 0110.0000    0x60 ... CGAA */ ( RNA_SPLICE_REV | 0x20 ),
1071     /* 0110.0001    0x61 ... CGAC */ ( RNA_SPLICE_REV | 0x10 ),
1072     /* 0110.0010    0x62 ... CGAG */ ( RNA_SPLICE_REV | 0x20 ),
1073     /* 0110.0011    0x63 ... CGAT */ ( RNA_SPLICE_REV | 0x20 ),
1074     /* 0110.0100    0x64 ... CGCA */ RNA_SPLICE_UNKNOWN,
1075     /* 0110.0101    0x65 ... CGCC */ ( RNA_SPLICE_REV | 0x20 ),
1076     /* 0110.0110    0x66 ... CGCG */ RNA_SPLICE_UNKNOWN,
1077     /* 0110.0111    0x67 ... CGCT */ RNA_SPLICE_UNKNOWN,
1078     /* 0110.1000    0x68 ... CGGA */ RNA_SPLICE_UNKNOWN,
1079     /* 0110.1001    0x69 ... CGGC */ ( RNA_SPLICE_REV | 0x20 ),
1080     /* 0110.1010    0x6A ... CGGG */ RNA_SPLICE_UNKNOWN,
1081     /* 0110.1011    0x6B ... CGGT */ RNA_SPLICE_UNKNOWN,
1082     /* 0110.1100    0x6C ... CGTA */ RNA_SPLICE_UNKNOWN,
1083     /* 0110.1101    0x6D ... CGTC */ ( RNA_SPLICE_REV | 0x20 ),
1084     /* 0110.1110    0x6E ... CGTG */ RNA_SPLICE_UNKNOWN,
1085     /* 0110.1111    0x6F ... CGTT */ RNA_SPLICE_UNKNOWN,
1086 
1087     /* 0111.0000    0x70 ... CTAA */ ( RNA_SPLICE_REV | 0x10 ),
1088     /* 0111.0001    0x71 ... CTAC */ RNA_SPLICE_REV,            /* MAJOR reverse */
1089     /* 0111.0010    0x72 ... CTAG */ ( RNA_SPLICE_FWD | 0x10 ),
1090     /* 0111.0011    0x73 ... CTAT */ ( RNA_SPLICE_REV | 0x10 ),
1091     /* 0111.0100    0x74 ... CTCA */ RNA_SPLICE_UNKNOWN,
1092     /* 0111.0101    0x75 ... CTCC */ ( RNA_SPLICE_REV | 0x10 ),
1093     /* 0111.0110    0x76 ... CTCG */ ( RNA_SPLICE_FWD | 0x20 ),
1094     /* 0111.0111    0x77 ... CTCT */ ( RNA_SPLICE_REV | 0x20 ),
1095     /* 0111.1000    0x78 ... CTGA */ RNA_SPLICE_UNKNOWN,
1096     /* 0111.1001    0x79 ... CTGC */ ( RNA_SPLICE_REV | 0x10 ),
1097     /* 0111.1010    0x7A ... CTGG */ ( RNA_SPLICE_FWD | 0x20 ),
1098     /* 0111.1011    0x7B ... CTGT */ ( RNA_SPLICE_REV | 0x20 ),
1099     /* 0111.1100    0x7C ... CTTA */ RNA_SPLICE_UNKNOWN,
1100     /* 0111.1101    0x7D ... CTTC */ ( RNA_SPLICE_REV | 0x10 ),
1101     /* 0111.1110    0x7E ... CTTG */ ( RNA_SPLICE_FWD | 0x20 ),
1102     /* 0111.1111    0x7F ... CTTT */ ( RNA_SPLICE_REV | 0x20 ),
1103 
1104     /* 1000.0000    0x80 ... GAAA */ ( RNA_SPLICE_FWD | 0x20 ),
1105     /* 1000.0001    0x81 ... GAAC */ ( RNA_SPLICE_FWD | 0x20 ),
1106     /* 1000.0010    0x82 ... GAAG */ ( RNA_SPLICE_FWD | 0x10 ),
1107     /* 1000.0011    0x83 ... GAAT */ ( RNA_SPLICE_REV | 0x10 ),
1108     /* 1000.0100    0x84 ... GACA */ RNA_SPLICE_UNKNOWN,
1109     /* 1000.0101    0x85 ... GACC */ RNA_SPLICE_UNKNOWN,
1110     /* 1000.0110    0x86 ... GACG */ ( RNA_SPLICE_FWD | 0x20 ),
1111     /* 1000.0111    0x87 ... GACT */ ( RNA_SPLICE_REV | 0x20 ),
1112     /* 1000.1000    0x88 ... GAGA */ RNA_SPLICE_UNKNOWN,
1113     /* 1000.1001    0x89 ... GAGC */ RNA_SPLICE_UNKNOWN,
1114     /* 1000.1010    0x8A ... GAGG */ ( RNA_SPLICE_FWD | 0x20 ),
1115     /* 1000.1011    0x8B ... GAGT */ ( RNA_SPLICE_REV | 0x20 ),
1116     /* 1000.1100    0x8C ... GATA */ RNA_SPLICE_UNKNOWN,
1117     /* 1000.1101    0x8D ... GATC */ RNA_SPLICE_UNKNOWN,
1118     /* 1000.1110    0x8E ... GATG */ ( RNA_SPLICE_FWD | 0x20 ),
1119     /* 1000.1111    0x8F ... GATT */ ( RNA_SPLICE_REV | 0x20 ),
1120 
1121     /* 1001.0000    0x90 ... GCAA */ ( RNA_SPLICE_FWD | 0x20 ),
1122     /* 1001.0001    0x91 ... GCAC */ ( RNA_SPLICE_FWD | 0x20 ),
1123     /* 1001.0010    0x92 ... GCAG */ ( RNA_SPLICE_FWD | 0x10 ),
1124     /* 1001.0011    0x93 ... GCAT */ ( RNA_SPLICE_REV | 0x10 ),
1125     /* 1001.0100    0x94 ... GCCA */ RNA_SPLICE_UNKNOWN,
1126     /* 1001.0101    0x95 ... GCCC */ RNA_SPLICE_UNKNOWN,
1127     /* 1001.0110    0x96 ... GCCG */ ( RNA_SPLICE_FWD | 0x20 ),
1128     /* 1001.0111    0x97 ... GCCT */ ( RNA_SPLICE_REV | 0x20 ),
1129     /* 1001.1000    0x98 ... GCGA */ RNA_SPLICE_UNKNOWN,
1130     /* 1001.1001    0x99 ... GCGC */ RNA_SPLICE_UNKNOWN,
1131     /* 1001.1010    0x9A ... GCGG */ ( RNA_SPLICE_FWD | 0x20 ),
1132     /* 1001.1011    0x9B ... GCGT */ ( RNA_SPLICE_REV | 0x20 ),
1133     /* 1001.1100    0x9C ... GCTA */ RNA_SPLICE_UNKNOWN,
1134     /* 1001.1101    0x9D ... GCTC */ RNA_SPLICE_UNKNOWN,
1135     /* 1001.1110    0x9E ... GCTG */ ( RNA_SPLICE_FWD | 0x20 ),
1136     /* 1001.1111    0x9F ... GCTT */ ( RNA_SPLICE_REV | 0x20 ),
1137 
1138     /* 1010.0000    0xA0 ... GGAA */ ( RNA_SPLICE_FWD | 0x20 ),
1139     /* 1010.0001    0xA1 ... GGAC */ ( RNA_SPLICE_FWD | 0x20 ),
1140     /* 1010.0010    0xA2 ... GGAG */ ( RNA_SPLICE_FWD | 0x10 ),
1141     /* 1010.0011    0xA3 ... GGAT */ ( RNA_SPLICE_REV | 0x10 ),
1142     /* 1010.0100    0xA4 ... GGCA */ RNA_SPLICE_UNKNOWN,
1143     /* 1010.0101    0xA5 ... GGCC */ RNA_SPLICE_UNKNOWN,
1144     /* 1010.0110    0xA6 ... GGCG */ ( RNA_SPLICE_FWD | 0x20 ),
1145     /* 1010.0111    0xA7 ... GGCT */ ( RNA_SPLICE_REV | 0x20 ),
1146     /* 1010.1000    0xA8 ... GGGA */ RNA_SPLICE_UNKNOWN,
1147     /* 1010.1001    0xA9 ... GGGC */ RNA_SPLICE_UNKNOWN,
1148     /* 1010.1010    0xAA ... GGGG */ ( RNA_SPLICE_FWD | 0x20 ),
1149     /* 1010.1011    0xAB ... GGGT */ ( RNA_SPLICE_REV | 0x20 ),
1150     /* 1010.1100    0xAC ... GGTA */ RNA_SPLICE_UNKNOWN,
1151     /* 1010.1101    0xAD ... GGTC */ RNA_SPLICE_UNKNOWN,
1152     /* 1010.1110    0xAE ... GGTG */ ( RNA_SPLICE_FWD | 0x20 ),
1153     /* 1010.1111    0xAF ... GGTT */ ( RNA_SPLICE_REV | 0x20 ),
1154 
1155     /* 1011.0000    0xB0 ... GTAA */ ( RNA_SPLICE_FWD | 0x10 ),
1156     /* 1011.0001    0xB1 ... GTAC */ ( RNA_SPLICE_FWD | 0x10 ),
1157     /* 1011.0010    0xB2 ... GTAG */ RNA_SPLICE_FWD,            /* MAJOR forward */
1158     /* 1011.0011    0xB3 ... GTAT */ RNA_SPLICE_REV,            /* MINOR reverse */
1159     /* 1011.0100    0xB4 ... GTCA */ RNA_SPLICE_UNKNOWN,
1160     /* 1011.0101    0xB5 ... GTCC */ ( RNA_SPLICE_REV | 0x20 ),
1161     /* 1011.0110    0xB6 ... GTCG */ ( RNA_SPLICE_FWD | 0x10 ),
1162     /* 1011.0111    0xB7 ... GTCT */ ( RNA_SPLICE_REV | 0x10 ),
1163     /* 1011.1000    0xB8 ... GTGA */ RNA_SPLICE_UNKNOWN,
1164     /* 1011.1001    0xB9 ... GTGC */ ( RNA_SPLICE_REV | 0x20 ),
1165     /* 1011.1010    0xBA ... GTGG */ ( RNA_SPLICE_FWD | 0x10 ),
1166     /* 1011.1011    0xBB ... GTGT */ ( RNA_SPLICE_REV | 0x10 ),
1167     /* 1011.1100    0xBC ... GTTA */ RNA_SPLICE_UNKNOWN,
1168     /* 1011.1101    0xBD ... GTTC */ ( RNA_SPLICE_REV | 0x20 ),
1169     /* 1011.1110    0xBE ... GTTG */ ( RNA_SPLICE_FWD | 0x10 ),
1170     /* 1011.1111    0xBF ... GTTT */ ( RNA_SPLICE_REV | 0x10 ),
1171 
1172     /* 1100.0000    0xC0 ... TAAA */ RNA_SPLICE_UNKNOWN,
1173     /* 1100.0001    0xC1 ... TAAC */ RNA_SPLICE_UNKNOWN,
1174     /* 1100.0010    0xC2 ... TAAG */ RNA_SPLICE_UNKNOWN,
1175     /* 1100.0011    0xC3 ... TAAT */ RNA_SPLICE_UNKNOWN,
1176     /* 1100.0100    0xC4 ... TACA */ RNA_SPLICE_UNKNOWN,
1177     /* 1100.0101    0xC5 ... TACC */ RNA_SPLICE_UNKNOWN,
1178     /* 1100.0110    0xC6 ... TACG */ RNA_SPLICE_UNKNOWN,
1179     /* 1100.0111    0xC7 ... TACT */ RNA_SPLICE_UNKNOWN,
1180     /* 1100.1000    0xC8 ... TAGA */ RNA_SPLICE_UNKNOWN,
1181     /* 1100.1001    0xC9 ... TAGC */ RNA_SPLICE_UNKNOWN,
1182     /* 1100.1010    0xCA ... TAGG */ RNA_SPLICE_UNKNOWN,
1183     /* 1100.1011    0xCB ... TAGT */ RNA_SPLICE_UNKNOWN,
1184     /* 1100.1100    0xCC ... TATA */ RNA_SPLICE_UNKNOWN,
1185     /* 1100.1101    0xCD ... TATC */ RNA_SPLICE_UNKNOWN,
1186     /* 1100.1110    0xCE ... TATG */ RNA_SPLICE_UNKNOWN,
1187     /* 1100.1111    0xCF ... TATT */ RNA_SPLICE_UNKNOWN,
1188 
1189     /* 1101.0000    0xE0 ... TCAA */ RNA_SPLICE_UNKNOWN,
1190     /* 1101.0001    0xE1 ... TCAC */ RNA_SPLICE_UNKNOWN,
1191     /* 1101.0010    0xE2 ... TCAG */ RNA_SPLICE_UNKNOWN,
1192     /* 1101.0011    0xE3 ... TCAT */ RNA_SPLICE_UNKNOWN,
1193     /* 1101.0100    0xE4 ... TCCA */ RNA_SPLICE_UNKNOWN,
1194     /* 1101.0101    0xE5 ... TCCC */ RNA_SPLICE_UNKNOWN,
1195     /* 1101.0110    0xE6 ... TCCG */ RNA_SPLICE_UNKNOWN,
1196     /* 1101.0111    0xE7 ... TCCT */ RNA_SPLICE_UNKNOWN,
1197     /* 1101.1000    0xE8 ... TCGA */ RNA_SPLICE_UNKNOWN,
1198     /* 1101.1001    0xE9 ... TCGC */ RNA_SPLICE_UNKNOWN,
1199     /* 1101.1010    0xEA ... TCGG */ RNA_SPLICE_UNKNOWN,
1200     /* 1101.1011    0xEB ... TCGT */ RNA_SPLICE_UNKNOWN,
1201     /* 1101.1100    0xEC ... TCTA */ RNA_SPLICE_UNKNOWN,
1202     /* 1101.1101    0xED ... TCTC */ RNA_SPLICE_UNKNOWN,
1203     /* 1101.1110    0xEE ... TCTG */ RNA_SPLICE_UNKNOWN,
1204     /* 1101.1111    0xEF ... TCTT */ RNA_SPLICE_UNKNOWN,
1205 
1206     /* 1111.0000    0xF0 ... TTAA */ ( RNA_SPLICE_FWD | 0x20 ),
1207     /* 1111.0001    0xF1 ... TTAC */ ( RNA_SPLICE_REV | 0x10 ),
1208     /* 1111.0010    0xF2 ... TTAG */ ( RNA_SPLICE_FWD | 0x10 ),
1209     /* 1111.0011    0xF3 ... TTAT */ ( RNA_SPLICE_REV | 0x10 ),
1210     /* 1111.0100    0xF4 ... TTCA */ RNA_SPLICE_UNKNOWN,
1211     /* 1111.0101    0xF5 ... TTCC */ ( RNA_SPLICE_REV | 0x20 ),
1212     /* 1111.0110    0xF6 ... TTCG */ ( RNA_SPLICE_FWD | 0x20 ),
1213     /* 1111.0111    0xF7 ... TTCT */ ( RNA_SPLICE_REV | 0x20 ),
1214     /* 1111.1000    0xF8 ... TTGA */ RNA_SPLICE_UNKNOWN,
1215     /* 1111.1001    0xF9 ... TTGC */ ( RNA_SPLICE_REV | 0x20 ),
1216     /* 1111.1010    0xFA ... TTGG */ ( RNA_SPLICE_FWD | 0x20 ),
1217     /* 1111.1011    0xFB ... TTGT */ ( RNA_SPLICE_REV | 0x20 ),
1218     /* 1111.1100    0xFC ... TTTA */ RNA_SPLICE_UNKNOWN,
1219     /* 1111.1101    0xFD ... TTTC */ ( RNA_SPLICE_REV | 0x20 ),
1220     /* 1111.1110    0xFE ... TTTG */ ( RNA_SPLICE_FWD | 0x20 ),
1221     /* 1111.1111    0xFF ... TTTT */ ( RNA_SPLICE_REV | 0x20 )
1222 };
1223 
1224 
1225 /*************************************************************
1226     RNA-splice detector:
1227 
1228     base  0   1   .......  n-2  n-1     direction
1229 
1230           G   T             A    G      forward
1231           A   T             A    C      forward
1232 
1233           C   T             A    C      reverse
1234           G   T             A    T      reverse
1235 
1236 
1237     =========================================================
1238 
1239     zero mismatches ( aka full matches ) :
1240 
1241     ATAC    ... MINOR   [0x31]=RNA_SPLICE_FWD
1242     CTAC    ... MAJOR   [0x71]=RNA_SPLICE_REV
1243     GTAG    ... MAJOR   [0xB2]=RNA_SPLICE_FWD
1244     GTAT    ... MINOR   [0xB3]=RNA_SPLICE_REV
1245 
1246     =========================================================
1247 
1248     one mismatch:
1249 
1250     ATAC    ... MINOR, forward
1251 
1252         *
1253         CTAC ( is also MAJOR reverse ... )
1254         GTAC ( is alow MAJOR reverse, 1 mismatch )
1255         TTAC ( is alow MAJOR reverse, 1 mismatch )
1256 
1257          *
1258         AAAC [0x01]=( RNA_SPLICE_FWD | 0x10 )
1259         ACAC [0x11]=( RNA_SPLICE_FWD | 0x10 )
1260         AGAC [0x21]=( RNA_SPLICE_FWD | 0x10 )
1261 
1262           *
1263         ATCC [0x35]=( RNA_SPLICE_FWD | 0x10 )
1264         ATGC [0x39]=( RNA_SPLICE_FWD | 0x10 )
1265         ATTC [0x3D]=( RNA_SPLICE_FWD | 0x10 )
1266 
1267            *
1268         ATAA [0x30]=( RNA_SPLICE_FWD | 0x10 )
1269         ATAG ( is also MAJOR forward, 1 mismatch )
1270         ATAT [0x33]=( RNA_SPLICE_FWD | 0x10 )
1271 
1272     -----------------------------------------------------------
1273     CTAC    ... MAJOR, reverse
1274 
1275         *
1276         ATAC ( is also MINOR forward, full match )
1277         GTAC ( is also MAJOR forward, 1 mismatch )
1278         TTAC [0xF1]=( RNA_SPLICE_REV | 0x10 )
1279 
1280          *
1281         CAAC [0x41]=( RNA_SPLICE_REV | 0x10 )
1282         CCAC [0x51]=( RNA_SPLICE_REV | 0x10 )
1283         CGAC [0x61]=( RNA_SPLICE_REV | 0x10 )
1284 
1285           *
1286         CTCC [0x75]=( RNA_SPLICE_REV | 0x10 )
1287         CTGC [0x79]=( RNA_SPLICE_REV | 0x10 )
1288         CTTC [0x7D]=( RNA_SPLICE_REV | 0x10 )
1289 
1290            *
1291         CTAA [0x70]=( RNA_SPLICE_REV | 0x10 )
1292         CTAG ( is also MAJOR forward, 1 mismatch )
1293         CTAT [0x73]=( RNA_SPLICE_REV | 0x10 )
1294 
1295     -----------------------------------------------------------
1296     GTAG    ... MAJOR, forward
1297 
1298         *
1299         ATAG [0x32]=( RNA_SPLICE_FWD | 0x10 )
1300         CTAG [0x72]=( RNA_SPLICE_FWD | 0x10 )
1301         TTAG [0xF2]=( RNA_SPLICE_FWD | 0x10 )
1302 
1303          *
1304         GAAG [0x82]=( RNA_SPLICE_FWD | 0x10 )
1305         GCAG [0x92]=( RNA_SPLICE_FWD | 0x10 )
1306         GGAG [0xA2]=( RNA_SPLICE_FWD | 0x10 )
1307 
1308           *
1309         GTCG [0xB6]=( RNA_SPLICE_FWD | 0x10 )
1310         GTGG [0xBA]=( RNA_SPLICE_FWD | 0x10 )
1311         GTTG [0xBE]=( RNA_SPLICE_FWD | 0x10 )
1312 
1313            *
1314         GTAA [0xB0]=( RNA_SPLICE_FWD | 0x10 )
1315         GTAC [0xB1]=( RNA_SPLICE_FWD | 0x10 )
1316         GTAT ( is also MINOR reverse, full match )
1317 
1318     -----------------------------------------------------------
1319     GTAT    ... MINOR, reverse
1320 
1321         *
1322         ATAT ( is also MINOR forward, 1 mismatch )
1323         CTAT ( is also MAJOR reverse, 1 mismatch )
1324         TTAT [0xF3]=( RNA_SPLICE_REV | 0x10 )
1325 
1326          *
1327         GAAT [0x83]=( RNA_SPLICE_REV | 0x10 )
1328         GCAT [0x93]=( RNA_SPLICE_REV | 0x10 )
1329         GGAT [0xA3]=( RNA_SPLICE_REV | 0x10 )
1330 
1331           *
1332         GTCT [0xB7]=( RNA_SPLICE_REV | 0x10 )
1333         GTGT [0xBB]=( RNA_SPLICE_REV | 0x10 )
1334         GTTT [0xBF]=( RNA_SPLICE_REV | 0x10 )
1335 
1336            *
1337         GTAA ( is also MAJOR forward, 1 mismatch )
1338         GTAC ( is also MAJOR forward, 1 mismatch )
1339         GTAG ( is also MAJOR forward, full match )
1340 
1341     =========================================================
1342 
1343     two mismatches:
1344 
1345     ATAC    ... MINOR, forward
1346 
1347         * *
1348         CTAC ( is also MAJOR reverse, full match )
1349         CTCC ( is also MAJOR reverse, 1 mismatch )
1350         CTGC ( is also MAJOR reverse, 1 mismatch )
1351         CTTC ( is also MAJOR reverse, 1 mismatch )
1352         GTAC ( is also MAJOR forward, 1 mismatch )
1353         GTCC ( is also MAJOR reverse, 2 mismatches )
1354         GTGC ( is also MAJOR reverse, 2 mismatches )
1355         GTTC ( is also MAJOR reverse, 2 mismatches )
1356         TTAC ( is also MAJOR reverse, 1 mismatch )
1357         TTCC ( is also MAJOR reverse, 2 mismatches )
1358         TTGC ( is also MAJOR reverse, 2 mismatches )
1359         TTTC ( is also MAJOR reverse, 2 mismatches )
1360 
1361          **
1362         AAAC ( is also MAJOR forward, 1 mismatch )
1363         AACC [0x05]=( RNA_SPLICE_FWD | 0x20 )
1364         AAGC [0x09]=( RNA_SPLICE_FWD | 0x20 )
1365         AATC [0x0D]=( RNA_SPLICE_FWD | 0x20 )
1366         ACAC ( is also MINOR forward, 1 mismatch )
1367         ACCC [0x15]=( RNA_SPLICE_FWD | 0x20 )
1368         ACGC [0x19]=( RNA_SPLICE_FWD | 0x20 )
1369         ACTC [0x1D]=( RNA_SPLICE_FWD | 0x20 )
1370         AGAC ( is also MINOR forward, 1 mismatch )
1371         AGCC [0x25]=( RNA_SPLICE_FWD | 0x20 )
1372         AGGC [0x29]=( RNA_SPLICE_FWD | 0x20 )
1373         AGTC [0x2D]=( RNA_SPLICE_FWD | 0x20 )
1374 
1375         *  *
1376         CTAA ( is also MAJOR reverse, 1 mismatch )
1377         CTAC ( is also MAJOR reverse, full match )
1378         CTAG ( is also MAJOR forward, 1 mismatch )
1379         CTAT ( is also MAJOR reverse, 1 mismatch )
1380         GTAA ( is also MAJOR forward, 1 mismatch )
1381         GTAC ( is also MAJOR forward, 1 mismatch )
1382         GTAG ( is also MAJOR forward, full match )
1383         GTAT ( is also MINOR reverse, full match )
1384         TTAA ( is also MAJOR reverse, 2 mismatches )
1385         TTAC ( is also MAJOR reverse, 1 mismatch )
1386         TTAG ( is also MAJOR forward, 1 mismatch )
1387         TTAT ( is also MINOR reverse, 1 mismatch )
1388 
1389          * *
1390         AAAA [0x00]=( RNA_SPLICE_FWD | 0x20 )
1391         AAAC ( is also MAJOR forward, 1 mismatch )
1392         AAAG [0x02]=( RNA_SPLICE_FWD | 0x20 )
1393         AAAT [0x03]=( RNA_SPLICE_FWD | 0x20 )
1394         ACAA [0x10]=( RNA_SPLICE_FWD | 0x20 )
1395         ACAC ( is also MINOR forward, 1 mismatch )
1396         ACAG [0x12]=( RNA_SPLICE_FWD | 0x20 )
1397         ACAT [0x13]=( RNA_SPLICE_FWD | 0x20 )
1398         AGAA [0x20]=( RNA_SPLICE_FWD | 0x20 )
1399         AGAC ( is also MINOR forward, 1 mismatch )
1400         AGAG [0x22]=( RNA_SPLICE_FWD | 0x20 )
1401         AGAT [0x23]=( RNA_SPLICE_FWD | 0x20 )
1402 
1403     -----------------------------------------------------------
1404     CTAC    ... MAJOR, reverse
1405 
1406         * *
1407         ATAC ( is also MINOR forward, full match )
1408         ATCC ( is also MINOR forward, 1 mismatch )
1409         ATGC ( is also MINOR forward, 1 mismatch )
1410         ATTC ( is also MINOR forward, 1 mismatch )
1411         GTAC ( is also MAJOR forward, 1 mismatch )
1412         GTCC [0xB5]=( RNA_SPLICE_REV | 0x20 )
1413         GTGC [0xB9]=( RNA_SPLICE_REV | 0x20 )
1414         GTTC [0xBD]=( RNA_SPLICE_REV | 0x20 )
1415         TTAC ( is also MAJOR reverse, 1 mismatch )
1416         TTCC [0xF5]=( RNA_SPLICE_REV | 0x20 )
1417         TTGC [0xF9]=( RNA_SPLICE_REV | 0x20 )
1418         TTTC [0xFD]=( RNA_SPLICE_REV | 0x20 )
1419 
1420          **
1421         CAAC ( is also MAJOR reverse, 1 mismatch )
1422         CACC [0x45]=( RNA_SPLICE_REV | 0x20 )
1423         CAGC [0x49]=( RNA_SPLICE_REV | 0x20 )
1424         CATC [0x4D]=( RNA_SPLICE_REV | 0x20 )
1425         CCAC ( is also MAJOR reverse, 1 mismatch )
1426         CCCC [0x55]=( RNA_SPLICE_REV | 0x20 )
1427         CCGC [0x59]=( RNA_SPLICE_REV | 0x20 )
1428         CCTC [0x5D]=( RNA_SPLICE_REV | 0x20 )
1429         CGAC ( is also MAJOR reverse, 1 mismatch )
1430         CGCC [0x65]=( RNA_SPLICE_REV | 0x20 )
1431         CGGC [0x69]=( RNA_SPLICE_REV | 0x20 )
1432         CGTC [0x6D]=( RNA_SPLICE_REV | 0x20 )
1433 
1434         *  *
1435         ATAA ( is also MINOR forward, 1 mismatch )
1436         ATAC ( is also MINOR forward, full match )
1437         ATAG ( is also MAJOR forward, 1 mismatch )
1438         ATAT ( is also MINOR forward, 1 mismatch )
1439         GTAA ( is also MAJOR forward, 1 mismatch )
1440         GTAC ( is also MAJOR forward, 1 mismatch )
1441         GTAG ( is also MAJOR forward, full match )
1442         GTAT ( is also MINOR reverse, full match )
1443         TTAA ( is also MAJOR forward, 2 mismatches )
1444         TTAC ( is also MAJOR reverse, 1 mismatch )
1445         TTAG ( is also MAJOR forward, 1 mismatch )
1446         TTAT ( is also MINOR reverse, 1 mismatch )
1447 
1448          * *
1449         CAAA [0x40]=( RNA_SPLICE_REV | 0x20 )
1450         CAAC ( is also MAJOR reverse, 1 mismatch )
1451         CAAG [0x42]=( RNA_SPLICE_REV | 0x20 )
1452         CAAT [0x43]=( RNA_SPLICE_REV | 0x20 )
1453         CCAA [0x50]=( RNA_SPLICE_REV | 0x20 )
1454         CCAC ( is also MAJOR reverse, 1 mismatch )
1455         CCAG [0x52]=( RNA_SPLICE_REV | 0x20 )
1456         CCAT [0x53]=( RNA_SPLICE_REV | 0x20 )
1457         CGAA [0x60]=( RNA_SPLICE_REV | 0x20 )
1458         CGAC ( is also MAJOR reverse, 1 mismatch )
1459         CGAG [0x62]=( RNA_SPLICE_REV | 0x20 )
1460         CGAT [0x63]=( RNA_SPLICE_REV | 0x20 )
1461 
1462     -----------------------------------------------------------
1463     GTAG    ... MAJOR, forward
1464 
1465         * *
1466         ATAG ( is also MAJOR forward, 1 mismatch )
1467         ATCG [0x36]=( RNA_SPLICE_FWD | 0x20 )
1468         ATGG [0x3A]=( RNA_SPLICE_FWD | 0x20 )
1469         ATTG [0x3E]=( RNA_SPLICE_FWD | 0x20 )
1470         CTAG ( is also MAJOR forward, 1 mismatch )
1471         CTCG [0x76]=( RNA_SPLICE_FWD | 0x20 )
1472         CTGG [0x7A]=( RNA_SPLICE_FWD | 0x20 )
1473         CTTG [0x7E]=( RNA_SPLICE_FWD | 0x20 )
1474         TTAG ( is also MAJOR forward, 1 mismatch )
1475         TTCG [0xF6]=( RNA_SPLICE_FWD | 0x20 )
1476         TTGG [0xFA]=( RNA_SPLICE_FWD | 0x20 )
1477         TTTG [0xFE]=( RNA_SPLICE_FWD | 0x20 )
1478 
1479          **
1480         GAAG ( is also MAJOR forward, 1 mismatch )
1481         GACG [0x86]=( RNA_SPLICE_FWD | 0x20 )
1482         GAGG [0x8A]=( RNA_SPLICE_FWD | 0x20 )
1483         GATG [0x8E]=( RNA_SPLICE_FWD | 0x20 )
1484         GCAG ( is also MAJOR forward, 1 mismatch )
1485         GCCG [0x96]=( RNA_SPLICE_FWD | 0x20 )
1486         GCGG [0x9A]=( RNA_SPLICE_FWD | 0x20 )
1487         GCTG [0x9E]=( RNA_SPLICE_FWD | 0x20 )
1488         GGAG ( is also MAJOR forward, 1 mismatch )
1489         GGCG [0xA6]=( RNA_SPLICE_FWD | 0x20 )
1490         GGGG [0xAA]=( RNA_SPLICE_FWD | 0x20 )
1491         GGTG [0xAE]=( RNA_SPLICE_FWD | 0x20 )
1492 
1493         *  *
1494         ATAA ( is also MINOR forward, 1 mismatch )
1495         ATAC ( is also MINOR forward, full match )
1496         ATAG ( is also MAJOR forward, 1 mismatch )
1497         ATAT ( is also MINOR forward, 1 mismatch )
1498         CTAA ( is also MAJOR reverse, 1 mismatch )
1499         CTAC ( is also MAJOR reverse, full match )
1500         CTAG ( is also MAJOR forward, 1 mismatch )
1501         CTAT ( is also MAJOR reverse, 1 mismatch )
1502         TTAA [0xF0]=( RNA_SPLICE_FWD | 0x20 )
1503         TTAC ( is also MAJOR reverse, 1 mismatch )
1504         TTAG ( is also MAJOR forward, 1 mismatch )
1505         TTAT ( is also MINOR reverse, 1 mismatch )
1506 
1507          * *
1508         GAAA [0x80]=( RNA_SPLICE_FWD | 0x20 )
1509         GAAC [0x81]=( RNA_SPLICE_FWD | 0x20 )
1510         GAAG ( is also MAJOR forward, 1 mismatch )
1511         GAAT ( is also MINOR reverse, 1 mismatch )
1512         GCAA [0x90]=( RNA_SPLICE_FWD | 0x20 )
1513         GCAC [0x91]=( RNA_SPLICE_FWD | 0x20 )
1514         GCAG ( is also MAJOR forward, 1 mismatch )
1515         GCAT ( is also MINOR reverse, 1 mismatch )
1516         GGAA [0xA0]=( RNA_SPLICE_FWD | 0x20 )
1517         GGAC [0xA1]=( RNA_SPLICE_FWD | 0x20 )
1518         GGAG ( is also MAJOR forward, 1 mismatch )
1519         GGAT ( is also MINOR reverse, 1 mismatch )
1520 
1521     -----------------------------------------------------------
1522     GTAT    ... MINOR, reverse
1523 
1524         * *
1525         ATAT ( is also MINOR forward, 1 mismatch )
1526         ATCT [0x37]=( RNA_SPLICE_REV | 0x20 )
1527         ATGT [0x3B]=( RNA_SPLICE_REV | 0x20 )
1528         ATTT [0x3F]=( RNA_SPLICE_REV | 0x20 )
1529         CTAT ( is also MAJOR reverse, 1 mismatch )
1530         CTCT [0x77]=( RNA_SPLICE_REV | 0x20 )
1531         CTGT [0x7B]=( RNA_SPLICE_REV | 0x20 )
1532         CTTT [0x7F]=( RNA_SPLICE_REV | 0x20 )
1533         TTAT ( is also MINOR reverse, 1 mismatch )
1534         TTCT [0xF7]=( RNA_SPLICE_REV | 0x20 )
1535         TTGT [0xFB]=( RNA_SPLICE_REV | 0x20 )
1536         TTTT [0xFF]=( RNA_SPLICE_REV | 0x20 )
1537 
1538          **
1539         GAAT ( is also MINOR reverse, 1 mismatch )
1540         GACT [0x87]=( RNA_SPLICE_REV | 0x20 )
1541         GAGT [0x8B]=( RNA_SPLICE_REV | 0x20 )
1542         GATT [0x8F]=( RNA_SPLICE_REV | 0x20 )
1543         GCAT ( is also MINOR reverse, 1 mismatch )
1544         GCCT [0x97]=( RNA_SPLICE_REV | 0x20 )
1545         GCGT [0x9B]=( RNA_SPLICE_REV | 0x20 )
1546         GCTT [0x9F]=( RNA_SPLICE_REV | 0x20 )
1547         GGAT ( is also MINOR reverse, 1 mismatch )
1548         GGCT [0xA7]=( RNA_SPLICE_REV | 0x20 )
1549         GGGT [0xAB]=( RNA_SPLICE_REV | 0x20 )
1550         GGTT [0xAF]=( RNA_SPLICE_REV | 0x20 )
1551 
1552         *  *
1553         ATAA ( is also MINOR forward, 1 mismatch )
1554         ATAC ( is also MINOR forward, full match )
1555         ATAG ( is also MAJOR forward, 1 mismatch )
1556         ATAT ( is also MINOR forward, 1 mismatch )
1557         CTAA ( is also MAJOR reverse, 1 mismatch )
1558         CTAC ( is also MAJOR reverse, full match )
1559         CTAG ( is also MAJOR forward, 1 mismatch )
1560         CTAT ( is also MAJOR reverse, 1 mismatch )
1561         TTAA ( is also MAJOR forward, 2 mismatches )
1562         TTAC ( is also MAJOR reverse, 1 mismatch )
1563         TTAG ( is also MAJOR forward, 1 mismatch )
1564         TTAT ( is also MINOR reverse, 1 mismatch )
1565 
1566          * *
1567         GAAA ( is also MAJOR forward, 2 mismatches )
1568         GAAC ( is also MAJOR forward, 2 mismatches )
1569         GAAG ( is also MAJOR forward, 1 mismatch )
1570         GAAT ( is also MINOR reverse, 1 mismatch )
1571         GCAA ( is also MAJOR forward, 2 mismatches )
1572         GCAC ( is also MAJOR forward, 2 mismatches )
1573         GCAG ( is also MAJOR forward, 1 mismatch )
1574         GCAT ( is also MINOR reverse, 1 mismatch )
1575         GGAA ( is also MAJOR forward, 2 mismatches )
1576         GGAC ( is also MAJOR forward, 2 mismatches )
1577         GGAG ( is also MAJOR forward, 1 mismatch )
1578         GGAT ( is also MINOR reverse, 1 mismatch )
1579 
1580 *************************************************************/
1581 
check_rna_splicing_candidates_against_ref(struct ReferenceObj const * ref_obj,uint32_t splice_level,INSDC_coord_zero pos,rna_splice_candidates * candidates)1582 rc_t check_rna_splicing_candidates_against_ref( struct ReferenceObj const * ref_obj,
1583                                                 uint32_t splice_level, /* 0, 1, 2 ... allowed mismatches */
1584                                                 INSDC_coord_zero pos,
1585                                                 rna_splice_candidates * candidates )
1586 {
1587     rc_t rc = 0;
1588     uint32_t idx;
1589     for ( idx = 0; idx < candidates->count && rc == 0; ++idx )
1590     {
1591         uint8_t splice[ 4 ];
1592         INSDC_coord_len written;
1593         rna_splice_candidate * rsc = &candidates->candidates[ idx ];
1594         INSDC_coord_zero rd_pos = ( pos + rsc->ref_offset );
1595         rc = ReferenceObj_Read( ref_obj, rd_pos, 2, splice, &written );
1596         if ( rc == 0 && written == 2 )
1597         {
1598             rd_pos += ( rsc->len - 2 );
1599             rc = ReferenceObj_Read( ref_obj, rd_pos, 2, &splice[ 2 ], &written );
1600             if ( rc == 0 && written == 2 )
1601             {
1602                 uint8_t compressed = compress_4_bases_to_byte( splice );   /* 4 bases --> 1 byte */
1603                 uint8_t match = compressed_to_fwd_reverse_0[ compressed ]; /* table lookup */
1604                 uint8_t mismatches = ( match >> 4 );
1605 
1606                 if ( mismatches <= splice_level )
1607                 {
1608                     rsc->matched = match;
1609 
1610                     if ( ( match & RNA_SPLICE_FWD ) == RNA_SPLICE_FWD )
1611                         candidates->fwd_matched++;
1612                     else if ( ( match & RNA_SPLICE_REV ) == RNA_SPLICE_REV )
1613                         candidates->rev_matched++;
1614                 }
1615             }
1616         }
1617     }
1618     return rc;
1619 }
1620 
1621 
discover_rna_splicing_candidates(uint32_t cigar_len,const char * cigar,uint32_t min_len,rna_splice_candidates * candidates)1622 rc_t discover_rna_splicing_candidates( uint32_t cigar_len, const char * cigar, uint32_t min_len, rna_splice_candidates * candidates )
1623 {
1624     rc_t rc = 0;
1625     candidates->cigops_len = ( cigar_len / 2 ) + 1;
1626     candidates->cigops = malloc( ( sizeof * candidates->cigops ) * candidates->cigops_len );
1627     if ( candidates->cigops == NULL )
1628         rc = RC( rcExe, rcNoTarg, rcConstructing, rcMemory, rcExhausted );
1629     else
1630     {
1631         uint32_t ref_offset = 0;
1632         int32_t op_idx;
1633         CigOps * cigops = candidates->cigops;
1634 
1635         candidates->n_cigops = ExplodeCIGAR( cigops, candidates->cigops_len, cigar, cigar_len );
1636         candidates->count = 0;
1637         for ( op_idx = 0; op_idx < ( candidates->n_cigops - 1 ); op_idx++ )
1638         {
1639             char op_code = cigops[ op_idx ].op;
1640             uint32_t op_len = cigops[ op_idx ].oplen;
1641             if ( (op_code == 'D' || op_code == 'N') && op_len >= min_len && candidates->count < MAX_RNA_SPLICE_CANDIDATES )
1642             {
1643                 rna_splice_candidate * rsc = &candidates->candidates[ candidates->count++ ];
1644                 rsc->ref_offset = ref_offset;
1645                 rsc->len = op_len;
1646                 rsc->op_idx = op_idx;
1647                 rsc->matched = RNA_SPLICE_UNKNOWN;  /* we dont know that yet, caller has to do that ( sam-aligned.c ) */
1648             }
1649             if ( op_code == 'M' || op_code == 'X' || op_code == '=' || op_code == 'D' || op_code == 'N' )
1650                 ref_offset += op_len;
1651         }
1652     }
1653     return rc;
1654 }
1655 
1656 
1657 
change_rna_splicing_cigar(uint32_t cigar_len,char * cigar,rna_splice_candidates * candidates,uint32_t * NM_adjustment)1658 rc_t change_rna_splicing_cigar( uint32_t cigar_len, char * cigar, rna_splice_candidates * candidates, uint32_t * NM_adjustment )
1659 {
1660     rc_t rc = 0;
1661     uint32_t winner, sum_of_n_lengths = 0;
1662     int32_t idx, dst;
1663     CigOps * cigops = candidates->cigops;
1664 
1665     /* handle the special case that we do have forward and reverse candidates in one alignement, that cannot be!
1666        we declare a winner ( the direction that occurs most ), zero out the looser(s) and give a warning */
1667     if ( candidates->fwd_matched > candidates->rev_matched )
1668         winner = RNA_SPLICE_FWD;
1669     else
1670         winner = RNA_SPLICE_REV;
1671 
1672     for ( idx = 0; idx < candidates->count; ++idx )
1673     {
1674         rna_splice_candidate * rsc = &candidates->candidates[ idx ];
1675         if ( ( rsc->matched & 0x0F ) == winner && cigops[ rsc->op_idx ].op == 'D' )
1676         {
1677             cigops[ rsc->op_idx ].op = 'N';
1678             sum_of_n_lengths += rsc->len;
1679         }
1680     }
1681 
1682     for ( idx = 0, dst = 0; idx < ( candidates->n_cigops - 1 ) && rc == 0; ++idx )
1683     {
1684         size_t sz;
1685         rc = string_printf( &cigar[ dst ], cigar_len + 1 - dst, &sz, "%u%c", cigops[ idx ].oplen, cigops[ idx ].op );
1686         dst += sz;
1687     }
1688     if ( NM_adjustment != NULL )
1689         *NM_adjustment = sum_of_n_lengths;
1690     return rc;
1691 }
cg_canonical_print_cigar(const char * cigar,size_t cigar_len)1692 rc_t cg_canonical_print_cigar( const char * cigar, size_t cigar_len)
1693 {
1694     rc_t rc;
1695     if ( cigar_len > 0 )
1696     {
1697         int i,total_cnt,cnt;
1698         char op;
1699         for(i=0,cnt=0,op=0,cnt=0;i<cigar_len;i++){
1700                 if(isdigit(cigar[i])){
1701                         cnt=cnt*10+(cigar[i]-'0');
1702                 } else if(isalpha(cigar[i])){
1703                         if(op=='\0'){ /** first op **/
1704                                 total_cnt=cnt;
1705                         } else if(op==cigar[i]){ /** merging consequitive ops **/
1706                                 total_cnt+=cnt;
1707                         } else {
1708                                 if(total_cnt > 0) KOutMsg( "%d%c", total_cnt,op );
1709                                 total_cnt=cnt;
1710                         }
1711                         op=cigar[i];
1712                         cnt=0;
1713                 } else {
1714                         assert(0); /*** should never happen inside this function ***/
1715                 }
1716         }
1717         if(total_cnt && op) KOutMsg( "%d%c", total_cnt,op );
1718     }
1719     else
1720         rc = KOutMsg( "*" );
1721     return rc;
1722 }
1723 
1724