1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include "cg_tools.h"
28 #include "debug.h"
29 #include <klib/out.h>
30
31 #include <klib/printf.h>
32 #include <sysalloc.h>
33 #include <ctype.h>
34 #include <stdlib.h>
35 #include <string.h>
36 #include <assert.h>
37
38
SetCigOp(CigOps * dst,char op,uint32_t oplen)39 static void SetCigOp( CigOps * dst, char op, uint32_t oplen )
40 {
41 dst->op = op;
42 dst->oplen = oplen;
43 switch( op )
44 { /*MX= DN B IS PH*/
45
46 case 'M' :
47 case 'X' :
48 case '=' : dst->ref_sign = +1;
49 dst->seq_sign = +1;
50 break;
51
52 case 'D' :
53 case 'N' : dst->ref_sign = +1;
54 dst->seq_sign = 0;
55 break;
56
57 case 'B' : dst->ref_sign = -1;
58 dst->seq_sign = 0;
59 break;
60
61 case 'S' :
62 case 'I' : dst->ref_sign = 0;
63 dst->seq_sign = +1;
64 break;
65
66 case 'P' :
67 case 'H' :
68 case 0 : dst->ref_sign= 0; /* terminating op */
69 dst->seq_sign= 0;
70 break;
71
72 default : assert( 0 );
73 break;
74 }
75 }
76
77
ExplodeCIGAR(CigOps dst[],uint32_t len,char const cigar[],uint32_t ciglen)78 int32_t ExplodeCIGAR( CigOps dst[], uint32_t len, char const cigar[], uint32_t ciglen )
79 {
80 uint32_t i;
81 uint32_t j;
82 int32_t opLen;
83
84 for ( i = j = opLen = 0; i < ciglen; ++i )
85 {
86 if ( isdigit( cigar[ i ] ) )
87 {
88 opLen = opLen * 10 + cigar[ i ] - '0';
89 }
90 else
91 {
92 assert( isalpha( cigar[ i ] ) );
93 SetCigOp( dst + j, cigar[ i ], opLen );
94 opLen = 0;
95 j++;
96 }
97 }
98 SetCigOp( dst + j, 0, 0 );
99 j++;
100 return j;
101 }
102
103
104 typedef struct cgOp_s
105 {
106 uint16_t length;
107 uint8_t type; /* 0: match, 1: insert, 2: delete */
108 char code;
109 } cgOp;
110
111
print_CG_cigar(int line,cgOp const op[],unsigned const ops,unsigned const gap[3])112 static void print_CG_cigar( int line, cgOp const op[], unsigned const ops, unsigned const gap[ 3 ] )
113 {
114 #if _DEBUGGING
115 unsigned i;
116
117 SAM_DUMP_DBG( 3, ( "%5i: ", line ) );
118 for ( i = 0; i < ops; ++i )
119 {
120 if ( gap && ( i == gap[ 0 ] || i == gap[ 1 ] || i == gap[ 2 ] ) )
121 {
122 SAM_DUMP_DBG( 3, ( "%u%c", op[ i ].length, tolower( op[ i ].code ) ) );
123 }
124 else
125 {
126 SAM_DUMP_DBG( 3, ( "%u%c", op[ i ].length, toupper( op[ i ].code ) ) );
127 }
128 }
129 SAM_DUMP_DBG( 3, ( "\n" ) );
130 #endif
131 }
132
133
134 /* gap contains the indices of the wobbles in op
135 * gap[0] is between read 1 and 2; it is the 'B'
136 * gap[1] is between read 2 and 3; it is an 'N'
137 * gap[2] is between read 3 and 4; it is an 'N'
138 */
139
140 typedef struct cg_cigar_temp
141 {
142 unsigned gap[ 3 ];
143 cgOp cigOp[ MAX_READ_LEN ];
144 unsigned opCnt;
145 unsigned S_adjust;
146 unsigned CG_adjust;
147 bool has_ref_offset_type;
148 } cg_cigar_temp;
149
150
CIGAR_to_CG_Ops(const cg_cigar_input * input,cg_cigar_temp * tmp)151 static rc_t CIGAR_to_CG_Ops( const cg_cigar_input * input,
152 cg_cigar_temp * tmp )
153 {
154 unsigned i;
155 unsigned ops = 0;
156 unsigned gapno;
157
158 tmp->opCnt = 0;
159 tmp->S_adjust = 0;
160 tmp->CG_adjust = 0;
161 tmp->has_ref_offset_type = false;
162 for ( i = 0; i < input->p_cigar.len; ++ops )
163 {
164 char opChar = 0;
165 int opLen = 0;
166 int n;
167
168 for ( n = 0; ( ( n + i ) < input->p_cigar.len ) && ( isdigit( input->p_cigar.ptr[ n + i ] ) ); n++ )
169 {
170 opLen = opLen * 10 + input->p_cigar.ptr[ n + i ] - '0';
171 }
172
173 if ( ( n + i ) < input->p_cigar.len )
174 {
175 opChar = input->p_cigar.ptr[ n + i ];
176 n++;
177 }
178
179 if ( ( ops + 1 ) >= MAX_READ_LEN )
180 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
181
182 i += n;
183
184 tmp->cigOp[ ops ].length = opLen;
185 tmp->cigOp[ ops ].code = opChar;
186 switch ( opChar )
187 {
188 case 'M' :
189 case '=' :
190 case 'X' : tmp->cigOp[ ops ].type = 0;
191 break;
192
193 case 'S' : tmp->S_adjust += opLen;
194 case 'I' : tmp->cigOp[ ops ].type = 1;
195 tmp->cigOp[ ops ].code = 'I';
196 break;
197 case 'N': tmp->has_ref_offset_type = true;
198 case 'D': tmp->cigOp[ ops ].type = 2;
199 break;
200
201 default : return RC( rcExe, rcData, rcReading, rcConstraint, rcViolated );
202 }
203 }
204
205 tmp->opCnt = ops;
206 tmp->gap[ 0 ] = tmp->gap[ 1 ] = tmp->gap[ 2 ] = ops;
207 print_CG_cigar( __LINE__, tmp->cigOp, ops, NULL );
208
209
210 if ( ops < 3 )
211 return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
212
213 {
214 unsigned fwd = 0; /* 5 10 10 10 */
215 unsigned rev = 0; /* 10 10 10 5 */
216 unsigned acc; /* accumulated length */
217
218 if ( ( input->seq_req_id == 1 && !input->orientation ) ||
219 ( input->seq_req_id == 2 && input->orientation ) )
220 {
221 for ( i = 0, acc = 0; i < ops && acc <= 5; ++i )
222 {
223 if ( tmp->cigOp[ i ].type != 2 )
224 {
225 acc += tmp->cigOp[ i ].length;
226 if ( acc == 5 && tmp->cigOp[ i + 1 ].type == 1 )
227 {
228 fwd = i + 1;
229 break;
230 }
231 else if ( acc > 5 )
232 {
233 unsigned const right = acc - 5;
234 unsigned const left = tmp->cigOp[ i ].length - right;
235
236 memmove( &tmp->cigOp[ i + 2 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
237 ops += 2;
238 tmp->cigOp[ i ].length = left;
239 tmp->cigOp[ i + 2 ].length = right;
240
241 tmp->cigOp[ i + 1 ].type = 1;
242 tmp->cigOp[ i + 1 ].code = 'B';
243 tmp->cigOp[ i + 1 ].length = 0;
244
245 fwd = i + 1;
246 break;
247 }
248 }
249 }
250 }
251 else if ( ( input->seq_req_id == 2 && !input->orientation ) ||
252 ( input->seq_req_id == 1 && input->orientation ) )
253 {
254 for ( i = ops, acc = 0; i > 0 && acc <= 5; )
255 {
256 --i;
257 if ( tmp->cigOp[ i ].type != 2 )
258 {
259 acc += tmp->cigOp[ i ].length;
260 if ( acc == 5 && tmp->cigOp[ i ].type == 1 )
261 {
262 rev = i;
263 break;
264 }
265 else if ( acc > 5 )
266 {
267 unsigned const left = acc - 5;
268 unsigned const right = tmp->cigOp[ i ].length - left;
269
270 memmove( &tmp->cigOp[ i + 2 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
271 ops += 2;
272 tmp->cigOp[ i ].length = left;
273 tmp->cigOp[ i + 2 ].length = right;
274
275 tmp->cigOp[ i + 1 ].type = 1;
276 tmp->cigOp[ i + 1 ].code = 'B';
277 tmp->cigOp[ i + 1 ].length = 0;
278
279 rev = i + 1;
280 break;
281 }
282 }
283 }
284 }
285 else
286 {
287 /* fprintf(stderr, "guessing layout\n"); */
288 for ( i = 0, acc = 0; i < ops && acc <= 5; ++i )
289 {
290 if ( tmp->cigOp[ i ].type != 2 )
291 {
292 acc += tmp->cigOp[ i ].length;
293 if ( acc == 5 && tmp->cigOp[ i + 1 ].type == 1 )
294 {
295 fwd = i + 1;
296 }
297 }
298 }
299 for ( i = ops, acc = 0; i > 0 && acc <= 5; )
300 {
301 --i;
302 if ( tmp->cigOp[ i ].type != 2 )
303 {
304 acc += tmp->cigOp[ i ].length;
305 if ( acc == 5 && tmp->cigOp[i].type == 1 )
306 {
307 rev = i;
308 }
309 }
310 }
311 if ( !tmp->has_ref_offset_type && (( fwd == 0 && rev == 0 ) || ( fwd != 0 && rev != 0 ) ))
312 {
313 for ( i = 0; i < ops; ++i )
314 {
315 if ( tmp->cigOp[ i ].type == 2 )
316 {
317 tmp->cigOp[ i ].code = 'N';
318 tmp->CG_adjust += tmp->cigOp[ i ].length;
319 }
320 }
321 return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
322 }
323 }
324 if ( fwd && tmp->cigOp[ fwd ].type == 1 )
325 {
326 for ( i = ops, acc = 0; i > fwd + 1; )
327 {
328 --i;
329 if ( tmp->cigOp[ i ].type != 2 )
330 {
331 acc += tmp->cigOp[ i ].length;
332 if ( acc >= 10 )
333 {
334 if ( acc > 10 )
335 {
336 unsigned const r = 10 + tmp->cigOp[ i ].length - acc;
337 unsigned const l = tmp->cigOp[ i ].length - r;
338
339 if ( ops + 2 >= MAX_READ_LEN )
340 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
341 memmove( &tmp->cigOp[ i + 2 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
342 ops += 2;
343 tmp->cigOp[ i + 2 ].length = r;
344 tmp->cigOp[ i ].length = l;
345
346 tmp->cigOp[ i + 1 ].length = 0;
347 tmp->cigOp[ i + 1 ].type = 2;
348 tmp->cigOp[ i + 1 ].code = 'N';
349 i += 2;
350 }
351 else if ( i - 1 > fwd )
352 {
353 if ( tmp->cigOp[ i - 1 ].type == 2 )
354 tmp->cigOp[ i - 1 ].code = 'N';
355 else
356 {
357 if ( ops + 1 >= MAX_READ_LEN )
358 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
359 memmove( &tmp->cigOp[ i + 1 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
360 ops += 1;
361 tmp->cigOp[ i ].length = 0;
362 tmp->cigOp[ i ].type = 2;
363 tmp->cigOp[ i ].code = 'N';
364 i += 1;
365 }
366 }
367 acc = 0;
368 }
369 }
370 }
371 /** change I to B+M **/
372 tmp->cigOp[ fwd ].code = 'B';
373 memmove( &tmp->cigOp[ fwd + 1 ], &tmp->cigOp[ fwd ], ( ops - fwd ) * sizeof( tmp->cigOp[ 0 ] ) );
374 ops += 1;
375 tmp->cigOp[ fwd + 1 ].code = 'M';
376 tmp->opCnt = ops;
377 /** set the gaps now **/
378 for ( gapno = 3, i = ops; gapno > 1 && i > 0; )
379 {
380 --i;
381 if ( tmp->cigOp[ i ].code == 'N' )
382 tmp->gap[ --gapno ] = i;
383 }
384 tmp->gap[ 0 ] = fwd;
385 print_CG_cigar( __LINE__, tmp->cigOp, ops, tmp->gap );
386 return 0;
387 }
388 if ( rev && tmp->cigOp[ rev ].type == 1 )
389 {
390 for ( acc = i = 0; i < rev; ++i )
391 {
392 if ( tmp->cigOp[ i ].type != 2 )
393 {
394 acc += tmp->cigOp[ i ].length;
395 if ( acc >= 10 )
396 {
397 if ( acc > 10 )
398 {
399 unsigned const l = 10 + tmp->cigOp[ i ].length - acc;
400 unsigned const r = tmp->cigOp[ i ].length - l;
401
402 if ( ops + 2 >= MAX_READ_LEN )
403 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
404 memmove( &tmp->cigOp[ i + 2 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
405 ops += 2;
406 tmp->cigOp[ i + 2 ].length = r;
407 tmp->cigOp[ i ].length = l;
408
409 tmp->cigOp[ i + 1 ].length = 0;
410 tmp->cigOp[ i + 1 ].type = 2;
411 tmp->cigOp[ i + 1 ].code = 'N';
412 rev += 2;
413 i += 2;
414 }
415 else if ( i + 1 < rev )
416 {
417 if ( tmp->cigOp[ i + 1 ].type == 2 )
418 tmp->cigOp[ i + 1 ].code = 'N';
419 else
420 {
421 if ( ops + 1 >= MAX_READ_LEN )
422 return RC( rcExe, rcData, rcReading, rcBuffer, rcInsufficient );
423 memmove( &tmp->cigOp[ i + 1 ], &tmp->cigOp[ i ], ( ops - i ) * sizeof( tmp->cigOp[ 0 ] ) );
424 ops += 1;
425 tmp->cigOp[ i + 1 ].length = 0;
426 tmp->cigOp[ i + 1 ].type = 2;
427 tmp->cigOp[ i + 1 ].code = 'N';
428 rev += 1;
429 i += 1;
430 }
431 }
432 acc = 0;
433 }
434 }
435 }
436 for ( gapno = 3, i = 0; gapno > 1 && i < ops; ++i )
437 {
438 if ( tmp->cigOp[ i ].code == 'N' )
439 tmp->gap[ --gapno ] = i;
440 }
441 tmp->gap[ 0 ] = rev;
442 tmp->cigOp[ rev ].code = 'B';
443 memmove( &tmp->cigOp[ rev + 1 ], &tmp->cigOp[ rev ], ( ops - rev ) * sizeof( tmp->cigOp[ 0 ] ) );
444 ops += 1;
445 tmp->cigOp[ rev + 1 ].code = 'M';
446 tmp->opCnt = ops;
447 print_CG_cigar( __LINE__, tmp->cigOp, ops, tmp->gap );
448 return 0;
449 }
450 }
451 return RC( rcExe, rcData, rcReading, rcFormat, rcNotFound ); /* CG pattern not found */
452 }
453
454
adjust_cigar(const cg_cigar_input * input,cg_cigar_temp * tmp,cg_cigar_output * output)455 static rc_t adjust_cigar( const cg_cigar_input * input, cg_cigar_temp * tmp, cg_cigar_output * output )
456 {
457 rc_t rc = 0;
458 unsigned i, j;
459
460 print_CG_cigar( __LINE__, tmp->cigOp, tmp->opCnt, NULL );
461
462 /* remove zero length ops */
463 for ( j = i = 0; i < tmp->opCnt; )
464 {
465 if ( tmp->cigOp[ j ].length == 0 )
466 {
467 ++j;
468 --(tmp->opCnt);
469 continue;
470 }
471 tmp->cigOp[ i++ ] = tmp->cigOp[ j++ ];
472 }
473
474 print_CG_cigar( __LINE__, tmp->cigOp, tmp->opCnt, NULL );
475
476 if ( input->edit_dist_available )
477 {
478 int const adjusted = input->edit_dist + tmp->S_adjust - tmp->CG_adjust;
479 output->edit_dist = adjusted > 0 ? adjusted : 0;
480 SAM_DUMP_DBG( 4, ( "NM: before: %u, after: %u(+%u-%u)\n", input->edit_dist, output->edit_dist, tmp->S_adjust, tmp->CG_adjust ) );
481 }
482 else
483 {
484 output->edit_dist = input->edit_dist;
485 }
486
487 /* merge adjacent ops */
488 for ( i = tmp->opCnt; i > 1; )
489 {
490 --i;
491 if ( tmp->cigOp[ i - 1 ].code == tmp->cigOp[ i ].code )
492 {
493 tmp->cigOp[ i - 1 ].length += tmp->cigOp[ i ].length;
494 memmove( &tmp->cigOp[ i ], &tmp->cigOp[ i + 1 ], ( tmp->opCnt - 1 - i ) * sizeof( tmp->cigOp[ 0 ] ) );
495 --(tmp->opCnt);
496 }
497 }
498 print_CG_cigar( __LINE__, tmp->cigOp, tmp->opCnt, NULL );
499 for ( i = j = 0; i < tmp->opCnt && rc == 0; ++i )
500 {
501 size_t sz;
502 rc = string_printf( &output->cigar[ j ], sizeof( output->cigar ) - j, &sz, "%u%c", tmp->cigOp[ i ].length, tmp->cigOp[ i ].code );
503 j += sz;
504 }
505 output->cigar_len = j;
506 return rc;
507 }
508
509
make_cg_cigar(const cg_cigar_input * input,cg_cigar_output * output)510 rc_t make_cg_cigar( const cg_cigar_input * input, cg_cigar_output * output )
511 {
512 rc_t rc;
513 cg_cigar_temp tmp;
514 memset( &tmp, 0, sizeof tmp );
515
516 rc = CIGAR_to_CG_Ops( input, &tmp );
517 if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcFormat )
518 {
519 memmove( &output->cigar[ 0 ], &input->p_cigar.ptr[ 0 ], input->p_cigar.len );
520 output->cigar_len = input->p_cigar.len;
521 output->cigar[ output->cigar_len ] = 0;
522 output->edit_dist = input->edit_dist;
523 output->p_cigar.ptr = output->cigar;
524 output->p_cigar.len = output->cigar_len;
525 rc = 0;
526 }
527 else if ( rc == 0 )
528 {
529 if ( tmp.CG_adjust == 0 && !tmp.has_ref_offset_type)
530 {
531 if ( tmp.gap[ 0 ] < tmp.opCnt )
532 tmp.CG_adjust = tmp.cigOp[ tmp.gap[ 0 ] ].length;
533
534 if ( tmp.gap[ 1 ] < tmp.opCnt )
535 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 1 ] ].length;
536
537 if ( tmp.gap[ 2 ] < tmp.opCnt )
538 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 2 ] ].length;
539 }
540
541 rc = adjust_cigar( input, &tmp, output );
542 }
543 return rc;
544 }
545
546
merge_M_type_ops(char a,char b)547 static char merge_M_type_ops( char a, char b )
548 { /*MX=*/
549 char c = 0;
550 switch( b )
551 {
552 case 'X' : switch( a )
553 {
554 case '=' : c = 'X'; break;
555 case 'X' : c = 'M'; break; /**we don't know - 2X may create '=' **/
556 case 'M' : c = 'M'; break;
557 }
558 break;
559 case 'M' : c = 'M'; break;
560 case '=' : c = a; break;
561 }
562 assert( c != 0 );
563 return c;
564 }
565
566
fmt_cigar_elem(char * dst,uint32_t cig_oplen,char cig_op)567 static size_t fmt_cigar_elem( char * dst, uint32_t cig_oplen, char cig_op )
568 {
569 size_t num_writ;
570 rc_t rc = string_printf ( dst, 11, &num_writ, "%d%c", cig_oplen, cig_op );
571 assert ( rc == 0 && num_writ > 1 );
572 return num_writ;
573 }
574
CombineCIGAR(char dst[],CigOps const seqOp[],uint32_t seq_len,uint32_t refPos,CigOps const refOp[],uint32_t ref_len)575 uint32_t CombineCIGAR( char dst[], CigOps const seqOp[], uint32_t seq_len,
576 uint32_t refPos, CigOps const refOp[], uint32_t ref_len )
577 {
578 bool done = false;
579 uint32_t ciglen = 0, last_ciglen = 0, last_cig_oplen = 0;
580 int32_t si = 0, ri = 0;
581 char last_cig_op = '0'; /* never used operation, forces a mismatch in MACRO_BUILD_CIGAR */
582 CigOps seq_cop = { 0, 0, 0, 0 }, ref_cop = { 0, 0, 0, 0 };
583 int32_t seq_pos = 0; /** seq_pos is tracked roughly - with every extraction from seqOp **/
584 int32_t ref_pos = 0; /** ref_pos is tracked precisely - with every delta and consumption in cigar **/
585 int32_t delta = refPos; /*** delta in relative positions of seq and ref **/
586 /*** when delta < 0 - rewind or extend the reference ***/
587 /*** wher delta > 0 - skip reference ***/
588 #define MACRO_BUILD_CIGAR(OP,OPLEN) \
589 if( last_cig_oplen > 0 && last_cig_op == OP){ \
590 last_cig_oplen += OPLEN; \
591 ciglen = last_ciglen + fmt_cigar_elem( dst + last_ciglen, last_cig_oplen,last_cig_op ); \
592 } else { \
593 last_ciglen = ciglen; \
594 last_cig_oplen = OPLEN; \
595 last_cig_op = OP; \
596 ciglen = ciglen + fmt_cigar_elem( dst + last_ciglen, last_cig_oplen, last_cig_op ); \
597 }
598 while( !done )
599 {
600 while ( delta < 0 )
601 {
602 ref_pos += delta; /** we will make it to back up this way **/
603 if( ri > 0 )
604 { /** backing up on ref if possible ***/
605 int avail_oplen = refOp[ ri - 1 ].oplen - ref_cop.oplen;
606 if ( avail_oplen > 0 )
607 {
608 if ( ( -delta ) <= avail_oplen * ref_cop.ref_sign )
609 { /*** rewind within last operation **/
610 ref_cop.oplen -= delta;
611 delta -= delta * ref_cop.ref_sign;
612 }
613 else
614 { /*** rewind the whole ***/
615 ref_cop.oplen += avail_oplen;
616 delta += avail_oplen * ref_cop.ref_sign;
617 }
618 }
619 else
620 {
621 ri--;
622 /** pick the previous as used up **/
623 ref_cop = refOp[ri-1];
624 ref_cop.oplen =0;
625 }
626 }
627 else
628 { /** extending the reference **/
629 SetCigOp( &ref_cop, '=', ref_cop.oplen - delta );
630 delta = 0;
631 }
632 ref_pos -= delta;
633 }
634 if ( ref_cop.oplen == 0 )
635 { /*** advance the reference ***/
636 ref_cop = refOp[ri++];
637 if ( ref_cop.oplen == 0 )
638 { /** extending beyond the reference **/
639 SetCigOp( &ref_cop,'=', 1000 );
640 }
641 assert( ref_cop.oplen > 0 );
642 }
643 if ( delta > 0 )
644 { /*** skip refOps ***/
645 ref_pos += delta; /** may need to back up **/
646 if ( delta >= ref_cop.oplen )
647 { /** full **/
648 delta -= ref_cop.oplen * ref_cop.ref_sign;
649 ref_cop.oplen = 0;
650 }
651 else
652 { /** partial **/
653 ref_cop.oplen -= delta;
654 delta -= delta * ref_cop.ref_sign;
655 }
656 ref_pos -= delta; /** if something left - restore ***/
657 continue;
658 }
659
660 /*** seq and ref should be synchronized here **/
661 assert( delta == 0 );
662 if ( seq_cop.oplen == 0 )
663 { /*** advance sequence ***/
664 if ( seq_pos < seq_len )
665 {
666 seq_cop = seqOp[ si++ ];
667 assert( seq_cop.oplen > 0 );
668 seq_pos += seq_cop.oplen * seq_cop.seq_sign;
669 }
670 else
671 {
672 done=true;
673 }
674 }
675
676 if( !done )
677 {
678 int seq_seq_step = seq_cop.oplen * seq_cop.seq_sign; /** sequence movement**/
679 int seq_ref_step = seq_cop.oplen * seq_cop.ref_sign; /** influence of sequence movement on intermediate reference **/
680 int ref_seq_step = ref_cop.oplen * ref_cop.seq_sign; /** movement of the intermediate reference ***/
681 int ref_ref_step = ref_cop.oplen * ref_cop.ref_sign; /** influence of the intermediate reference movement on final reference ***/
682 assert( ref_ref_step >= 0 ); /** no B in the reference **/
683 if ( seq_ref_step <= 0 )
684 { /** BSIPH in the sequence against anything ***/
685 MACRO_BUILD_CIGAR( seq_cop.op, seq_cop.oplen );
686 seq_cop.oplen = 0;
687 delta = seq_ref_step; /** if negative - will force rewind next cycle **/
688 }
689 else if ( ref_ref_step <= 0 )
690 { /** MX=DN against SIPH in the reference***/
691 if ( ref_seq_step == 0 )
692 { /** MX=DN against PH **/
693 MACRO_BUILD_CIGAR( ref_cop.op,ref_cop.oplen);
694 ref_cop.oplen = 0;
695 }
696 else
697 {
698 int min_len = ( seq_cop.oplen < ref_cop.oplen ) ? seq_cop.oplen : ref_cop.oplen;
699 if( seq_seq_step == 0 )
700 { /** DN agains SI **/
701 MACRO_BUILD_CIGAR( 'P', min_len );
702 }
703 else
704 { /** MX= agains SI ***/
705 MACRO_BUILD_CIGAR( ref_cop.op,min_len );
706 }
707 seq_cop.oplen -= min_len;
708 ref_cop.oplen -= min_len;
709 }
710 }
711 else
712 {
713 /*MX=DN against MX=DN*/
714 int min_len = ( seq_cop.oplen < ref_cop.oplen ) ? seq_cop.oplen : ref_cop.oplen;
715 if ( seq_seq_step == 0 )
716 { /* DN against MX=DN */
717 if ( ref_seq_step == 0 )
718 { /** padding DN against DN **/
719 MACRO_BUILD_CIGAR( 'P', min_len );
720 ref_cop.oplen -= min_len;
721 seq_cop.oplen -= min_len;
722 }
723 else
724 { /* DN against MX= **/
725 MACRO_BUILD_CIGAR( seq_cop.op, min_len );
726 seq_cop.oplen -= min_len;
727 }
728 }
729 else if ( ref_cop.seq_sign == 0 )
730 { /* MX= against DN - always wins */
731 MACRO_BUILD_CIGAR( ref_cop.op, min_len );
732 ref_cop.oplen -= min_len;
733 }
734 else
735 { /** MX= against MX= ***/
736 char curr_op = merge_M_type_ops( seq_cop.op, ref_cop.op );
737 /* or otherwise merge_M_type_ops() will be called twice by the macro! */
738 MACRO_BUILD_CIGAR( curr_op, min_len );
739 ref_cop.oplen -= min_len;
740 seq_cop.oplen -= min_len;
741 }
742 ref_pos += min_len;
743 }
744 }
745 }
746 return ciglen;
747 }
748
749
750 typedef struct cg_merger
751 {
752 char newSeq[ MAX_READ_LEN ];
753 char newQual[ MAX_READ_LEN ];
754 char tags[ MAX_CG_CIGAR_LEN * 2 ];
755 } cg_merger;
756
757
merge_cg_cigar(const cg_cigar_input * input,cg_cigar_output * output)758 rc_t merge_cg_cigar( const cg_cigar_input * input, cg_cigar_output * output )
759 {
760 rc_t rc;
761 cg_cigar_temp tmp;
762 memset( &tmp, 0, sizeof tmp );
763
764 rc = CIGAR_to_CG_Ops( input, &tmp );
765 if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcFormat )
766 {
767 memmove( &output->cigar[ 0 ], &input->p_cigar.ptr[ 0 ], input->p_cigar.len );
768 output->cigar_len = input->p_cigar.len;
769 output->cigar[ output->cigar_len ] = 0;
770 output->edit_dist = input->edit_dist;
771 rc = 0;
772 }
773 else if ( rc == 0 )
774 {
775
776 if ( tmp.CG_adjust == 0 && !tmp.has_ref_offset_type )
777 {
778 if ( tmp.gap[ 0 ] < tmp.opCnt )
779 tmp.CG_adjust = tmp.cigOp[ tmp.gap[ 0 ] ].length;
780
781 if ( tmp.gap[ 1 ] < tmp.opCnt )
782 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 1 ] ].length;
783
784 if ( tmp.gap[ 2 ] < tmp.opCnt )
785 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 2 ] ].length;
786 }
787
788 rc = adjust_cigar( input, &tmp, output );
789
790 }
791 return rc;
792 }
793
794
make_cg_merge(const cg_cigar_input * input,cg_cigar_output * output)795 rc_t make_cg_merge( const cg_cigar_input * input, cg_cigar_output * output )
796 {
797 rc_t rc;
798 cg_cigar_temp tmp;
799 memset( &tmp, 0, sizeof tmp );
800
801 rc = CIGAR_to_CG_Ops( input, &tmp );
802 if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcFormat )
803 {
804 memmove( &output->cigar[ 0 ], &input->p_cigar.ptr[ 0 ], input->p_cigar.len );
805 output->cigar_len = input->p_cigar.len;
806 output->cigar[ output->cigar_len ] = 0;
807 output->edit_dist = input->edit_dist;
808 rc = 0;
809 }
810 else if ( rc == 0 )
811 {
812
813 if ( tmp.CG_adjust == 0 && !tmp.has_ref_offset_type )
814 {
815 if ( tmp.gap[ 0 ] < tmp.opCnt )
816 tmp.CG_adjust = tmp.cigOp[ tmp.gap[ 0 ] ].length;
817
818 if ( tmp.gap[ 1 ] < tmp.opCnt )
819 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 1 ] ].length;
820
821 if ( tmp.gap[ 2 ] < tmp.opCnt )
822 tmp.CG_adjust += tmp.cigOp[ tmp.gap[ 2 ] ].length;
823 }
824 }
825
826 if ( rc == 0 )
827 {
828 uint32_t const B_len = tmp.cigOp[ tmp.gap[ 0 ] ].length;
829 uint32_t const B_at = tmp.gap[ 0 ] < tmp.gap[ 2 ] ? 5 : 30;
830
831 if ( 0 < B_len && B_len < 5 )
832 {
833 memmove( output->newSeq, input->p_read.ptr, MAX_READ_LEN );
834 memmove( output->newQual, input->p_quality.ptr, MAX_READ_LEN );
835
836 output->p_read.ptr = output->newSeq;
837 output->p_read.len = ( MAX_READ_LEN - B_len );
838
839 output->p_quality.ptr = output->newQual;
840 output->p_quality.len = ( MAX_READ_LEN - B_len );
841
842 output->p_tags.ptr = output->tags;
843 output->p_tags.len = 0;
844
845 /* nBnM -> nB0M */
846 tmp.cigOp[ tmp.gap[ 0 ] + 1 ].length -= B_len;
847 if ( tmp.gap[ 0 ] < tmp.gap[ 2 ] )
848 {
849 size_t written;
850 rc = string_printf( output->tags, sizeof( output->tags ), &written,
851 "GC:Z:%uS%uG%uS\tGS:Z:%.*s\tGQ:Z:%.*s",
852 5 - B_len, B_len, 30 - B_len, 2 * B_len, &output->newSeq[ 5 - B_len ], 2 * B_len, &output->newQual[ 5 - B_len ] );
853 if ( rc == 0 )
854 output->p_tags.len = written;
855 memmove( &tmp.cigOp[ tmp.gap[ 0 ] ],
856 &tmp.cigOp[ tmp.gap[ 0 ] + 1 ],
857 ( tmp.opCnt - ( tmp.gap[ 0 ] + 1 ) ) * sizeof( tmp.cigOp[ 0 ] ) );
858 --tmp.opCnt;
859 }
860 else
861 {
862 size_t written;
863 rc = string_printf( output->tags, sizeof( output->tags ), &written,
864 "GC:Z:%uS%uG%uS\tGS:Z:%.*s\tGQ:Z:%.*s",
865 30 - B_len, B_len, 5 - B_len, 2 * B_len, &output->newSeq[ 30 - B_len ], 2 * B_len, &output->newQual[ 30 - B_len ] );
866 if ( rc == 0 )
867 output->p_tags.len = written;
868 memmove( &tmp.cigOp[ tmp.gap[ 0 ] ],
869 &tmp.cigOp[ tmp.gap[ 0 ] + 1 ],
870 ( tmp.opCnt - ( tmp.gap[ 0 ] + 1 ) ) * sizeof( tmp.cigOp[ 0 ] ) );
871 --tmp.opCnt;
872 }
873 if ( rc == 0 )
874 {
875 uint32_t i;
876 for ( i = B_at; i < B_at + B_len; ++i )
877 {
878 uint32_t const Lq = output->newQual[ i - B_len ];
879 uint32_t const Rq = output->newQual[ i ];
880
881 if ( Lq <= Rq )
882 {
883 output->newSeq[ i - B_len ] = output->newSeq[ i ];
884 output->newQual[ i - B_len ] = Rq;
885 }
886 else
887 {
888 output->newSeq[ i ] = output->newSeq[ i - B_len ];
889 output->newQual[ i ] = Lq;
890 }
891 }
892 memmove( &output->newSeq [ B_at ], &output->newSeq [ B_at + B_len ], MAX_READ_LEN - B_at - B_len );
893 memmove( &output->newQual[ B_at ], &output->newQual[ B_at + B_len ], MAX_READ_LEN - B_at - B_len );
894 }
895 }
896 else
897 {
898 uint32_t i, len = tmp.cigOp[ tmp.gap[ 0 ] ].length;
899
900 tmp.cigOp[ tmp.gap[ 0 ] ].code = 'I';
901 for ( i = tmp.gap[ 0 ] + 1; i < tmp.opCnt && len > 0; ++i )
902 {
903 if ( tmp.cigOp[ i ].length <= len )
904 {
905 len -= tmp.cigOp[ i ].length;
906 tmp.cigOp[ i ].length = 0;
907 }
908 else
909 {
910 tmp.cigOp[ i ].length -= len;
911 len = 0;
912 }
913 }
914 tmp.CG_adjust -= tmp.cigOp[ tmp.gap[ 0 ] ].length;
915 }
916 }
917
918 if ( rc == 0 )
919 rc = adjust_cigar( input, &tmp, output );
920
921 return rc;
922 }
923
924
925 /* A-->0x00, C-->0x01 G-->0x02 T-->0x03 */
926
927 static const uint8_t ASCII_to_2na[] = {
928 /* 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F */
929 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
930
931 /* 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F */
932 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
933
934 /* 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F */
935 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
936
937 /* 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F */
938 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
939
940 /* 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F */
941 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
942
943 /* 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F */
944 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
945
946 /* 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F */
947 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
948
949 /* 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F */
950 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
951 };
952
compress_4_bases_to_byte(uint8_t * bases)953 static uint8_t compress_4_bases_to_byte( uint8_t * bases )
954 {
955 uint8_t res = ASCII_to_2na[ bases[ 0 ] & 0x7F ];
956 res <<= 2;
957 res |= ASCII_to_2na[ bases[ 1 ] & 0x7F ];
958 res <<= 2;
959 res |= ASCII_to_2na[ bases[ 2 ] & 0x7F ];
960 res <<= 2;
961 res |= ASCII_to_2na[ bases[ 3 ] & 0x7F ];
962 return res;
963 }
964
965
966 static const uint8_t compressed_to_fwd_reverse_0[] = {
967
968 /* 0000.0000 0x00 ... AAAA */ ( RNA_SPLICE_FWD | 0x20 ),
969 /* 0000.0001 0x01 ... AAAC */ ( RNA_SPLICE_FWD | 0x10 ),
970 /* 0000.0010 0x02 ... AAAG */ ( RNA_SPLICE_FWD | 0x20 ),
971 /* 0000.0011 0x03 ... AAAT */ ( RNA_SPLICE_FWD | 0x20 ),
972 /* 0000.0100 0x04 ... AACA */ RNA_SPLICE_UNKNOWN,
973 /* 0000.0101 0x05 ... AACC */ ( RNA_SPLICE_FWD | 0x20 ),
974 /* 0000.0110 0x06 ... AACG */ RNA_SPLICE_UNKNOWN,
975 /* 0000.0111 0x07 ... AACT */ RNA_SPLICE_UNKNOWN,
976 /* 0000.1000 0x08 ... AAGA */ RNA_SPLICE_UNKNOWN,
977 /* 0000.1001 0x09 ... AAGC */ ( RNA_SPLICE_FWD | 0x20 ),
978 /* 0000.1010 0x0A ... AAGG */ RNA_SPLICE_UNKNOWN,
979 /* 0000.1011 0x0B ... AAGT */ RNA_SPLICE_UNKNOWN,
980 /* 0000.1100 0x0C ... AATA */ RNA_SPLICE_UNKNOWN,
981 /* 0000.1101 0x0D ... AATC */ ( RNA_SPLICE_FWD | 0x20 ),
982 /* 0000.1110 0x0E ... AATG */ RNA_SPLICE_UNKNOWN,
983 /* 0000.1111 0x0F ... AATT */ RNA_SPLICE_UNKNOWN,
984
985 /* 0001.0000 0x10 ... ACAA */ ( RNA_SPLICE_FWD | 0x20 ),
986 /* 0001.0001 0x11 ... ACAC */ ( RNA_SPLICE_FWD | 0x10 ),
987 /* 0001.0010 0x12 ... ACAG */ ( RNA_SPLICE_FWD | 0x20 ),
988 /* 0001.0011 0x13 ... ACAT */ ( RNA_SPLICE_FWD | 0x20 ),
989 /* 0001.0100 0x14 ... ACCA */ RNA_SPLICE_UNKNOWN,
990 /* 0001.0101 0x15 ... ACCC */ ( RNA_SPLICE_FWD | 0x20 ),
991 /* 0001.0110 0x16 ... ACCG */ RNA_SPLICE_UNKNOWN,
992 /* 0001.0111 0x17 ... ACCT */ RNA_SPLICE_UNKNOWN,
993 /* 0001.1000 0x18 ... ACGA */ RNA_SPLICE_UNKNOWN,
994 /* 0001.1001 0x19 ... ACGC */ ( RNA_SPLICE_FWD | 0x20 ),
995 /* 0001.1010 0x1A ... ACGG */ RNA_SPLICE_UNKNOWN,
996 /* 0001.1011 0x1B ... ACGT */ RNA_SPLICE_UNKNOWN,
997 /* 0001.1100 0x1C ... ACTA */ RNA_SPLICE_UNKNOWN,
998 /* 0001.1101 0x1D ... ACTC */ ( RNA_SPLICE_FWD | 0x20 ),
999 /* 0001.1110 0x1E ... ACTG */ RNA_SPLICE_UNKNOWN,
1000 /* 0001.1111 0x1F ... ACTT */ RNA_SPLICE_UNKNOWN,
1001
1002 /* 0010.0000 0x20 ... AGAA */ ( RNA_SPLICE_FWD | 0x20 ),
1003 /* 0010.0001 0x21 ... AGAC */ ( RNA_SPLICE_FWD | 0x10 ),
1004 /* 0010.0010 0x22 ... AGAG */ ( RNA_SPLICE_FWD | 0x20 ),
1005 /* 0010.0011 0x23 ... AGAT */ ( RNA_SPLICE_FWD | 0x20 ),
1006 /* 0010.0100 0x24 ... AGCA */ RNA_SPLICE_UNKNOWN,
1007 /* 0010.0101 0x25 ... AGCC */ ( RNA_SPLICE_FWD | 0x20 ),
1008 /* 0010.0110 0x26 ... AGCG */ RNA_SPLICE_UNKNOWN,
1009 /* 0010.0111 0x27 ... AGCT */ RNA_SPLICE_UNKNOWN,
1010 /* 0010.1000 0x28 ... AGGA */ RNA_SPLICE_UNKNOWN,
1011 /* 0010.1001 0x29 ... AGGC */ ( RNA_SPLICE_FWD | 0x20 ),
1012 /* 0010.1010 0x2A ... AGGG */ RNA_SPLICE_UNKNOWN,
1013 /* 0010.1011 0x2B ... AGGT */ RNA_SPLICE_UNKNOWN,
1014 /* 0010.1100 0x2C ... AGTA */ RNA_SPLICE_UNKNOWN,
1015 /* 0010.1101 0x2D ... AGTC */ ( RNA_SPLICE_FWD | 0x20 ),
1016 /* 0010.1110 0x2E ... AGTG */ RNA_SPLICE_UNKNOWN,
1017 /* 0010.1111 0x2F ... AGTT */ RNA_SPLICE_UNKNOWN,
1018
1019 /* 0011.0000 0x30 ... ATAA */ ( RNA_SPLICE_FWD | 0x10 ),
1020 /* 0011.0001 0x31 ... ATAC */ RNA_SPLICE_FWD, /* MINOR forward */
1021 /* 0011.0010 0x32 ... ATAG */ ( RNA_SPLICE_FWD | 0x10 ),
1022 /* 0011.0011 0x33 ... ATAT */ ( RNA_SPLICE_FWD | 0x10 ),
1023 /* 0011.0100 0x34 ... ATCA */ RNA_SPLICE_UNKNOWN,
1024 /* 0011.0101 0x35 ... ATCC */ ( RNA_SPLICE_FWD | 0x10 ),
1025 /* 0011.0110 0x36 ... ATCG */ ( RNA_SPLICE_FWD | 0x20 ),
1026 /* 0011.0111 0x37 ... ATCT */ ( RNA_SPLICE_REV | 0x20 ),
1027 /* 0011.1000 0x38 ... ATGA */ RNA_SPLICE_UNKNOWN,
1028 /* 0011.1001 0x39 ... ATGC */ ( RNA_SPLICE_FWD | 0x10 ),
1029 /* 0011.1010 0x3A ... ATGG */ ( RNA_SPLICE_FWD | 0x20 ),
1030 /* 0011.1011 0x3B ... ATGT */ ( RNA_SPLICE_REV | 0x20 ),
1031 /* 0011.1100 0x3C ... ATTA */ RNA_SPLICE_UNKNOWN,
1032 /* 0011.1101 0x3D ... ATTC */ ( RNA_SPLICE_FWD | 0x10 ),
1033 /* 0011.1110 0x3E ... ATTG */ ( RNA_SPLICE_FWD | 0x20 ),
1034 /* 0011.1111 0x3F ... ATTT */ ( RNA_SPLICE_REV | 0x20 ),
1035
1036 /* 0100.0000 0x40 ... CAAA */ ( RNA_SPLICE_REV | 0x20 ),
1037 /* 0100.0001 0x41 ... CAAC */ ( RNA_SPLICE_REV | 0x10 ),
1038 /* 0100.0010 0x42 ... CAAG */ ( RNA_SPLICE_REV | 0x20 ),
1039 /* 0100.0011 0x43 ... CAAT */ ( RNA_SPLICE_REV | 0x20 ),
1040 /* 0100.0100 0x44 ... CACA */ RNA_SPLICE_UNKNOWN,
1041 /* 0100.0101 0x45 ... CACC */ ( RNA_SPLICE_REV | 0x20 ),
1042 /* 0100.0110 0x46 ... CACG */ RNA_SPLICE_UNKNOWN,
1043 /* 0100.0111 0x47 ... CACT */ RNA_SPLICE_UNKNOWN,
1044 /* 0100.1000 0x48 ... CAGA */ RNA_SPLICE_UNKNOWN,
1045 /* 0100.1001 0x49 ... CAGC */ ( RNA_SPLICE_REV | 0x20 ),
1046 /* 0100.1010 0x4A ... CAGG */ RNA_SPLICE_UNKNOWN,
1047 /* 0100.1011 0x4B ... CAGT */ RNA_SPLICE_UNKNOWN,
1048 /* 0100.1100 0x4C ... CATA */ RNA_SPLICE_UNKNOWN,
1049 /* 0100.1101 0x4D ... CATC */ ( RNA_SPLICE_REV | 0x20 ),
1050 /* 0100.1110 0x4E ... CATG */ RNA_SPLICE_UNKNOWN,
1051 /* 0100.1111 0x4F ... CATT */ RNA_SPLICE_UNKNOWN,
1052
1053 /* 0101.0000 0x50 ... CCAA */ ( RNA_SPLICE_REV | 0x20 ),
1054 /* 0101.0001 0x51 ... CCAC */ ( RNA_SPLICE_REV | 0x10 ),
1055 /* 0101.0010 0x52 ... CCAG */ ( RNA_SPLICE_REV | 0x20 ),
1056 /* 0101.0011 0x53 ... CCAT */ ( RNA_SPLICE_REV | 0x20 ),
1057 /* 0101.0100 0x54 ... CCCA */ RNA_SPLICE_UNKNOWN,
1058 /* 0101.0101 0x55 ... CCCC */ ( RNA_SPLICE_REV | 0x20 ),
1059 /* 0101.0110 0x56 ... CCCG */ RNA_SPLICE_UNKNOWN,
1060 /* 0101.0111 0x57 ... CCCT */ RNA_SPLICE_UNKNOWN,
1061 /* 0101.1000 0x58 ... CCGA */ RNA_SPLICE_UNKNOWN,
1062 /* 0101.1001 0x59 ... CCGC */ ( RNA_SPLICE_REV | 0x20 ),
1063 /* 0101.1010 0x5A ... CCGG */ RNA_SPLICE_UNKNOWN,
1064 /* 0101.1011 0x5B ... CCGT */ RNA_SPLICE_UNKNOWN,
1065 /* 0101.1100 0x5C ... CCTA */ RNA_SPLICE_UNKNOWN,
1066 /* 0101.1101 0x5D ... CCTC */ ( RNA_SPLICE_REV | 0x20 ),
1067 /* 0101.1110 0x5E ... CCTG */ RNA_SPLICE_UNKNOWN,
1068 /* 0101.1111 0x5F ... CCTT */ RNA_SPLICE_UNKNOWN,
1069
1070 /* 0110.0000 0x60 ... CGAA */ ( RNA_SPLICE_REV | 0x20 ),
1071 /* 0110.0001 0x61 ... CGAC */ ( RNA_SPLICE_REV | 0x10 ),
1072 /* 0110.0010 0x62 ... CGAG */ ( RNA_SPLICE_REV | 0x20 ),
1073 /* 0110.0011 0x63 ... CGAT */ ( RNA_SPLICE_REV | 0x20 ),
1074 /* 0110.0100 0x64 ... CGCA */ RNA_SPLICE_UNKNOWN,
1075 /* 0110.0101 0x65 ... CGCC */ ( RNA_SPLICE_REV | 0x20 ),
1076 /* 0110.0110 0x66 ... CGCG */ RNA_SPLICE_UNKNOWN,
1077 /* 0110.0111 0x67 ... CGCT */ RNA_SPLICE_UNKNOWN,
1078 /* 0110.1000 0x68 ... CGGA */ RNA_SPLICE_UNKNOWN,
1079 /* 0110.1001 0x69 ... CGGC */ ( RNA_SPLICE_REV | 0x20 ),
1080 /* 0110.1010 0x6A ... CGGG */ RNA_SPLICE_UNKNOWN,
1081 /* 0110.1011 0x6B ... CGGT */ RNA_SPLICE_UNKNOWN,
1082 /* 0110.1100 0x6C ... CGTA */ RNA_SPLICE_UNKNOWN,
1083 /* 0110.1101 0x6D ... CGTC */ ( RNA_SPLICE_REV | 0x20 ),
1084 /* 0110.1110 0x6E ... CGTG */ RNA_SPLICE_UNKNOWN,
1085 /* 0110.1111 0x6F ... CGTT */ RNA_SPLICE_UNKNOWN,
1086
1087 /* 0111.0000 0x70 ... CTAA */ ( RNA_SPLICE_REV | 0x10 ),
1088 /* 0111.0001 0x71 ... CTAC */ RNA_SPLICE_REV, /* MAJOR reverse */
1089 /* 0111.0010 0x72 ... CTAG */ ( RNA_SPLICE_FWD | 0x10 ),
1090 /* 0111.0011 0x73 ... CTAT */ ( RNA_SPLICE_REV | 0x10 ),
1091 /* 0111.0100 0x74 ... CTCA */ RNA_SPLICE_UNKNOWN,
1092 /* 0111.0101 0x75 ... CTCC */ ( RNA_SPLICE_REV | 0x10 ),
1093 /* 0111.0110 0x76 ... CTCG */ ( RNA_SPLICE_FWD | 0x20 ),
1094 /* 0111.0111 0x77 ... CTCT */ ( RNA_SPLICE_REV | 0x20 ),
1095 /* 0111.1000 0x78 ... CTGA */ RNA_SPLICE_UNKNOWN,
1096 /* 0111.1001 0x79 ... CTGC */ ( RNA_SPLICE_REV | 0x10 ),
1097 /* 0111.1010 0x7A ... CTGG */ ( RNA_SPLICE_FWD | 0x20 ),
1098 /* 0111.1011 0x7B ... CTGT */ ( RNA_SPLICE_REV | 0x20 ),
1099 /* 0111.1100 0x7C ... CTTA */ RNA_SPLICE_UNKNOWN,
1100 /* 0111.1101 0x7D ... CTTC */ ( RNA_SPLICE_REV | 0x10 ),
1101 /* 0111.1110 0x7E ... CTTG */ ( RNA_SPLICE_FWD | 0x20 ),
1102 /* 0111.1111 0x7F ... CTTT */ ( RNA_SPLICE_REV | 0x20 ),
1103
1104 /* 1000.0000 0x80 ... GAAA */ ( RNA_SPLICE_FWD | 0x20 ),
1105 /* 1000.0001 0x81 ... GAAC */ ( RNA_SPLICE_FWD | 0x20 ),
1106 /* 1000.0010 0x82 ... GAAG */ ( RNA_SPLICE_FWD | 0x10 ),
1107 /* 1000.0011 0x83 ... GAAT */ ( RNA_SPLICE_REV | 0x10 ),
1108 /* 1000.0100 0x84 ... GACA */ RNA_SPLICE_UNKNOWN,
1109 /* 1000.0101 0x85 ... GACC */ RNA_SPLICE_UNKNOWN,
1110 /* 1000.0110 0x86 ... GACG */ ( RNA_SPLICE_FWD | 0x20 ),
1111 /* 1000.0111 0x87 ... GACT */ ( RNA_SPLICE_REV | 0x20 ),
1112 /* 1000.1000 0x88 ... GAGA */ RNA_SPLICE_UNKNOWN,
1113 /* 1000.1001 0x89 ... GAGC */ RNA_SPLICE_UNKNOWN,
1114 /* 1000.1010 0x8A ... GAGG */ ( RNA_SPLICE_FWD | 0x20 ),
1115 /* 1000.1011 0x8B ... GAGT */ ( RNA_SPLICE_REV | 0x20 ),
1116 /* 1000.1100 0x8C ... GATA */ RNA_SPLICE_UNKNOWN,
1117 /* 1000.1101 0x8D ... GATC */ RNA_SPLICE_UNKNOWN,
1118 /* 1000.1110 0x8E ... GATG */ ( RNA_SPLICE_FWD | 0x20 ),
1119 /* 1000.1111 0x8F ... GATT */ ( RNA_SPLICE_REV | 0x20 ),
1120
1121 /* 1001.0000 0x90 ... GCAA */ ( RNA_SPLICE_FWD | 0x20 ),
1122 /* 1001.0001 0x91 ... GCAC */ ( RNA_SPLICE_FWD | 0x20 ),
1123 /* 1001.0010 0x92 ... GCAG */ ( RNA_SPLICE_FWD | 0x10 ),
1124 /* 1001.0011 0x93 ... GCAT */ ( RNA_SPLICE_REV | 0x10 ),
1125 /* 1001.0100 0x94 ... GCCA */ RNA_SPLICE_UNKNOWN,
1126 /* 1001.0101 0x95 ... GCCC */ RNA_SPLICE_UNKNOWN,
1127 /* 1001.0110 0x96 ... GCCG */ ( RNA_SPLICE_FWD | 0x20 ),
1128 /* 1001.0111 0x97 ... GCCT */ ( RNA_SPLICE_REV | 0x20 ),
1129 /* 1001.1000 0x98 ... GCGA */ RNA_SPLICE_UNKNOWN,
1130 /* 1001.1001 0x99 ... GCGC */ RNA_SPLICE_UNKNOWN,
1131 /* 1001.1010 0x9A ... GCGG */ ( RNA_SPLICE_FWD | 0x20 ),
1132 /* 1001.1011 0x9B ... GCGT */ ( RNA_SPLICE_REV | 0x20 ),
1133 /* 1001.1100 0x9C ... GCTA */ RNA_SPLICE_UNKNOWN,
1134 /* 1001.1101 0x9D ... GCTC */ RNA_SPLICE_UNKNOWN,
1135 /* 1001.1110 0x9E ... GCTG */ ( RNA_SPLICE_FWD | 0x20 ),
1136 /* 1001.1111 0x9F ... GCTT */ ( RNA_SPLICE_REV | 0x20 ),
1137
1138 /* 1010.0000 0xA0 ... GGAA */ ( RNA_SPLICE_FWD | 0x20 ),
1139 /* 1010.0001 0xA1 ... GGAC */ ( RNA_SPLICE_FWD | 0x20 ),
1140 /* 1010.0010 0xA2 ... GGAG */ ( RNA_SPLICE_FWD | 0x10 ),
1141 /* 1010.0011 0xA3 ... GGAT */ ( RNA_SPLICE_REV | 0x10 ),
1142 /* 1010.0100 0xA4 ... GGCA */ RNA_SPLICE_UNKNOWN,
1143 /* 1010.0101 0xA5 ... GGCC */ RNA_SPLICE_UNKNOWN,
1144 /* 1010.0110 0xA6 ... GGCG */ ( RNA_SPLICE_FWD | 0x20 ),
1145 /* 1010.0111 0xA7 ... GGCT */ ( RNA_SPLICE_REV | 0x20 ),
1146 /* 1010.1000 0xA8 ... GGGA */ RNA_SPLICE_UNKNOWN,
1147 /* 1010.1001 0xA9 ... GGGC */ RNA_SPLICE_UNKNOWN,
1148 /* 1010.1010 0xAA ... GGGG */ ( RNA_SPLICE_FWD | 0x20 ),
1149 /* 1010.1011 0xAB ... GGGT */ ( RNA_SPLICE_REV | 0x20 ),
1150 /* 1010.1100 0xAC ... GGTA */ RNA_SPLICE_UNKNOWN,
1151 /* 1010.1101 0xAD ... GGTC */ RNA_SPLICE_UNKNOWN,
1152 /* 1010.1110 0xAE ... GGTG */ ( RNA_SPLICE_FWD | 0x20 ),
1153 /* 1010.1111 0xAF ... GGTT */ ( RNA_SPLICE_REV | 0x20 ),
1154
1155 /* 1011.0000 0xB0 ... GTAA */ ( RNA_SPLICE_FWD | 0x10 ),
1156 /* 1011.0001 0xB1 ... GTAC */ ( RNA_SPLICE_FWD | 0x10 ),
1157 /* 1011.0010 0xB2 ... GTAG */ RNA_SPLICE_FWD, /* MAJOR forward */
1158 /* 1011.0011 0xB3 ... GTAT */ RNA_SPLICE_REV, /* MINOR reverse */
1159 /* 1011.0100 0xB4 ... GTCA */ RNA_SPLICE_UNKNOWN,
1160 /* 1011.0101 0xB5 ... GTCC */ ( RNA_SPLICE_REV | 0x20 ),
1161 /* 1011.0110 0xB6 ... GTCG */ ( RNA_SPLICE_FWD | 0x10 ),
1162 /* 1011.0111 0xB7 ... GTCT */ ( RNA_SPLICE_REV | 0x10 ),
1163 /* 1011.1000 0xB8 ... GTGA */ RNA_SPLICE_UNKNOWN,
1164 /* 1011.1001 0xB9 ... GTGC */ ( RNA_SPLICE_REV | 0x20 ),
1165 /* 1011.1010 0xBA ... GTGG */ ( RNA_SPLICE_FWD | 0x10 ),
1166 /* 1011.1011 0xBB ... GTGT */ ( RNA_SPLICE_REV | 0x10 ),
1167 /* 1011.1100 0xBC ... GTTA */ RNA_SPLICE_UNKNOWN,
1168 /* 1011.1101 0xBD ... GTTC */ ( RNA_SPLICE_REV | 0x20 ),
1169 /* 1011.1110 0xBE ... GTTG */ ( RNA_SPLICE_FWD | 0x10 ),
1170 /* 1011.1111 0xBF ... GTTT */ ( RNA_SPLICE_REV | 0x10 ),
1171
1172 /* 1100.0000 0xC0 ... TAAA */ RNA_SPLICE_UNKNOWN,
1173 /* 1100.0001 0xC1 ... TAAC */ RNA_SPLICE_UNKNOWN,
1174 /* 1100.0010 0xC2 ... TAAG */ RNA_SPLICE_UNKNOWN,
1175 /* 1100.0011 0xC3 ... TAAT */ RNA_SPLICE_UNKNOWN,
1176 /* 1100.0100 0xC4 ... TACA */ RNA_SPLICE_UNKNOWN,
1177 /* 1100.0101 0xC5 ... TACC */ RNA_SPLICE_UNKNOWN,
1178 /* 1100.0110 0xC6 ... TACG */ RNA_SPLICE_UNKNOWN,
1179 /* 1100.0111 0xC7 ... TACT */ RNA_SPLICE_UNKNOWN,
1180 /* 1100.1000 0xC8 ... TAGA */ RNA_SPLICE_UNKNOWN,
1181 /* 1100.1001 0xC9 ... TAGC */ RNA_SPLICE_UNKNOWN,
1182 /* 1100.1010 0xCA ... TAGG */ RNA_SPLICE_UNKNOWN,
1183 /* 1100.1011 0xCB ... TAGT */ RNA_SPLICE_UNKNOWN,
1184 /* 1100.1100 0xCC ... TATA */ RNA_SPLICE_UNKNOWN,
1185 /* 1100.1101 0xCD ... TATC */ RNA_SPLICE_UNKNOWN,
1186 /* 1100.1110 0xCE ... TATG */ RNA_SPLICE_UNKNOWN,
1187 /* 1100.1111 0xCF ... TATT */ RNA_SPLICE_UNKNOWN,
1188
1189 /* 1101.0000 0xE0 ... TCAA */ RNA_SPLICE_UNKNOWN,
1190 /* 1101.0001 0xE1 ... TCAC */ RNA_SPLICE_UNKNOWN,
1191 /* 1101.0010 0xE2 ... TCAG */ RNA_SPLICE_UNKNOWN,
1192 /* 1101.0011 0xE3 ... TCAT */ RNA_SPLICE_UNKNOWN,
1193 /* 1101.0100 0xE4 ... TCCA */ RNA_SPLICE_UNKNOWN,
1194 /* 1101.0101 0xE5 ... TCCC */ RNA_SPLICE_UNKNOWN,
1195 /* 1101.0110 0xE6 ... TCCG */ RNA_SPLICE_UNKNOWN,
1196 /* 1101.0111 0xE7 ... TCCT */ RNA_SPLICE_UNKNOWN,
1197 /* 1101.1000 0xE8 ... TCGA */ RNA_SPLICE_UNKNOWN,
1198 /* 1101.1001 0xE9 ... TCGC */ RNA_SPLICE_UNKNOWN,
1199 /* 1101.1010 0xEA ... TCGG */ RNA_SPLICE_UNKNOWN,
1200 /* 1101.1011 0xEB ... TCGT */ RNA_SPLICE_UNKNOWN,
1201 /* 1101.1100 0xEC ... TCTA */ RNA_SPLICE_UNKNOWN,
1202 /* 1101.1101 0xED ... TCTC */ RNA_SPLICE_UNKNOWN,
1203 /* 1101.1110 0xEE ... TCTG */ RNA_SPLICE_UNKNOWN,
1204 /* 1101.1111 0xEF ... TCTT */ RNA_SPLICE_UNKNOWN,
1205
1206 /* 1111.0000 0xF0 ... TTAA */ ( RNA_SPLICE_FWD | 0x20 ),
1207 /* 1111.0001 0xF1 ... TTAC */ ( RNA_SPLICE_REV | 0x10 ),
1208 /* 1111.0010 0xF2 ... TTAG */ ( RNA_SPLICE_FWD | 0x10 ),
1209 /* 1111.0011 0xF3 ... TTAT */ ( RNA_SPLICE_REV | 0x10 ),
1210 /* 1111.0100 0xF4 ... TTCA */ RNA_SPLICE_UNKNOWN,
1211 /* 1111.0101 0xF5 ... TTCC */ ( RNA_SPLICE_REV | 0x20 ),
1212 /* 1111.0110 0xF6 ... TTCG */ ( RNA_SPLICE_FWD | 0x20 ),
1213 /* 1111.0111 0xF7 ... TTCT */ ( RNA_SPLICE_REV | 0x20 ),
1214 /* 1111.1000 0xF8 ... TTGA */ RNA_SPLICE_UNKNOWN,
1215 /* 1111.1001 0xF9 ... TTGC */ ( RNA_SPLICE_REV | 0x20 ),
1216 /* 1111.1010 0xFA ... TTGG */ ( RNA_SPLICE_FWD | 0x20 ),
1217 /* 1111.1011 0xFB ... TTGT */ ( RNA_SPLICE_REV | 0x20 ),
1218 /* 1111.1100 0xFC ... TTTA */ RNA_SPLICE_UNKNOWN,
1219 /* 1111.1101 0xFD ... TTTC */ ( RNA_SPLICE_REV | 0x20 ),
1220 /* 1111.1110 0xFE ... TTTG */ ( RNA_SPLICE_FWD | 0x20 ),
1221 /* 1111.1111 0xFF ... TTTT */ ( RNA_SPLICE_REV | 0x20 )
1222 };
1223
1224
1225 /*************************************************************
1226 RNA-splice detector:
1227
1228 base 0 1 ....... n-2 n-1 direction
1229
1230 G T A G forward
1231 A T A C forward
1232
1233 C T A C reverse
1234 G T A T reverse
1235
1236
1237 =========================================================
1238
1239 zero mismatches ( aka full matches ) :
1240
1241 ATAC ... MINOR [0x31]=RNA_SPLICE_FWD
1242 CTAC ... MAJOR [0x71]=RNA_SPLICE_REV
1243 GTAG ... MAJOR [0xB2]=RNA_SPLICE_FWD
1244 GTAT ... MINOR [0xB3]=RNA_SPLICE_REV
1245
1246 =========================================================
1247
1248 one mismatch:
1249
1250 ATAC ... MINOR, forward
1251
1252 *
1253 CTAC ( is also MAJOR reverse ... )
1254 GTAC ( is alow MAJOR reverse, 1 mismatch )
1255 TTAC ( is alow MAJOR reverse, 1 mismatch )
1256
1257 *
1258 AAAC [0x01]=( RNA_SPLICE_FWD | 0x10 )
1259 ACAC [0x11]=( RNA_SPLICE_FWD | 0x10 )
1260 AGAC [0x21]=( RNA_SPLICE_FWD | 0x10 )
1261
1262 *
1263 ATCC [0x35]=( RNA_SPLICE_FWD | 0x10 )
1264 ATGC [0x39]=( RNA_SPLICE_FWD | 0x10 )
1265 ATTC [0x3D]=( RNA_SPLICE_FWD | 0x10 )
1266
1267 *
1268 ATAA [0x30]=( RNA_SPLICE_FWD | 0x10 )
1269 ATAG ( is also MAJOR forward, 1 mismatch )
1270 ATAT [0x33]=( RNA_SPLICE_FWD | 0x10 )
1271
1272 -----------------------------------------------------------
1273 CTAC ... MAJOR, reverse
1274
1275 *
1276 ATAC ( is also MINOR forward, full match )
1277 GTAC ( is also MAJOR forward, 1 mismatch )
1278 TTAC [0xF1]=( RNA_SPLICE_REV | 0x10 )
1279
1280 *
1281 CAAC [0x41]=( RNA_SPLICE_REV | 0x10 )
1282 CCAC [0x51]=( RNA_SPLICE_REV | 0x10 )
1283 CGAC [0x61]=( RNA_SPLICE_REV | 0x10 )
1284
1285 *
1286 CTCC [0x75]=( RNA_SPLICE_REV | 0x10 )
1287 CTGC [0x79]=( RNA_SPLICE_REV | 0x10 )
1288 CTTC [0x7D]=( RNA_SPLICE_REV | 0x10 )
1289
1290 *
1291 CTAA [0x70]=( RNA_SPLICE_REV | 0x10 )
1292 CTAG ( is also MAJOR forward, 1 mismatch )
1293 CTAT [0x73]=( RNA_SPLICE_REV | 0x10 )
1294
1295 -----------------------------------------------------------
1296 GTAG ... MAJOR, forward
1297
1298 *
1299 ATAG [0x32]=( RNA_SPLICE_FWD | 0x10 )
1300 CTAG [0x72]=( RNA_SPLICE_FWD | 0x10 )
1301 TTAG [0xF2]=( RNA_SPLICE_FWD | 0x10 )
1302
1303 *
1304 GAAG [0x82]=( RNA_SPLICE_FWD | 0x10 )
1305 GCAG [0x92]=( RNA_SPLICE_FWD | 0x10 )
1306 GGAG [0xA2]=( RNA_SPLICE_FWD | 0x10 )
1307
1308 *
1309 GTCG [0xB6]=( RNA_SPLICE_FWD | 0x10 )
1310 GTGG [0xBA]=( RNA_SPLICE_FWD | 0x10 )
1311 GTTG [0xBE]=( RNA_SPLICE_FWD | 0x10 )
1312
1313 *
1314 GTAA [0xB0]=( RNA_SPLICE_FWD | 0x10 )
1315 GTAC [0xB1]=( RNA_SPLICE_FWD | 0x10 )
1316 GTAT ( is also MINOR reverse, full match )
1317
1318 -----------------------------------------------------------
1319 GTAT ... MINOR, reverse
1320
1321 *
1322 ATAT ( is also MINOR forward, 1 mismatch )
1323 CTAT ( is also MAJOR reverse, 1 mismatch )
1324 TTAT [0xF3]=( RNA_SPLICE_REV | 0x10 )
1325
1326 *
1327 GAAT [0x83]=( RNA_SPLICE_REV | 0x10 )
1328 GCAT [0x93]=( RNA_SPLICE_REV | 0x10 )
1329 GGAT [0xA3]=( RNA_SPLICE_REV | 0x10 )
1330
1331 *
1332 GTCT [0xB7]=( RNA_SPLICE_REV | 0x10 )
1333 GTGT [0xBB]=( RNA_SPLICE_REV | 0x10 )
1334 GTTT [0xBF]=( RNA_SPLICE_REV | 0x10 )
1335
1336 *
1337 GTAA ( is also MAJOR forward, 1 mismatch )
1338 GTAC ( is also MAJOR forward, 1 mismatch )
1339 GTAG ( is also MAJOR forward, full match )
1340
1341 =========================================================
1342
1343 two mismatches:
1344
1345 ATAC ... MINOR, forward
1346
1347 * *
1348 CTAC ( is also MAJOR reverse, full match )
1349 CTCC ( is also MAJOR reverse, 1 mismatch )
1350 CTGC ( is also MAJOR reverse, 1 mismatch )
1351 CTTC ( is also MAJOR reverse, 1 mismatch )
1352 GTAC ( is also MAJOR forward, 1 mismatch )
1353 GTCC ( is also MAJOR reverse, 2 mismatches )
1354 GTGC ( is also MAJOR reverse, 2 mismatches )
1355 GTTC ( is also MAJOR reverse, 2 mismatches )
1356 TTAC ( is also MAJOR reverse, 1 mismatch )
1357 TTCC ( is also MAJOR reverse, 2 mismatches )
1358 TTGC ( is also MAJOR reverse, 2 mismatches )
1359 TTTC ( is also MAJOR reverse, 2 mismatches )
1360
1361 **
1362 AAAC ( is also MAJOR forward, 1 mismatch )
1363 AACC [0x05]=( RNA_SPLICE_FWD | 0x20 )
1364 AAGC [0x09]=( RNA_SPLICE_FWD | 0x20 )
1365 AATC [0x0D]=( RNA_SPLICE_FWD | 0x20 )
1366 ACAC ( is also MINOR forward, 1 mismatch )
1367 ACCC [0x15]=( RNA_SPLICE_FWD | 0x20 )
1368 ACGC [0x19]=( RNA_SPLICE_FWD | 0x20 )
1369 ACTC [0x1D]=( RNA_SPLICE_FWD | 0x20 )
1370 AGAC ( is also MINOR forward, 1 mismatch )
1371 AGCC [0x25]=( RNA_SPLICE_FWD | 0x20 )
1372 AGGC [0x29]=( RNA_SPLICE_FWD | 0x20 )
1373 AGTC [0x2D]=( RNA_SPLICE_FWD | 0x20 )
1374
1375 * *
1376 CTAA ( is also MAJOR reverse, 1 mismatch )
1377 CTAC ( is also MAJOR reverse, full match )
1378 CTAG ( is also MAJOR forward, 1 mismatch )
1379 CTAT ( is also MAJOR reverse, 1 mismatch )
1380 GTAA ( is also MAJOR forward, 1 mismatch )
1381 GTAC ( is also MAJOR forward, 1 mismatch )
1382 GTAG ( is also MAJOR forward, full match )
1383 GTAT ( is also MINOR reverse, full match )
1384 TTAA ( is also MAJOR reverse, 2 mismatches )
1385 TTAC ( is also MAJOR reverse, 1 mismatch )
1386 TTAG ( is also MAJOR forward, 1 mismatch )
1387 TTAT ( is also MINOR reverse, 1 mismatch )
1388
1389 * *
1390 AAAA [0x00]=( RNA_SPLICE_FWD | 0x20 )
1391 AAAC ( is also MAJOR forward, 1 mismatch )
1392 AAAG [0x02]=( RNA_SPLICE_FWD | 0x20 )
1393 AAAT [0x03]=( RNA_SPLICE_FWD | 0x20 )
1394 ACAA [0x10]=( RNA_SPLICE_FWD | 0x20 )
1395 ACAC ( is also MINOR forward, 1 mismatch )
1396 ACAG [0x12]=( RNA_SPLICE_FWD | 0x20 )
1397 ACAT [0x13]=( RNA_SPLICE_FWD | 0x20 )
1398 AGAA [0x20]=( RNA_SPLICE_FWD | 0x20 )
1399 AGAC ( is also MINOR forward, 1 mismatch )
1400 AGAG [0x22]=( RNA_SPLICE_FWD | 0x20 )
1401 AGAT [0x23]=( RNA_SPLICE_FWD | 0x20 )
1402
1403 -----------------------------------------------------------
1404 CTAC ... MAJOR, reverse
1405
1406 * *
1407 ATAC ( is also MINOR forward, full match )
1408 ATCC ( is also MINOR forward, 1 mismatch )
1409 ATGC ( is also MINOR forward, 1 mismatch )
1410 ATTC ( is also MINOR forward, 1 mismatch )
1411 GTAC ( is also MAJOR forward, 1 mismatch )
1412 GTCC [0xB5]=( RNA_SPLICE_REV | 0x20 )
1413 GTGC [0xB9]=( RNA_SPLICE_REV | 0x20 )
1414 GTTC [0xBD]=( RNA_SPLICE_REV | 0x20 )
1415 TTAC ( is also MAJOR reverse, 1 mismatch )
1416 TTCC [0xF5]=( RNA_SPLICE_REV | 0x20 )
1417 TTGC [0xF9]=( RNA_SPLICE_REV | 0x20 )
1418 TTTC [0xFD]=( RNA_SPLICE_REV | 0x20 )
1419
1420 **
1421 CAAC ( is also MAJOR reverse, 1 mismatch )
1422 CACC [0x45]=( RNA_SPLICE_REV | 0x20 )
1423 CAGC [0x49]=( RNA_SPLICE_REV | 0x20 )
1424 CATC [0x4D]=( RNA_SPLICE_REV | 0x20 )
1425 CCAC ( is also MAJOR reverse, 1 mismatch )
1426 CCCC [0x55]=( RNA_SPLICE_REV | 0x20 )
1427 CCGC [0x59]=( RNA_SPLICE_REV | 0x20 )
1428 CCTC [0x5D]=( RNA_SPLICE_REV | 0x20 )
1429 CGAC ( is also MAJOR reverse, 1 mismatch )
1430 CGCC [0x65]=( RNA_SPLICE_REV | 0x20 )
1431 CGGC [0x69]=( RNA_SPLICE_REV | 0x20 )
1432 CGTC [0x6D]=( RNA_SPLICE_REV | 0x20 )
1433
1434 * *
1435 ATAA ( is also MINOR forward, 1 mismatch )
1436 ATAC ( is also MINOR forward, full match )
1437 ATAG ( is also MAJOR forward, 1 mismatch )
1438 ATAT ( is also MINOR forward, 1 mismatch )
1439 GTAA ( is also MAJOR forward, 1 mismatch )
1440 GTAC ( is also MAJOR forward, 1 mismatch )
1441 GTAG ( is also MAJOR forward, full match )
1442 GTAT ( is also MINOR reverse, full match )
1443 TTAA ( is also MAJOR forward, 2 mismatches )
1444 TTAC ( is also MAJOR reverse, 1 mismatch )
1445 TTAG ( is also MAJOR forward, 1 mismatch )
1446 TTAT ( is also MINOR reverse, 1 mismatch )
1447
1448 * *
1449 CAAA [0x40]=( RNA_SPLICE_REV | 0x20 )
1450 CAAC ( is also MAJOR reverse, 1 mismatch )
1451 CAAG [0x42]=( RNA_SPLICE_REV | 0x20 )
1452 CAAT [0x43]=( RNA_SPLICE_REV | 0x20 )
1453 CCAA [0x50]=( RNA_SPLICE_REV | 0x20 )
1454 CCAC ( is also MAJOR reverse, 1 mismatch )
1455 CCAG [0x52]=( RNA_SPLICE_REV | 0x20 )
1456 CCAT [0x53]=( RNA_SPLICE_REV | 0x20 )
1457 CGAA [0x60]=( RNA_SPLICE_REV | 0x20 )
1458 CGAC ( is also MAJOR reverse, 1 mismatch )
1459 CGAG [0x62]=( RNA_SPLICE_REV | 0x20 )
1460 CGAT [0x63]=( RNA_SPLICE_REV | 0x20 )
1461
1462 -----------------------------------------------------------
1463 GTAG ... MAJOR, forward
1464
1465 * *
1466 ATAG ( is also MAJOR forward, 1 mismatch )
1467 ATCG [0x36]=( RNA_SPLICE_FWD | 0x20 )
1468 ATGG [0x3A]=( RNA_SPLICE_FWD | 0x20 )
1469 ATTG [0x3E]=( RNA_SPLICE_FWD | 0x20 )
1470 CTAG ( is also MAJOR forward, 1 mismatch )
1471 CTCG [0x76]=( RNA_SPLICE_FWD | 0x20 )
1472 CTGG [0x7A]=( RNA_SPLICE_FWD | 0x20 )
1473 CTTG [0x7E]=( RNA_SPLICE_FWD | 0x20 )
1474 TTAG ( is also MAJOR forward, 1 mismatch )
1475 TTCG [0xF6]=( RNA_SPLICE_FWD | 0x20 )
1476 TTGG [0xFA]=( RNA_SPLICE_FWD | 0x20 )
1477 TTTG [0xFE]=( RNA_SPLICE_FWD | 0x20 )
1478
1479 **
1480 GAAG ( is also MAJOR forward, 1 mismatch )
1481 GACG [0x86]=( RNA_SPLICE_FWD | 0x20 )
1482 GAGG [0x8A]=( RNA_SPLICE_FWD | 0x20 )
1483 GATG [0x8E]=( RNA_SPLICE_FWD | 0x20 )
1484 GCAG ( is also MAJOR forward, 1 mismatch )
1485 GCCG [0x96]=( RNA_SPLICE_FWD | 0x20 )
1486 GCGG [0x9A]=( RNA_SPLICE_FWD | 0x20 )
1487 GCTG [0x9E]=( RNA_SPLICE_FWD | 0x20 )
1488 GGAG ( is also MAJOR forward, 1 mismatch )
1489 GGCG [0xA6]=( RNA_SPLICE_FWD | 0x20 )
1490 GGGG [0xAA]=( RNA_SPLICE_FWD | 0x20 )
1491 GGTG [0xAE]=( RNA_SPLICE_FWD | 0x20 )
1492
1493 * *
1494 ATAA ( is also MINOR forward, 1 mismatch )
1495 ATAC ( is also MINOR forward, full match )
1496 ATAG ( is also MAJOR forward, 1 mismatch )
1497 ATAT ( is also MINOR forward, 1 mismatch )
1498 CTAA ( is also MAJOR reverse, 1 mismatch )
1499 CTAC ( is also MAJOR reverse, full match )
1500 CTAG ( is also MAJOR forward, 1 mismatch )
1501 CTAT ( is also MAJOR reverse, 1 mismatch )
1502 TTAA [0xF0]=( RNA_SPLICE_FWD | 0x20 )
1503 TTAC ( is also MAJOR reverse, 1 mismatch )
1504 TTAG ( is also MAJOR forward, 1 mismatch )
1505 TTAT ( is also MINOR reverse, 1 mismatch )
1506
1507 * *
1508 GAAA [0x80]=( RNA_SPLICE_FWD | 0x20 )
1509 GAAC [0x81]=( RNA_SPLICE_FWD | 0x20 )
1510 GAAG ( is also MAJOR forward, 1 mismatch )
1511 GAAT ( is also MINOR reverse, 1 mismatch )
1512 GCAA [0x90]=( RNA_SPLICE_FWD | 0x20 )
1513 GCAC [0x91]=( RNA_SPLICE_FWD | 0x20 )
1514 GCAG ( is also MAJOR forward, 1 mismatch )
1515 GCAT ( is also MINOR reverse, 1 mismatch )
1516 GGAA [0xA0]=( RNA_SPLICE_FWD | 0x20 )
1517 GGAC [0xA1]=( RNA_SPLICE_FWD | 0x20 )
1518 GGAG ( is also MAJOR forward, 1 mismatch )
1519 GGAT ( is also MINOR reverse, 1 mismatch )
1520
1521 -----------------------------------------------------------
1522 GTAT ... MINOR, reverse
1523
1524 * *
1525 ATAT ( is also MINOR forward, 1 mismatch )
1526 ATCT [0x37]=( RNA_SPLICE_REV | 0x20 )
1527 ATGT [0x3B]=( RNA_SPLICE_REV | 0x20 )
1528 ATTT [0x3F]=( RNA_SPLICE_REV | 0x20 )
1529 CTAT ( is also MAJOR reverse, 1 mismatch )
1530 CTCT [0x77]=( RNA_SPLICE_REV | 0x20 )
1531 CTGT [0x7B]=( RNA_SPLICE_REV | 0x20 )
1532 CTTT [0x7F]=( RNA_SPLICE_REV | 0x20 )
1533 TTAT ( is also MINOR reverse, 1 mismatch )
1534 TTCT [0xF7]=( RNA_SPLICE_REV | 0x20 )
1535 TTGT [0xFB]=( RNA_SPLICE_REV | 0x20 )
1536 TTTT [0xFF]=( RNA_SPLICE_REV | 0x20 )
1537
1538 **
1539 GAAT ( is also MINOR reverse, 1 mismatch )
1540 GACT [0x87]=( RNA_SPLICE_REV | 0x20 )
1541 GAGT [0x8B]=( RNA_SPLICE_REV | 0x20 )
1542 GATT [0x8F]=( RNA_SPLICE_REV | 0x20 )
1543 GCAT ( is also MINOR reverse, 1 mismatch )
1544 GCCT [0x97]=( RNA_SPLICE_REV | 0x20 )
1545 GCGT [0x9B]=( RNA_SPLICE_REV | 0x20 )
1546 GCTT [0x9F]=( RNA_SPLICE_REV | 0x20 )
1547 GGAT ( is also MINOR reverse, 1 mismatch )
1548 GGCT [0xA7]=( RNA_SPLICE_REV | 0x20 )
1549 GGGT [0xAB]=( RNA_SPLICE_REV | 0x20 )
1550 GGTT [0xAF]=( RNA_SPLICE_REV | 0x20 )
1551
1552 * *
1553 ATAA ( is also MINOR forward, 1 mismatch )
1554 ATAC ( is also MINOR forward, full match )
1555 ATAG ( is also MAJOR forward, 1 mismatch )
1556 ATAT ( is also MINOR forward, 1 mismatch )
1557 CTAA ( is also MAJOR reverse, 1 mismatch )
1558 CTAC ( is also MAJOR reverse, full match )
1559 CTAG ( is also MAJOR forward, 1 mismatch )
1560 CTAT ( is also MAJOR reverse, 1 mismatch )
1561 TTAA ( is also MAJOR forward, 2 mismatches )
1562 TTAC ( is also MAJOR reverse, 1 mismatch )
1563 TTAG ( is also MAJOR forward, 1 mismatch )
1564 TTAT ( is also MINOR reverse, 1 mismatch )
1565
1566 * *
1567 GAAA ( is also MAJOR forward, 2 mismatches )
1568 GAAC ( is also MAJOR forward, 2 mismatches )
1569 GAAG ( is also MAJOR forward, 1 mismatch )
1570 GAAT ( is also MINOR reverse, 1 mismatch )
1571 GCAA ( is also MAJOR forward, 2 mismatches )
1572 GCAC ( is also MAJOR forward, 2 mismatches )
1573 GCAG ( is also MAJOR forward, 1 mismatch )
1574 GCAT ( is also MINOR reverse, 1 mismatch )
1575 GGAA ( is also MAJOR forward, 2 mismatches )
1576 GGAC ( is also MAJOR forward, 2 mismatches )
1577 GGAG ( is also MAJOR forward, 1 mismatch )
1578 GGAT ( is also MINOR reverse, 1 mismatch )
1579
1580 *************************************************************/
1581
check_rna_splicing_candidates_against_ref(struct ReferenceObj const * ref_obj,uint32_t splice_level,INSDC_coord_zero pos,rna_splice_candidates * candidates)1582 rc_t check_rna_splicing_candidates_against_ref( struct ReferenceObj const * ref_obj,
1583 uint32_t splice_level, /* 0, 1, 2 ... allowed mismatches */
1584 INSDC_coord_zero pos,
1585 rna_splice_candidates * candidates )
1586 {
1587 rc_t rc = 0;
1588 uint32_t idx;
1589 for ( idx = 0; idx < candidates->count && rc == 0; ++idx )
1590 {
1591 uint8_t splice[ 4 ];
1592 INSDC_coord_len written;
1593 rna_splice_candidate * rsc = &candidates->candidates[ idx ];
1594 INSDC_coord_zero rd_pos = ( pos + rsc->ref_offset );
1595 rc = ReferenceObj_Read( ref_obj, rd_pos, 2, splice, &written );
1596 if ( rc == 0 && written == 2 )
1597 {
1598 rd_pos += ( rsc->len - 2 );
1599 rc = ReferenceObj_Read( ref_obj, rd_pos, 2, &splice[ 2 ], &written );
1600 if ( rc == 0 && written == 2 )
1601 {
1602 uint8_t compressed = compress_4_bases_to_byte( splice ); /* 4 bases --> 1 byte */
1603 uint8_t match = compressed_to_fwd_reverse_0[ compressed ]; /* table lookup */
1604 uint8_t mismatches = ( match >> 4 );
1605
1606 if ( mismatches <= splice_level )
1607 {
1608 rsc->matched = match;
1609
1610 if ( ( match & RNA_SPLICE_FWD ) == RNA_SPLICE_FWD )
1611 candidates->fwd_matched++;
1612 else if ( ( match & RNA_SPLICE_REV ) == RNA_SPLICE_REV )
1613 candidates->rev_matched++;
1614 }
1615 }
1616 }
1617 }
1618 return rc;
1619 }
1620
1621
discover_rna_splicing_candidates(uint32_t cigar_len,const char * cigar,uint32_t min_len,rna_splice_candidates * candidates)1622 rc_t discover_rna_splicing_candidates( uint32_t cigar_len, const char * cigar, uint32_t min_len, rna_splice_candidates * candidates )
1623 {
1624 rc_t rc = 0;
1625 candidates->cigops_len = ( cigar_len / 2 ) + 1;
1626 candidates->cigops = malloc( ( sizeof * candidates->cigops ) * candidates->cigops_len );
1627 if ( candidates->cigops == NULL )
1628 rc = RC( rcExe, rcNoTarg, rcConstructing, rcMemory, rcExhausted );
1629 else
1630 {
1631 uint32_t ref_offset = 0;
1632 int32_t op_idx;
1633 CigOps * cigops = candidates->cigops;
1634
1635 candidates->n_cigops = ExplodeCIGAR( cigops, candidates->cigops_len, cigar, cigar_len );
1636 candidates->count = 0;
1637 for ( op_idx = 0; op_idx < ( candidates->n_cigops - 1 ); op_idx++ )
1638 {
1639 char op_code = cigops[ op_idx ].op;
1640 uint32_t op_len = cigops[ op_idx ].oplen;
1641 if ( (op_code == 'D' || op_code == 'N') && op_len >= min_len && candidates->count < MAX_RNA_SPLICE_CANDIDATES )
1642 {
1643 rna_splice_candidate * rsc = &candidates->candidates[ candidates->count++ ];
1644 rsc->ref_offset = ref_offset;
1645 rsc->len = op_len;
1646 rsc->op_idx = op_idx;
1647 rsc->matched = RNA_SPLICE_UNKNOWN; /* we dont know that yet, caller has to do that ( sam-aligned.c ) */
1648 }
1649 if ( op_code == 'M' || op_code == 'X' || op_code == '=' || op_code == 'D' || op_code == 'N' )
1650 ref_offset += op_len;
1651 }
1652 }
1653 return rc;
1654 }
1655
1656
1657
change_rna_splicing_cigar(uint32_t cigar_len,char * cigar,rna_splice_candidates * candidates,uint32_t * NM_adjustment)1658 rc_t change_rna_splicing_cigar( uint32_t cigar_len, char * cigar, rna_splice_candidates * candidates, uint32_t * NM_adjustment )
1659 {
1660 rc_t rc = 0;
1661 uint32_t winner, sum_of_n_lengths = 0;
1662 int32_t idx, dst;
1663 CigOps * cigops = candidates->cigops;
1664
1665 /* handle the special case that we do have forward and reverse candidates in one alignement, that cannot be!
1666 we declare a winner ( the direction that occurs most ), zero out the looser(s) and give a warning */
1667 if ( candidates->fwd_matched > candidates->rev_matched )
1668 winner = RNA_SPLICE_FWD;
1669 else
1670 winner = RNA_SPLICE_REV;
1671
1672 for ( idx = 0; idx < candidates->count; ++idx )
1673 {
1674 rna_splice_candidate * rsc = &candidates->candidates[ idx ];
1675 if ( ( rsc->matched & 0x0F ) == winner && cigops[ rsc->op_idx ].op == 'D' )
1676 {
1677 cigops[ rsc->op_idx ].op = 'N';
1678 sum_of_n_lengths += rsc->len;
1679 }
1680 }
1681
1682 for ( idx = 0, dst = 0; idx < ( candidates->n_cigops - 1 ) && rc == 0; ++idx )
1683 {
1684 size_t sz;
1685 rc = string_printf( &cigar[ dst ], cigar_len + 1 - dst, &sz, "%u%c", cigops[ idx ].oplen, cigops[ idx ].op );
1686 dst += sz;
1687 }
1688 if ( NM_adjustment != NULL )
1689 *NM_adjustment = sum_of_n_lengths;
1690 return rc;
1691 }
cg_canonical_print_cigar(const char * cigar,size_t cigar_len)1692 rc_t cg_canonical_print_cigar( const char * cigar, size_t cigar_len)
1693 {
1694 rc_t rc;
1695 if ( cigar_len > 0 )
1696 {
1697 int i,total_cnt,cnt;
1698 char op;
1699 for(i=0,cnt=0,op=0,cnt=0;i<cigar_len;i++){
1700 if(isdigit(cigar[i])){
1701 cnt=cnt*10+(cigar[i]-'0');
1702 } else if(isalpha(cigar[i])){
1703 if(op=='\0'){ /** first op **/
1704 total_cnt=cnt;
1705 } else if(op==cigar[i]){ /** merging consequitive ops **/
1706 total_cnt+=cnt;
1707 } else {
1708 if(total_cnt > 0) KOutMsg( "%d%c", total_cnt,op );
1709 total_cnt=cnt;
1710 }
1711 op=cigar[i];
1712 cnt=0;
1713 } else {
1714 assert(0); /*** should never happen inside this function ***/
1715 }
1716 }
1717 if(total_cnt && op) KOutMsg( "%d%c", total_cnt,op );
1718 }
1719 else
1720 rc = KOutMsg( "*" );
1721 return rc;
1722 }
1723
1724