1 static char rcsid[] = "$Id: dynprog_single.c 218162 2019-01-17 05:59:55Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "dynprog_single.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <math.h> /* For ceil, log, pow */
11 #include <ctype.h> /* For tolower */
12
13
14 #include "bool.h"
15 #include "except.h"
16 #include "assert.h"
17 #include "mem.h"
18 #include "comp.h"
19 #include "pair.h"
20 #include "pairdef.h"
21 #include "boyer-moore.h"
22 #include "complement.h"
23 #include "intron.h"
24 #include "maxent.h"
25 #include "maxent_hr.h"
26 #include "fastlog.h"
27 #include "dynprog_simd.h"
28
29
30 /* Tests whether get_genomic_nt == genomicseg in compute_scores procedures */
31 /* #define EXTRACT_GENOMICSEG 1 */
32
33
34 /* Prints parameters and results */
35 #ifdef DEBUG
36 #define debug(x) x
37 #else
38 #define debug(x)
39 #endif
40
41 /* Microexon search */
42 #ifdef DEBUG1
43 #define debug1(x) x
44 #else
45 #define debug1(x)
46 #endif
47
48 /* Getting genomic nt */
49 #ifdef DEBUG8
50 #define debug8(x) x
51 #else
52 #define debug8(x)
53 #endif
54
55 /* print_vector */
56 #ifdef DEBUG15
57 #define debug15(x) x
58 #else
59 #define debug15(x)
60 #endif
61
62 /* Homopolymer (e.g., PacBio) */
63 #ifdef DEBUG16
64 #define debug16(x) x
65 #else
66 #define debug16(x)
67 #endif
68
69 /* Homopolymer details */
70 #ifdef DEBUG16A
71 #define debug16a(x) x
72 #else
73 #define debug16a(x)
74 #endif
75
76
77 #define MICROEXON_PVALUE_HIGHQ 0.01
78 #define MICROEXON_PVALUE_MEDQ 0.001
79 #define MICROEXON_PVALUE_LOWQ 0.0001
80 #define ENDSEQUENCE_PVALUE 0.001 /* Have stricter threshold for making end exons */
81
82 #define MIN_MICROEXON_LENGTH 3
83 #ifdef PMAP
84 #define MAX_MICROEXON_LENGTH 17 /* Should be oligomer length - 1 plus peelback */
85 #else
86 #define MAX_MICROEXON_LENGTH 12 /* Should be oligomer length - 1 plus peelback */
87 #endif
88 #define MICROINTRON_LENGTH 9
89
90
91
92 #define T Dynprog_T
93
94 bool homopolymerp;
95
96 void
Dynprog_single_setup(bool homopolymerp_in)97 Dynprog_single_setup (bool homopolymerp_in) {
98
99 homopolymerp = homopolymerp_in;
100
101 return;
102 }
103
104
105 static char complCode[128] = COMPLEMENT_LC;
106
107 static char
get_genomic_nt(char * g_alt,int genomicpos,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)108 get_genomic_nt (char *g_alt, int genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
109 bool watsonp) {
110 char c2, c2_alt;
111 Univcoord_T pos;
112
113 #if 0
114 /* If the read has a deletion, then we will extend beyond 0 or genomiclength, so do not restrict. */
115 if (genomicpos < 0) {
116 return '*';
117
118 } else if (genomicpos >= genomiclength) {
119 return '*';
120
121 }
122 #endif
123
124 if (watsonp) {
125 if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
126 *g_alt = '*';
127 return '*';
128
129 } else if (pos >= chrhigh) {
130 *g_alt = '*';
131 return '*';
132
133 #if 0
134 } else if (genome) {
135 /* Not necessary, because Genome_get_char_blocks should work */
136 debug8(printf("At %u, genomicnt is %c\n",
137 genomicpos,Genome_get_char(genome,pos)));
138 return Genome_get_char(genome,pos);
139 #endif
140
141 } else {
142 /* GMAP with user-supplied genomic segment */
143 debug8(printf("At %u, genomicnt is %c\n",
144 genomicpos,Genome_get_char_blocks(pos)));
145 return Genome_get_char_blocks(&(*g_alt),pos);
146 }
147
148 } else {
149 if ((pos = chrhigh - genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
150 *g_alt = '*';
151 return '*';
152
153 } else if (pos >= chrhigh) {
154 *g_alt = '*';
155 return '*';
156
157 #if 0
158 } else if (genome) {
159 /* Not necessary, because Genome_get_char_blocks should work */
160 c2 = Genome_get_char(genome,pos);
161 #endif
162
163 } else {
164 /* GMAP with user-supplied genomic segment */
165 c2 = Genome_get_char_blocks(&c2_alt,pos);
166 }
167 debug8(printf("At %u, genomicnt is %c\n",genomicpos,complCode[(int) c2]));
168 *g_alt = complCode[(int) c2_alt];
169 return complCode[(int) c2];
170 }
171 }
172
173
174 /************************************************************************
175 * Homopolymer
176 ************************************************************************/
177
178 static List_T
augment_pairs(List_T pairs,int * rsequence_nreps,int roffset,int * gsequence_nreps,int goffset,Pairpool_T pairpool,int dynprogindex)179 augment_pairs (List_T pairs, int *rsequence_nreps, int roffset,
180 int *gsequence_nreps, int goffset,
181 Pairpool_T pairpool, int dynprogindex) {
182 List_T augmented = NULL, p;
183 Pair_T pair;
184 int r, c, r_nreps, c_nreps, i;
185 int r_cum = 0, c_cum = 0;
186
187
188 #ifdef DEBUG16
189 printf("r_nreps: ");
190 for (i = 0; i < r_uniqlength; i++) {
191 printf("%d",rsequence_nreps[i]);
192 }
193 printf("\n");
194
195 printf("g_nreps: ");
196 for (i = 0; i < g_uniqlength; i++) {
197 printf("%d",gsequence_nreps[i]);
198 }
199 printf("\n");
200 #endif
201
202 for (p = pairs; p != NULL; p = p->rest) {
203 pair = (Pair_T) p->first;
204 r = pair->querypos - roffset;
205 c = pair->genomepos - goffset;
206
207 if (pair->comp == INDEL_COMP) {
208 augmented = Pairpool_push_existing(augmented,pairpool,pair);
209 debug16a(Pair_dump_one(pair,true));
210 pair->querypos += r_cum;
211 pair->genomepos += c_cum;
212
213 if (pair->cdna == ' ') {
214 c_nreps = gsequence_nreps[c];
215 debug16a(printf(" genomepos %u, c_cum %d, nreps %d\n",c,c_cum,c_nreps));
216 if (c_nreps == 0) {
217 /* Do nothing */
218 } else {
219 for (i = 1; i <= c_nreps; i++) {
220 augmented = Pairpool_push(augmented,pairpool,r+r_cum+roffset,c+c_cum+i+goffset,
221 /*cdna*/' ',INDEL_COMP,pair->genome,pair->genomealt,
222 dynprogindex);
223 }
224 c_cum += c_nreps;
225 }
226
227 } else if (pair->genome == ' ') {
228 r_nreps = rsequence_nreps[r];
229 debug16a(printf(" querypos %d, r_cum %d, nreps %d\n",r,r_cum,r_nreps));
230 if (r_nreps == 0) {
231 /* Do nothing */
232 } else {
233 for (i = 1; i <= r_nreps; i++) {
234 augmented = Pairpool_push(augmented,pairpool,r+r_cum+i+roffset,c+c_cum+goffset,
235 pair->cdna,INDEL_COMP,/*genome*/' ',/*genomealt*/' ',
236 dynprogindex);
237 }
238 r_cum += r_nreps;
239 }
240
241 } else {
242 fprintf(stderr,"Indel pair is missing both cdna and genome nts\n");
243 abort();
244 }
245
246 } else {
247 r_nreps = rsequence_nreps[r];
248 c_nreps = gsequence_nreps[c];
249 debug16a(printf(" querypos %d, r_cum %d, nreps %d, genomepos %u, c_cum %d, nreps %d\n",
250 r,r_cum,r_nreps,c,c_cum,c_nreps));
251 augmented = Pairpool_push_existing(augmented,pairpool,pair);
252 pair->querypos += r_cum;
253 pair->genomepos += c_cum;
254 if (r_nreps == 0 && c_nreps == 0) {
255 /* Do nothing */
256 } else if (r_nreps == c_nreps) {
257 for (i = 1; i <= r_nreps; i++) {
258 augmented = Pairpool_push_copy(augmented,pairpool,pair);
259 ((Pair_T) augmented->first)->querypos += i;
260 ((Pair_T) augmented->first)->genomepos += i;
261 }
262 r_cum += r_nreps;
263 c_cum += c_nreps;
264
265 } else if (r_nreps < c_nreps) {
266 for (i = 1; i <= r_nreps; i++) {
267 augmented = Pairpool_push_copy(augmented,pairpool,pair);
268 ((Pair_T) augmented->first)->querypos += i;
269 ((Pair_T) augmented->first)->genomepos += i;
270 }
271 r_cum += r_nreps;
272
273 for ( ; i <= c_nreps; i++) {
274 /* Add 1 to r to advance to next coordinate */
275 augmented = Pairpool_push(augmented,pairpool,r+r_cum+roffset + 1,c+c_cum+i+goffset,
276 /*cdna*/' ',INDEL_COMP,pair->genome,pair->genomealt,
277 dynprogindex);
278 }
279 c_cum += c_nreps;
280
281 } else {
282 for (i = 1; i <= c_nreps; i++) {
283 augmented = Pairpool_push_copy(augmented,pairpool,pair);
284 ((Pair_T) augmented->first)->querypos += i;
285 ((Pair_T) augmented->first)->genomepos += i;
286 }
287 c_cum += c_nreps;
288
289 for ( ; i <= r_nreps; i++) {
290 /* Add 1 to c to advance to next coordinate */
291 augmented = Pairpool_push(augmented,pairpool,r+r_cum+i+roffset,c+c_cum+goffset + 1,
292 pair->cdna,INDEL_COMP,/*genome*/' ',/*genomealt*/' ',
293 dynprogindex);
294 }
295 r_cum += r_nreps;
296
297 }
298 }
299 }
300
301 debug16(Pair_dump_list(augmented,true));
302
303 return List_reverse(augmented);
304 }
305
306 static char *
uniq_string(int ** nreps,int * uniqlength,char * string,int length)307 uniq_string (int **nreps, int *uniqlength, char *string, int length) {
308 char *uniq, *p, nt, lastnt;
309 int i, k, *a;
310
311 *uniqlength = 1;
312 lastnt = string[0];
313 for (i = 1; i < length; i++) {
314 if ((nt = string[i]) != lastnt) {
315 (*uniqlength)++;
316 lastnt = nt;
317 }
318 }
319
320 p = uniq = (char *) MALLOC(((*uniqlength) + 1) * sizeof(char));
321 a = *nreps = (int *) MALLOC((*uniqlength) * sizeof(int));
322 k = 0;
323
324 lastnt = string[0];
325 for (i = 1; i < length; i++) {
326 if ((nt = string[i]) != lastnt) {
327 *p++ = lastnt;
328 *a++ = k;
329 lastnt = nt;
330 k = 0;
331 } else {
332 k++;
333 }
334 }
335 *p = lastnt;
336 *a = k;
337
338 #ifdef DEBUG16
339 printf("string: %.*s\n",length,string);
340 printf("uniq: %.*s\n",*uniqlength,uniq);
341 printf("nreps ");
342 for (i = 0; i < *uniqlength; i++) {
343 printf("%d",(*nreps)[i]);
344 }
345 printf("\n");
346 #endif
347
348 return uniq;
349 }
350
351
352
353 static List_T
single_gap_simple(int * finalscore,int * nmatches,int * nmismatches,char * rsequence,char * rsequenceuc,int rlength,char * gsequence,char * gsequence_alt,int roffset,int goffset,Pairpool_T pairpool,int mismatchtype,int genestrand,int dynprogindex)354 single_gap_simple (int *finalscore, int *nmatches, int *nmismatches,
355 char *rsequence, char *rsequenceuc, int rlength, char *gsequence, char *gsequence_alt,
356 int roffset, int goffset, Pairpool_T pairpool,
357 int mismatchtype, int genestrand, int dynprogindex) {
358 int score;
359 List_T pairs = NULL;
360 int r;
361 int querycoord, genomecoord;
362 int c1, c1_uc, c2, c2_alt;
363 Pairdistance_T **pairdistance_array_type;
364
365 debug(printf("Starting single_gap_simple\n"));
366 pairdistance_array_type = pairdistance_array[mismatchtype];
367
368 *finalscore = 0;
369 *nmatches = *nmismatches = 0;
370
371 /* Push from left to right, so we don't need to do List_reverse() later */
372 for (r = 1; r <= rlength; r++) {
373 querycoord = genomecoord = r-1;
374
375 c1 = rsequence[querycoord];
376 c1_uc = rsequenceuc[querycoord];
377 c2 = gsequence[genomecoord];
378 c2_alt = gsequence_alt[genomecoord];
379
380 if (c2 == '*') {
381 /* Don't push pairs past end of chromosome */
382 debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u\n",goffset,genomecoord));
383
384 } else if (c1_uc == c2 || c1_uc == c2_alt) {
385 debug(printf("Pushing simple %d,%d [%d,%d] (%c,%c) - match\n",
386 r,/*c*/r,roffset+querycoord,goffset+genomecoord,c1_uc,c2));
387 score = pairdistance_array_type[c1_uc][c2];
388 if (pairdistance_array_type[c1_uc][c2_alt] > score) {
389 score = pairdistance_array_type[c1_uc][c2_alt];
390 }
391 *finalscore += score;
392 *nmatches += 1;
393 pairs = Pairpool_push(pairs,pairpool,roffset+querycoord,goffset+genomecoord,
394 c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
395
396 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
397 debug(printf("Pushing simple %d,%d [%d,%d] (%c,%c) - ambiguous\n",
398 r,/*c*/r,roffset+querycoord,goffset+genomecoord,c1_uc,c2));
399 score = pairdistance_array_type[c1_uc][c2];
400 if (pairdistance_array_type[c1_uc][c2_alt] > score) {
401 score = pairdistance_array_type[c1_uc][c2_alt];
402 }
403 *finalscore += score;
404 *nmatches += 1;
405 pairs = Pairpool_push(pairs,pairpool,roffset+querycoord,goffset+genomecoord,
406 c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
407
408 } else {
409 debug(printf("Pushing simple %d,%d [%d,%d] (%c,%c) - mismatch\n",
410 r,/*c*/r,roffset+querycoord,goffset+genomecoord,c1_uc,c2));
411 score = pairdistance_array_type[c1_uc][c2];
412 if (pairdistance_array_type[c1_uc][c2_alt] > score) {
413 score = pairdistance_array_type[c1_uc][c2_alt];
414 }
415 *finalscore += score;
416 *nmismatches += 1;
417 pairs = Pairpool_push(pairs,pairpool,roffset+querycoord,goffset+genomecoord,
418 c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
419 }
420 }
421
422 if (*nmismatches > 1) {
423 return (List_T) NULL;
424 } else {
425 return pairs;
426 }
427 }
428
429
430 List_T
Dynprog_single_gap(int * dynprogindex,int * finalscore,int * nmatches,int * nmismatches,int * nopens,int * nindels,T dynprog,char * rsequence,char * rsequenceuc,int rlength,int glength,int roffset,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,int extraband_single,double defect_rate,bool widebandp)431 Dynprog_single_gap (int *dynprogindex, int *finalscore, int *nmatches, int *nmismatches, int *nopens, int *nindels,
432 T dynprog, char *rsequence, char *rsequenceuc,
433 int rlength, int glength, int roffset, int goffset,
434 Univcoord_T chroffset, Univcoord_T chrhigh,
435 bool watsonp, int genestrand, bool jump_late_p, Pairpool_T pairpool,
436 int extraband_single, double defect_rate, bool widebandp) {
437 List_T pairs = NULL;
438 char *gsequence, *gsequence_alt;
439
440 char *gsequence_orig, *rsequence_orig;
441 int *gsequence_nreps, *rsequence_nreps;
442 int glength_orig, rlength_orig;
443
444 Mismatchtype_T mismatchtype;
445 int lband, uband;
446 int open, extend;
447 #if defined(HAVE_SSE2)
448 Score8_T **matrix8;
449 Direction8_T **directions8_nogap, **directions8_Egap, **directions8_Fgap;
450
451 Score16_T **matrix16;
452 Direction16_T **directions16_nogap, **directions16_Egap, **directions16_Fgap;
453 #else
454 Score32_T **matrix;
455 Direction32_T **directions_nogap, **directions_Egap, **directions_Fgap;
456 #endif
457 /* bool onesidegapp; */
458
459 if (defect_rate < DEFECT_HIGHQ) {
460 mismatchtype = HIGHQ;
461 open = SINGLE_OPEN_HIGHQ;
462 extend = SINGLE_EXTEND_HIGHQ;
463 /* onesidegapp = false; */
464 } else if (defect_rate < DEFECT_MEDQ) {
465 mismatchtype = MEDQ;
466 open = SINGLE_OPEN_MEDQ;
467 extend = SINGLE_EXTEND_MEDQ;
468 /* onesidegapp = true; */
469 } else {
470 mismatchtype = LOWQ;
471 open = SINGLE_OPEN_LOWQ;
472 extend = SINGLE_EXTEND_LOWQ;
473 /* onesidegapp = true; */
474 }
475
476 #if 0
477 if (close_indels_mode == +1) {
478 /* Allow close indels */
479 onesidegapp = false;
480 } else if (close_indels_mode == -1) {
481 /* Disallow close indels */
482 onesidegapp = true;
483 } else {
484 /* Allow close indels for high quality alignments, as determined above */
485 }
486 #endif
487
488 /* Rlength: maxlookback+MAXPEELBACK. Glength +EXTRAMATERIAL */
489 debug(printf("%c: ",*dynprogindex > 0 ? (*dynprogindex-1)%26+'a' : (-(*dynprogindex)-1)%26+'A'));
490 debug(printf("Aligning single gap middle with wideband = %d and extraband %d\n",widebandp,extraband_single));
491 #ifdef EXTRACT_GENOMICSEG
492 debug(printf("At genomic offset %d-%d, %.*s\n",goffset,goffset+glength-1,glength,gsequence));
493 #endif
494 debug(printf("\n"));
495
496 #if 0
497 /* Can be violated when extension search in GSNAP tries different diagonals */
498 assert(glength > 0);
499 #endif
500
501 if (rlength <= 0 || glength <= 0 ||
502 rlength > dynprog->max_rlength || glength > dynprog->max_glength) {
503 debug(printf("rlength %d or glength %d is too long. Returning NULL\n",rlength,glength));
504 *finalscore = NEG_INFINITY_32;
505 *nmatches = *nmismatches = *nopens = *nindels = 0;
506 #if 0
507 /* Don't push a gapholder for single gap, because gapholder already exists */
508 pairs = Pairpool_push_gapholder(NULL,pairpool,rlength,glength,
509 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
510 #endif
511 *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
512 return (List_T) NULL;
513 }
514
515 debug(printf("At query offset %d-%d, %.*s\n",roffset,roffset+rlength-1,rlength,rsequence));
516
517 /* If extraband_single is too large, then gaps may be inserted on
518 both sides, like this
519
520 CACCC AGAGCAGGCACTGCCT
521 |||||--- ||| ||---|||||
522 CACCCCAGGGAGGAG CTGCCC
523
524 */
525
526
527 if (homopolymerp == true) {
528 rsequence_orig = rsequence;
529 rlength_orig = rlength;
530 rsequence = uniq_string(&rsequence_nreps,&rlength,rsequence_orig,rlength_orig);
531
532 gsequence_orig = (char *) MALLOCA((glength+1) * sizeof(char));
533 gsequence_alt = (char *) MALLOCA((glength+1) * sizeof(char));
534
535 if (watsonp) {
536 Genome_get_segment_blocks_right(gsequence_orig,gsequence_alt,/*left*/chroffset+goffset,
537 glength,chrhigh,/*revcomp*/false);
538 } else {
539 Genome_get_segment_blocks_left(gsequence_orig,gsequence_alt,/*right*/chrhigh-goffset+1,
540 glength,chroffset,/*revcomp*/true);
541 }
542 if (gsequence_orig[0] == '\0') {
543 *finalscore = NEG_INFINITY_32;
544 *nmatches = *nmismatches = *nopens = *nindels = 0;
545 FREEA(gsequence_alt);
546 FREEA(gsequence_orig);
547 return (List_T) NULL;
548 } else {
549 glength_orig = glength;
550 rsequence = uniq_string(&gsequence_nreps,&glength,gsequence_orig,glength_orig);
551 }
552
553 } else {
554 gsequence = (char *) MALLOCA((glength+1) * sizeof(char));
555 gsequence_alt = (char *) MALLOCA((glength+1) * sizeof(char));
556
557 if (watsonp) {
558 Genome_get_segment_blocks_right(gsequence,gsequence_alt,/*left*/chroffset+goffset,
559 glength,chrhigh,/*revcomp*/false);
560 } else {
561 Genome_get_segment_blocks_left(gsequence,gsequence_alt,/*right*/chrhigh-goffset+1,
562 glength,chroffset,/*revcomp*/true);
563 }
564 if (gsequence[0] == '\0') {
565 *finalscore = NEG_INFINITY_32;
566 *nmatches = *nmismatches = *nopens = *nindels = 0;
567 FREEA(gsequence_alt);
568 FREEA(gsequence);
569 return (List_T) NULL;
570 } else if (glength == rlength &&
571 (pairs = single_gap_simple(&(*finalscore),&(*nmatches),&(*nmismatches),
572 rsequence,rsequenceuc,rlength,gsequence,gsequence_alt,roffset,goffset,
573 pairpool,mismatchtype,genestrand,*dynprogindex)) != NULL) {
574 FREEA(gsequence_alt);
575 FREEA(gsequence);
576
577 *nopens = *nindels = 0;
578 *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
579 return pairs;
580 }
581 }
582
583
584 Dynprog_compute_bands(&lband,&uband,rlength,glength,extraband_single,widebandp);
585 #if defined(HAVE_SSE2)
586 /* Use || because we want the minimum length (which determines the diagonal length) to achieve a score less than 128 */
587 /* Use && because we don't want to overflow in either direction */
588 if (rlength < use8p_size[mismatchtype] && glength < use8p_size[mismatchtype]) {
589 matrix8 = Dynprog_simd_8(&directions8_nogap,&directions8_Egap,&directions8_Fgap,dynprog,
590 rsequence,gsequence,gsequence_alt,rlength,glength,
591 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
592 goffset,chroffset,chrhigh,watsonp,
593 #endif
594 mismatchtype,open,extend,
595 lband,uband,jump_late_p,/*revp*/false);
596 *finalscore = (int) matrix8[glength][rlength];
597
598 *nmatches = *nmismatches = *nopens = *nindels = 0;
599 pairs = Dynprog_traceback_8(NULL,&(*nmatches),&(*nmismatches),&(*nopens),&(*nindels),
600 directions8_nogap,directions8_Egap,directions8_Fgap,rlength,glength,
601 rsequence,rsequenceuc,
602 gsequence,gsequence_alt,roffset,goffset,pairpool,/*revp*/false,
603 chroffset,chrhigh,watsonp,genestrand,*dynprogindex);
604
605 } else {
606 matrix16 = Dynprog_simd_16(&directions16_nogap,&directions16_Egap,&directions16_Fgap,dynprog,
607 rsequence,gsequence,gsequence_alt,rlength,glength,
608 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
609 goffset,chroffset,chrhigh,watsonp,
610 #endif
611 mismatchtype,open,extend,
612 lband,uband,jump_late_p,/*revp*/false);
613 *finalscore = (int) matrix16[glength][rlength];
614
615 *nmatches = *nmismatches = *nopens = *nindels = 0;
616 pairs = Dynprog_traceback_16(NULL,&(*nmatches),&(*nmismatches),&(*nopens),&(*nindels),
617 directions16_nogap,directions16_Egap,directions16_Fgap,rlength,glength,
618 rsequence,rsequenceuc,
619 gsequence,gsequence_alt,roffset,goffset,pairpool,/*revp*/false,
620 chroffset,chrhigh,watsonp,genestrand,*dynprogindex);
621 }
622
623 #else
624
625 matrix = Dynprog_standard(&directions_nogap,&directions_Egap,&directions_Fgap,dynprog,
626 rsequence,gsequence,gsequence_alt,rlength,glength,
627 goffset,chroffset,chrhigh,watsonp,mismatchtype,open,extend,
628 lband,uband,jump_late_p,/*revp*/false,/*saturation*/NEG_INFINITY_INT,
629 /*upperp*/true,/*lowerp*/true);
630 *finalscore = (int) matrix[glength][rlength];
631
632 *nmatches = *nmismatches = *nopens = *nindels = 0;
633 pairs = Dynprog_traceback_std(NULL,&(*nmatches),&(*nmismatches),&(*nopens),&(*nindels),
634 directions_nogap,directions_Egap,directions_Fgap,rlength,glength,
635 rsequence,rsequenceuc,
636 gsequence,gsequence_alt,roffset,goffset,pairpool,/*revp*/false,
637 chroffset,chrhigh,watsonp,genestrand,*dynprogindex);
638 #endif
639
640 if (homopolymerp == true) {
641 pairs = augment_pairs(pairs,rsequence_nreps,roffset,
642 gsequence_nreps,goffset,pairpool,*dynprogindex);
643 FREE(gsequence_nreps);
644 FREEA(gsequence_orig);
645 FREEA(gsequence);
646
647 FREE(rsequence_nreps);
648 /* Do not free rsequence_orig */
649 FREE(rsequence);
650
651 } else {
652 FREEA(gsequence_alt);
653 FREEA(gsequence);
654 }
655
656 /*
657 Directions_free(directions);
658 Matrix_free(matrix);
659 */
660
661 debug(printf("End of dynprog single gap\n"));
662
663 *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
664 return List_reverse(pairs);
665 }
666
667
668
669 static const Except_T microexon_error = {"Microexon error"};
670
671 static List_T
make_microexon_pairs_double(int roffsetL,int roffsetM,int roffsetR,int goffsetL,int goffsetM,int goffsetR,int lengthL,int lengthM,int lengthR,char * rsequence,char * rsequenceuc,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,Pairpool_T pairpool,char gapchar,int dynprogindex)672 make_microexon_pairs_double (int roffsetL, int roffsetM, int roffsetR,
673 int goffsetL, int goffsetM, int goffsetR,
674 int lengthL, int lengthM, int lengthR,
675 char *rsequence, char *rsequenceuc,
676 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
677 Pairpool_T pairpool, char gapchar, int dynprogindex) {
678 List_T pairs = NULL;
679 Pair_T gappair;
680 char c1, c1_uc, c2, c2_alt;
681 int i;
682
683 /* Left segment */
684 for (i = 0; i < lengthL; i++) {
685 c1 = rsequence[roffsetL+i];
686 c1_uc = rsequenceuc[roffsetL+i];
687
688 c2 = get_genomic_nt(&c2_alt,goffsetL+i,chroffset,chrhigh,watsonp);
689 #ifdef EXTRACT_GENOMICSEG
690 assert(c2 == genomicseg[goffsetL+i]);
691 #endif
692
693 if (c1_uc == c2 || c1_uc == c2_alt) {
694 pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
695 dynprogindex);
696 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
697 pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
698 dynprogindex);
699 } else {
700 pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,MISMATCH_COMP,c2,c2_alt,
701 dynprogindex);
702 }
703 }
704
705 /* First gap */
706 /* Don't have to adjust querypos/genomepos, since no cdna/genome skips allowed */
707 pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/0,/*genomejump*/goffsetM-(goffsetL+lengthL),
708 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
709
710 /* Assign pair->comp, because might occur after assign_gap_types */
711 gappair = (Pair_T) List_head(pairs);
712 gappair->comp = gapchar;
713
714
715 /* Microexon */
716 for (i = 0; i < lengthM; i++) {
717 c1 = rsequence[roffsetM+i];
718 c1_uc = rsequenceuc[roffsetM+i];
719
720 c2 = get_genomic_nt(&c2_alt,goffsetM+i,chroffset,chrhigh,watsonp);
721 #ifdef EXTRACT_GENOMICSEG
722 assert(c2 == genomicseg[goffsetM+i]);
723 #endif
724
725 if (c1_uc == c2 || c1_uc == c2_alt) {
726 pairs = Pairpool_push(pairs,pairpool,roffsetM+i,goffsetM+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
727 dynprogindex);
728 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
729 pairs = Pairpool_push(pairs,pairpool,roffsetM+i,goffsetM+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
730 dynprogindex);
731 } else {
732 pairs = Pairpool_push(pairs,pairpool,roffsetM+i,goffsetM+i,c1,MISMATCH_COMP,c2,c2_alt,
733 dynprogindex);
734 }
735 }
736
737 /* Second gap */
738 /* Don't have to adjust querypos/genomepos, since no cdna/genome skips allowed */
739 if (lengthR == 0) {
740 /* If lengthR is zero, then we will have a gap after a gap */
741 Except_raise(µexon_error,__FILE__,__LINE__);
742 } else {
743 pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/0,/*genomejump*/goffsetR-(goffsetM+lengthM),
744 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
745 }
746
747 /* Assign pair->comp, because might occur after assign_gap_types */
748 gappair = (Pair_T) List_head(pairs);
749 gappair->comp = gapchar;
750
751
752 /* Right segment */
753 for (i = 0; i < lengthR; i++) {
754 c1 = rsequence[roffsetR+i];
755 c1_uc = rsequenceuc[roffsetR+i];
756
757 c2 = get_genomic_nt(&c2_alt,goffsetR+i,chroffset,chrhigh,watsonp);
758 #ifdef EXTRACT_GENOMICSEG
759 assert(c2 == genomicseg[goffsetR+i]);
760 #endif
761
762 if (c1_uc == c2 || c1_uc == c2_alt) {
763 pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
764 dynprogindex);
765 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
766 pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
767 dynprogindex);
768 } else {
769 pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,MISMATCH_COMP,c2,c2_alt,
770 dynprogindex);
771 }
772 }
773
774 return pairs;
775 }
776
777
778 #if 0
779 static List_T
780 make_microexon_pairs_single (int roffsetL, int roffsetR,
781 int goffsetL, int goffsetR,
782 int lengthL, int lengthR, char *rsequence, char *rsequenceuc,
783 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
784 Pairpool_T pairpool, char gapchar, int dynprogindex) {
785 List_T pairs = NULL;
786 Pair_T gappair;
787 char c1, c1_uc, c2, c2_alt;
788 int i;
789
790 /* Microexon */
791 for (i = 0; i < lengthL; i++) {
792 c1 = rsequence[roffsetL+i];
793 c1_uc = rsequenceuc[roffsetL+i];
794
795 c2 = get_genomic_nt(&c2_alt,goffsetL+i,chroffset,chrhigh,watsonp);
796 #ifdef EXTRACT_GENOMICSEG
797 assert(c2 == genomicseg[goffsetL+i]);
798 #endif
799
800 if (c1_uc == c2 || c1_uc == c2_alt) {
801 pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
802 dynprogindex);
803 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
804 pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
805 dynprogindex);
806 } else {
807 pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,MISMATCH_COMP,c2,c2_alt,
808 dynprogindex);
809 }
810 }
811
812 /* Gap */
813 /* Don't have to adjust querypos/genomepos, since no cdna/genome skips allowed */
814 pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/0,/*genomejump*/goffsetR-(goffsetL+lengthL),
815 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
816
817 /* Assign pair->comp, because might occur after assign_gap_types */
818 gappair = (Pair_T) List_head(pairs);
819 gappair->comp = gapchar;
820
821
822 /* Right segment */
823 for (i = 0; i < lengthR; i++) {
824 c1 = rsequence[roffsetR+i];
825 c1_uc = rsequenceuc[roffsetR+i];
826
827 c2 = get_genomic_nt(&c2_alt,goffsetR+i,chroffset,chrhigh,watsonp);
828 #ifdef EXTRACT_GENOMICSEG
829 assert(c2 == genomicseg[goffsetR+i]);
830 #endif
831
832 if (c1_uc == c2 || c1_uc == c2_alt) {
833 pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
834 dynprogindex);
835 } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
836 pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
837 dynprogindex);
838 } else {
839 pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,MISMATCH_COMP,c2,c2_alt,
840 dynprogindex);
841 }
842 }
843
844 return pairs;
845 }
846 #endif
847
848
849 /************************************************************************/
850
851 #ifdef PMAP
852 /* Same as in boyer-moore.c */
853 /* Handle only cases in iupac table in sequence.c */
854 static bool matchtable[26][26] =
855 /* A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z */
856 {{1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0}, /* A */
857 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* B */
858 {0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0}, /* C */
859 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* D */
860 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* E */
861 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* F */
862 {0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0}, /* G */
863 {1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,0,1,0,1,0}, /* H = [ACT] */
864 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* I */
865 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* J */
866 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* K */
867 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* L */
868 {1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,1,0,1,0}, /* M = [AC] */
869 {1,0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,1,1,0,0,1,0,1,0}, /* N = [ACGT] */
870 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* O */
871 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* P */
872 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* Q */
873 {1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,1,0,0,0}, /* R = [AG] */
874 {0,0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,1,0}, /* S = [CG] */
875 {0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0}, /* T */
876 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* U */
877 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* V */
878 {1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,1,0,1,0}, /* W = [AT] */
879 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* X */
880 {0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0}, /* Y = [CT] */
881 {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}}; /* Z */
882 #endif
883
884 /************************************************************************/
885
886
887 List_T
Dynprog_microexon_int(double * bestprob2,double * bestprob3,int * dynprogindex,int * microintrontype,char * rsequence,char * rsequenceuc,int rlength,int glengthL,int glengthR,int roffset,int goffsetL,int rev_goffsetR,int cdna_direction,char * queryseq,char * queryuc,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,Pairpool_T pairpool)888 Dynprog_microexon_int (double *bestprob2, double *bestprob3, int *dynprogindex, int *microintrontype,
889 char *rsequence, char *rsequenceuc, int rlength,
890 #ifdef EXTRACT_GENOMICSEG
891 int glengthL, int glengthR,
892 #endif
893 int roffset, int goffsetL, int rev_goffsetR, int cdna_direction,
894 char *queryseq, char *queryuc,
895 Univcoord_T chroffset, Univcoord_T chrhigh,
896 bool watsonp, int genestrand, Pairpool_T pairpool) {
897 List_T pairs = NULL;
898 Intlist_T hits = NULL, p;
899 #ifdef EXTRACT_GENOMICSEG
900 Intlist_T hits_old;
901 #endif
902 int bestcL = -1, bestcR = -1, best_middlelength;
903 int middlelength, cL, cR, mincR, maxcR, leftbound, rightbound, textleft, textright,
904 best_candidate, candidate, i;
905 int nmismatches;
906 char left1, left2, right2, right1, left1_alt, left2_alt, right2_alt, right1_alt;
907 char c, c_alt;
908 char c1_alt, c2_alt, c3_alt, c4_alt;
909 char intron1, intron2, intron3, intron4, gapchar;
910 float bestprob = 0.0, prob2, prob3;
911 Univcoord_T splicesitepos;
912 /* float pvalue; */
913 /* int span; */
914
915
916 *bestprob2 = *bestprob3 = 0.0;
917
918 #if 0
919 if (defect_rate < DEFECT_HIGHQ) {
920 pvalue = MICROEXON_PVALUE_HIGHQ;
921 } else if (defect_rate < DEFECT_MEDQ) {
922 pvalue = MICROEXON_PVALUE_MEDQ;
923 } else {
924 pvalue = MICROEXON_PVALUE_LOWQ;
925 }
926 #endif
927
928 #ifdef PMAP
929 intron1 = 'G';
930 intron2 = 'T';
931 intron3 = 'A';
932 intron4 = 'G';
933 gapchar = FWD_CANONICAL_INTRON_COMP;
934 *microintrontype = GTAG_FWD;
935 #else
936 if (cdna_direction > 0) {
937 intron1 = 'G';
938 intron2 = 'T';
939 intron3 = 'A';
940 intron4 = 'G';
941 gapchar = FWD_CANONICAL_INTRON_COMP;
942 *microintrontype = GTAG_FWD;
943 } else if (cdna_direction < 0) {
944 intron1 = 'C';
945 intron2 = 'T';
946 intron3 = 'A';
947 intron4 = 'C';
948 gapchar = REV_CANONICAL_INTRON_COMP;
949 *microintrontype = GTAG_REV;
950 } else {
951 /* Can occur when called by Stage3_merge_local_splice */
952 /* fprintf(stderr,"cdna_direction is 0 in Dynprog_microexon_int\n"); */
953 *microintrontype = NONINTRON;
954 return NULL;
955 }
956 #endif
957
958 #ifdef EXTRACT_GENOMICSEG
959 debug1(printf("Begin microexon search for %.*s and %.*s\n",
960 glengthL,gsequenceL,glengthR,&(rev_gsequenceR[-glengthR+1])));
961 #else
962 debug1(printf("Begin microexon search\n"));
963 #endif
964
965 debug1(printf(" Query sequence is %.*s\n",rlength,rsequence));
966
967 /* span = rev_goffsetR-goffsetL; */
968 debug1(printf(" Genomic span is of length %d\n",span));
969
970 #if 0
971 if (span <= 0) {
972 fprintf(stderr,"Bug in Dynprog_microexon_int. span %d <= 0. Please report to twu@gene.com\n",span);
973 abort();
974 } else {
975 min_microexon_length = ceilf(-fasterlog(1.0-powf(1.0-pvalue,1.0/(float) span)) / /*log(4)*/1.386294);
976 }
977 min_microexon_length -= 8; /* Two donor-acceptor pairs */
978 debug1(printf(" Min microexon length is %d\n",min_microexon_length));
979 if (min_microexon_length > MAX_MICROEXON_LENGTH) {
980 *microintrontype = NONINTRON;
981 return NULL;
982 } else if (min_microexon_length < MIN_MICROEXON_LENGTH) {
983 min_microexon_length = MIN_MICROEXON_LENGTH;
984 }
985 #elif 0
986 min_microexon_length = 6;
987 #endif
988
989 debug1(printf("\nFinding starting boundary on left\n"));
990 leftbound = 0;
991 nmismatches = 0;
992 while (leftbound < rlength - 1 && nmismatches <= 1) {
993 debug1(printf(" leftbound = %d, nmismatches = %d.",leftbound,nmismatches));
994 c = get_genomic_nt(&c_alt,goffsetL+leftbound,chroffset,chrhigh,watsonp);
995 #ifdef EXTRACT_GENOMICSEG
996 assert(c == gsequence_ucL[leftbound]);
997 #endif
998 debug1(printf(" Comparing %c with %c\n",rsequence[leftbound],c));
999 #ifdef PMAP
1000 if (matchtable[rsequence[leftbound]-'A'][c-'A'] == false) {
1001 nmismatches++;
1002 }
1003 #else
1004 if (rsequenceuc[leftbound] != c) {
1005 nmismatches++;
1006 }
1007 #endif
1008 leftbound++;
1009 }
1010 leftbound--; /* This is where the leftmost mismatch occurred */
1011
1012 debug1(printf("\nFinding starting boundary on right\n"));
1013 rightbound = 0;
1014 i = rlength-1;
1015 nmismatches = 0;
1016 while (i >= 0 && nmismatches <= 1) {
1017 debug1(printf(" rightbound = %d, nmismatches = %d.",rightbound,nmismatches));
1018 c = get_genomic_nt(&c_alt,rev_goffsetR-rightbound,chroffset,chrhigh,watsonp);
1019 #ifdef EXTRACT_GENOMICSEG
1020 assert(c == rev_gsequence_ucR[-rightbound]);
1021 #endif
1022 debug1(printf(" Comparing %c with %c\n",rsequence[i],c));
1023 #ifdef PMAP
1024 if (matchtable[rsequence[i]-'A'][c-'A'] == false) {
1025 nmismatches++;
1026 }
1027 #else
1028 if (rsequenceuc[i] != c) {
1029 nmismatches++;
1030 }
1031 #endif
1032 rightbound++;
1033 i--;
1034 }
1035 rightbound--; /* This is where the rightmost mismatch occurred */
1036
1037 debug1(printf(" Left must start before %d from left end of query. Right must start after %d from right end of query\n",
1038 leftbound,rightbound));
1039
1040 /* We require that cL >= 1 and cR >= 1 so that lengthL and lengthR are >= 1 */
1041 for (cL = 1; cL <= leftbound; cL++) {
1042 left1 = get_genomic_nt(&left1_alt,goffsetL+cL,chroffset,chrhigh,watsonp);
1043 left2 = get_genomic_nt(&left2_alt,goffsetL+cL+1,chroffset,chrhigh,watsonp);
1044 #ifdef EXTRACT_GENOMICSEG
1045 assert(left1 == gsequence_ucL[cL]);
1046 assert(left2 == gsequence_ucL[cL+1]);
1047 #endif
1048
1049 debug1(printf(" %d: %c%c\n",cL,left1,left2));
1050 if (left1 == intron1 && left2 == intron2) {
1051 mincR = rlength - MAX_MICROEXON_LENGTH - cL;
1052 debug1(printf(" mincR %d = rlength %d - MAX_MICROEXON_LENGTH %d - cL %d\n",
1053 mincR,rlength,MAX_MICROEXON_LENGTH,cL));
1054 if (mincR < 1) {
1055 mincR = 1;
1056 }
1057 maxcR = rlength - MIN_MICROEXON_LENGTH - cL;
1058 debug1(printf(" maxcR %d = rlength %d - MIN_MICROEXON_LENGTH %d - cL %d\n",
1059 maxcR,rlength,MIN_MICROEXON_LENGTH,cL));
1060 if (maxcR > rightbound) {
1061 maxcR = rightbound;
1062 }
1063 debug1(printf(" Found left GT at %d. Scanning from %d - cL - (1-7), or %d to %d\n",
1064 cL,rlength,mincR,maxcR));
1065 for (cR = mincR; cR <= maxcR; cR++) {
1066 right2 = get_genomic_nt(&right2_alt,rev_goffsetR-cR-1,chroffset,chrhigh,watsonp);
1067 right1 = get_genomic_nt(&right1_alt,rev_goffsetR-cR,chroffset,chrhigh,watsonp);
1068 #ifdef EXTRACT_GENOMICSEG
1069 assert(right2 == rev_gsequence_ucR[-cR-1]);
1070 assert(right1 == rev_gsequence_ucR[-cR]);
1071 #endif
1072 debug1(printf(" Checking %d: %c%c\n",cR,right2,right1));
1073 if (right2 == intron3 && right1 == intron4) {
1074 middlelength = rlength - cL - cR;
1075 debug1(printf(" Found pair at %d to %d, length %d. Middle sequence is %.*s\n",
1076 cL,cR,middlelength,middlelength,&(rsequence[cL])));
1077
1078 textleft = goffsetL + cL + MICROINTRON_LENGTH;
1079 textright = rev_goffsetR - cR - MICROINTRON_LENGTH;
1080
1081 if (textright >= textleft + middlelength) {
1082 hits = BoyerMoore_nt(&(rsequence[cL]),middlelength,textleft,textright-textleft,
1083 chroffset,chrhigh,watsonp);
1084 #ifdef EXTRACT_GENOMICSEG
1085 hits_old = BoyerMoore(&(rsequenceuc[cL]),middlelength,&(genomicuc[textleft]),textright-textleft);
1086 assert(Intlist_equal(hits,hits_old));
1087 Intlist_free(&hits_old);
1088 #endif
1089 for (p = hits; p != NULL; p = Intlist_next(p)) {
1090 candidate = textleft + Intlist_head(p);
1091 #ifdef EXTRACT_GENOMICSEG
1092 assert(get_genomic_nt(candidate-2,chroffset,chrhigh,watsonp) == genomicuc[candidate - 2]);
1093 assert(get_genomic_nt(candidate-1,chroffset,chrhigh,watsonp) == genomicuc[candidate - 1]);
1094 assert(get_genomic_nt(candidate+middlelength,chroffset,chrhigh,watsonp) == genomicuc[candidate + middlelength]);
1095 assert(get_genomic_nt(candidate+middlelength+1,chroffset,chrhigh,watsonp) == genomicuc[candidate + middlelength+1]);
1096 #endif
1097 if (/*genomicuc[candidate - 2]*/ get_genomic_nt(&c3_alt,candidate-2,chroffset,chrhigh,watsonp) == intron3 &&
1098 /*genomicuc[candidate - 1]*/ get_genomic_nt(&c4_alt,candidate-1,chroffset,chrhigh,watsonp) == intron4 &&
1099 /*genomicuc[candidate + middlelength]*/ get_genomic_nt(&c1_alt,candidate+middlelength,chroffset,chrhigh,watsonp) == intron1 &&
1100 /*genomicuc[candidate + middlelength + 1]*/ get_genomic_nt(&c2_alt,candidate+middlelength+1,chroffset,chrhigh,watsonp) == intron2) {
1101 debug1(printf(" Successful microexon at %d >>> %d..%d >>> %d\n",goffsetL+cL,candidate,candidate+middlelength,rev_goffsetR-cR));
1102
1103 /* Not handling known splice sites yet */
1104 if (watsonp == true) {
1105 if (cdna_direction > 0) {
1106 splicesitepos = chroffset + (candidate-1) + 1;
1107 prob2 = Maxent_hr_acceptor_prob(splicesitepos,chroffset);
1108 splicesitepos = chroffset + candidate+middlelength;
1109 prob3 = Maxent_hr_donor_prob(splicesitepos,chroffset);
1110 } else {
1111 splicesitepos = chroffset + (candidate-1) + 1;
1112 prob2 = Maxent_hr_antidonor_prob(splicesitepos,chroffset);
1113 splicesitepos = chroffset + candidate+middlelength;
1114 prob3 = Maxent_hr_antiacceptor_prob(splicesitepos,chroffset);
1115 }
1116 } else {
1117 if (cdna_direction > 0) {
1118 splicesitepos = chrhigh - (candidate-1);
1119 prob2 = Maxent_hr_antiacceptor_prob(splicesitepos,chroffset);
1120 splicesitepos = chrhigh - (candidate+middlelength) + 1;
1121 prob3 = Maxent_hr_antidonor_prob(splicesitepos,chroffset);
1122 } else {
1123 splicesitepos = chrhigh - (candidate-1);
1124 prob2 = Maxent_hr_donor_prob(splicesitepos,chroffset);
1125 splicesitepos = chrhigh - (candidate+middlelength) + 1;
1126 prob3 = Maxent_hr_acceptor_prob(splicesitepos,chroffset);
1127 }
1128 }
1129
1130 debug1(printf("microexon probabilities: prob2 = %f, prob3 = %f\n",prob2,prob3));
1131 if (prob2 + prob3 > bestprob) {
1132 bestcL = cL;
1133 bestcR = cR;
1134 best_candidate = candidate;
1135 best_middlelength = middlelength;
1136 *bestprob2 = prob2;
1137 *bestprob3 = prob3;
1138 bestprob = prob2 + prob3;
1139 }
1140 }
1141 }
1142 Intlist_free(&hits);
1143 }
1144
1145 }
1146 }
1147 }
1148 }
1149
1150 if (bestcL < 0 || bestcR < 0) {
1151 debug1(printf("End of dynprog microexon int\n"));
1152
1153 *microintrontype = NONINTRON;
1154 return NULL;
1155
1156 } else {
1157 debug1(printf("Making microexon pairs with candidate %u\n",best_candidate));
1158 pairs = make_microexon_pairs_double(roffset,/*roffsetM*/roffset+bestcL,/*roffsetR*/roffset+bestcL+best_middlelength,
1159 goffsetL,/*candidate*/best_candidate,/*goffsetR*/rev_goffsetR-bestcR+1,
1160 /*lengthL*/bestcL,/*lengthM*/best_middlelength,/*lengthR*/bestcR,
1161 queryseq,queryuc,
1162 chroffset,chrhigh,watsonp,genestrand,pairpool,gapchar,*dynprogindex);
1163 *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
1164 return pairs;
1165 }
1166 }
1167
1168
1169 #if 0
1170 /* Based on probability of seeing a pattern of length n in L is
1171 1-(1-p1)^L, where p1 is 4^n. We determine L so chance probability
1172 is less than ENDSEQUENCE_PVALUE */
1173 static int
1174 search_length (int endlength, int maxlength, bool end_microexons_p) {
1175 double p1;
1176 int effective_maxlength, extrant, result;
1177
1178 if (end_microexons_p == true) {
1179 extrant = 4; /* Count the four nucleotides at the intron bounds */
1180 effective_maxlength = maxlength;
1181 } else {
1182 extrant = 0; /* Don't count the four nucleotides */
1183 effective_maxlength = 5000;
1184 if (maxlength < effective_maxlength) {
1185 effective_maxlength = maxlength;
1186 }
1187 }
1188
1189 if (endlength + extrant > 12) {
1190 debug(printf(" Search length for endlength of %d is maxlength %d\n",endlength,effective_maxlength));
1191 return effective_maxlength;
1192 } else {
1193 p1 = 1.0/pow(4.0,(double) (endlength + extrant));
1194 result = (int) (fasterlog(1.0-ENDSEQUENCE_PVALUE)/fasterlog(1-p1));
1195 debug(printf(" Search length for endlength of %d plus extra nt of %d is %d\n",endlength,extrant,result));
1196 if (result > effective_maxlength) {
1197 return effective_maxlength;
1198 } else {
1199 return result;
1200 }
1201 }
1202 }
1203 #endif
1204
1205
1206 #if 0
1207 /* Not currently used */
1208 List_T
1209 Dynprog_microexon_5 (int *dynprogindex, int *microintrontype, int *microexonlength,
1210 char *rev_rsequence, char *rev_rsequenceuc, char *rev_gsequence, char *rev_gsequence_uc,
1211 int rlength, int glength, int rev_roffset, int rev_goffset, int cdna_direction,
1212 char *queryseq, char *queryuc, char *genomicseg, char *genomicuc,
1213 Pairpool_T pairpool, bool end_microexons_p) {
1214 List_T pairs = NULL;
1215 Intlist_T hits = NULL, p;
1216 int endlength, maxc, c, textleft, textright, candidate, nmismatches = 0;
1217 char right2, right1;
1218 char intron1, intron2, intron3, intron4, gapchar;
1219
1220 #ifdef PMAP
1221 intron1 = 'G';
1222 intron2 = 'T';
1223 intron3 = 'A';
1224 intron4 = 'G';
1225 gapchar = FWD_CANONICAL_INTRON_COMP;
1226 *microintrontype = GTAG_FWD;
1227 #else
1228 if (cdna_direction > 0) {
1229 intron1 = 'G';
1230 intron2 = 'T';
1231 intron3 = 'A';
1232 intron4 = 'G';
1233 gapchar = FWD_CANONICAL_INTRON_COMP;
1234 *microintrontype = GTAG_FWD;
1235 } else if (cdna_direction < 0) {
1236 intron1 = 'C';
1237 intron2 = 'T';
1238 intron3 = 'A';
1239 intron4 = 'C';
1240 gapchar = REV_CANONICAL_INTRON_COMP;
1241 *microintrontype = GTAG_REV;
1242 } else {
1243 *microintrontype = NONINTRON;
1244 return (List_T) NULL;
1245 abort();
1246 }
1247 #endif
1248
1249 #ifdef EXTRACT_GENOMICSEG
1250 debug(printf("Begin microexon search at 5' for %.*s\n",
1251 glength,&(rev_gsequence[-glength+1])));
1252 #else
1253 debug(printf("Begin microexon search at 5'\n"));
1254 #endif
1255
1256 debug(printf(" Query sequence is %.*s\n",rlength,&(rev_rsequence[-rlength+1])));
1257
1258 *microexonlength = 0;
1259 if (glength < rlength) {
1260 maxc = glength - MIN_MICROEXON_LENGTH;
1261 } else {
1262 maxc = rlength - MIN_MICROEXON_LENGTH;
1263 }
1264 for (c = 0; c < maxc; c++) {
1265 right2 = rev_gsequence_uc[-c-1];
1266 right1 = rev_gsequence_uc[-c];
1267 debug(printf(" Checking %c%c\n",right2,right1));
1268 #ifdef PMAP
1269 if (c > 0 && matchtable[rev_rsequence[-c+1]-'A'][rev_gsequence_uc[-c+1]-'A'] == false) {
1270 nmismatches++;
1271 }
1272 #else
1273 if (c > 0 && rev_rsequenceuc[-c+1] != rev_gsequence_uc[-c+1]) {
1274 nmismatches++;
1275 }
1276 #endif
1277 if (nmismatches > 1) {
1278 debug(printf(" Aborting at %c != %c\n",rev_rsequence[-c+1],rev_gsequence[-c+1]));
1279 *microintrontype = NONINTRON;
1280 return NULL;
1281 }
1282 if (right2 == intron3 && right1 == intron4) {
1283 endlength = rlength - c;
1284 debug(printf(" Found acceptor at %d, length %d. End sequence is %.*s\n",
1285 c,endlength,endlength,&(rev_rsequence[-endlength+1])));
1286
1287 textright = rev_goffset - c - MICROINTRON_LENGTH;
1288 textleft = textright - search_length(endlength,textright,end_microexons_p) + MICROINTRON_LENGTH;
1289
1290 if (textright >= textleft + endlength) {
1291 hits = BoyerMoore_nt(&(rev_rsequence[-c-endlength+1]),endlength,textleft,textright-textleft,
1292 chroffset,chrhigh,watsonp);
1293 for (p = hits; p != NULL; p = Intlist_next(p)) {
1294 candidate = textleft + Intlist_head(p);
1295 if (genomicseg[candidate + endlength] == intron1 &&
1296 genomicseg[candidate + endlength + 1] == intron2) {
1297 debug(printf(" Successful microexon at %d\n",candidate));
1298
1299 Intlist_free(&hits);
1300 *microexonlength = endlength;
1301 pairs = make_microexon_pairs_single(rev_roffset-c-endlength+1,rev_roffset-c+1,
1302 candidate,rev_goffset-c+1,endlength,c,
1303 queryseq,queryuc,chroffset,watsonp,pairpool,gapchar,*dynprogindex);
1304 *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
1305 return pairs;
1306 }
1307 }
1308 Intlist_free(&hits);
1309 }
1310
1311 }
1312 }
1313
1314 debug(printf("End of dynprog microexon 5\n"));
1315
1316 *microintrontype = NONINTRON;
1317 return NULL;
1318 }
1319 #endif
1320
1321
1322 #if 0
1323 /* Not currently used */
1324 List_T
1325 Dynprog_microexon_3 (int *dynprogindex, int *microintrontype, int *microexonlength,
1326 char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_uc,
1327 int rlength, int glength, int roffset, int goffset, int cdna_direction,
1328 char *queryseq, char *queryuc, char *genomicseg, char *genomicuc,
1329 int genomiclength, Pairpool_T pairpool, bool end_microexons_p) {
1330 List_T pairs = NULL;
1331 Intlist_T hits = NULL, p;
1332 int endlength, maxc, c, textleft, textright, candidate, nmismatches = 0;
1333 char left1, left2;
1334 char intron1, intron2, intron3, intron4, gapchar;
1335
1336 #ifdef PMAP
1337 intron1 = 'G';
1338 intron2 = 'T';
1339 intron3 = 'A';
1340 intron4 = 'G';
1341 gapchar = FWD_CANONICAL_INTRON_COMP;
1342 *microintrontype = GTAG_FWD;
1343 #else
1344 if (cdna_direction > 0) {
1345 intron1 = 'G';
1346 intron2 = 'T';
1347 intron3 = 'A';
1348 intron4 = 'G';
1349 gapchar = FWD_CANONICAL_INTRON_COMP;
1350 *microintrontype = GTAG_FWD;
1351 } else if (cdna_direction < 0) {
1352 intron1 = 'C';
1353 intron2 = 'T';
1354 intron3 = 'A';
1355 intron4 = 'C';
1356 gapchar = REV_CANONICAL_INTRON_COMP;
1357 *microintrontype = GTAG_REV;
1358 } else {
1359 *microintrontype = NONINTRON;
1360 return (List_T) NULL;
1361 abort();
1362 }
1363 #endif
1364
1365 #ifdef EXTRACT_GENOMICSEG
1366 debug(printf("Begin microexon search at 3' for %.*s\n",glength,gsequence));
1367 #else
1368 debug(printf("Begin microexon search at 3'\n"));
1369 #endif
1370
1371 debug(printf(" Query sequence is %.*s\n",rlength,rsequence));
1372
1373 *microexonlength = 0;
1374 if (glength < rlength) {
1375 maxc = glength - MIN_MICROEXON_LENGTH;
1376 } else {
1377 maxc = rlength - MIN_MICROEXON_LENGTH;
1378 }
1379 for (c = 0; c < maxc; c++) {
1380 left1 = gsequence_uc[c];
1381 left2 = gsequence_uc[c+1];
1382 debug(printf(" Checking %c%c\n",left1,left2));
1383 #ifdef PMAP
1384 if (c > 0 && matchtable[rsequence[c-1]-'A'][gsequence_uc[c-1]-'A'] == false) {
1385 nmismatches++;
1386 }
1387 #else
1388 if (c > 0 && rsequenceuc[c-1] != gsequence_uc[c-1]) {
1389 nmismatches++;
1390 }
1391 #endif
1392 if (nmismatches > 1) {
1393 debug(printf(" Aborting at %c != %c\n",rsequence[c-1],gsequence[c-1]));
1394 *microintrontype = NONINTRON;
1395 return NULL;
1396 }
1397 if (left1 == intron1 && left2 == intron2) {
1398 endlength = rlength - c;
1399 debug(printf(" Found donor at %d, length %d. End sequence is %.*s\n",
1400 c,endlength,endlength,&(rsequence[c])));
1401
1402 textleft = goffset + c;
1403 textright = textleft + search_length(endlength,genomiclength-textleft,end_microexons_p);
1404
1405 if (textright >= textleft + endlength) {
1406 hits = BoyerMoore_nt(&(rsequence[c]),endlength,textleft,textright-textleft,
1407 chroffset,chrhigh,watsonp);
1408 for (p = hits; p != NULL; p = Intlist_next(p)) {
1409 candidate = textleft + Intlist_head(p);
1410 if (genomicseg[candidate - 2] == intron3 &&
1411 genomicseg[candidate - 1] == intron4) {
1412 debug(printf(" Successful microexon at %d\n",candidate));
1413
1414 Intlist_free(&hits);
1415 *microexonlength = endlength;
1416 pairs = make_microexon_pairs_single(roffset,roffset+c,
1417 goffset,candidate,c,endlength,
1418 queryseq,queryuc,genomicseg,genomicuc,
1419 pairpool,gapchar,*dynprogindex);
1420 *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
1421 return pairs;
1422 }
1423 }
1424 Intlist_free(&hits);
1425 }
1426
1427 }
1428 }
1429
1430 debug(printf("End of dynprog microexon 3\n"));
1431
1432 *microintrontype = NONINTRON;
1433 return NULL;
1434 }
1435 #endif
1436
1437