1 static char rcsid[] = "$Id: dynprog_single.c 218162 2019-01-17 05:59:55Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "dynprog_single.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <math.h>		/* For ceil, log, pow */
11 #include <ctype.h>		/* For tolower */
12 
13 
14 #include "bool.h"
15 #include "except.h"
16 #include "assert.h"
17 #include "mem.h"
18 #include "comp.h"
19 #include "pair.h"
20 #include "pairdef.h"
21 #include "boyer-moore.h"
22 #include "complement.h"
23 #include "intron.h"
24 #include "maxent.h"
25 #include "maxent_hr.h"
26 #include "fastlog.h"
27 #include "dynprog_simd.h"
28 
29 
30 /* Tests whether get_genomic_nt == genomicseg in compute_scores procedures */
31 /* #define EXTRACT_GENOMICSEG 1 */
32 
33 
34 /* Prints parameters and results */
35 #ifdef DEBUG
36 #define debug(x) x
37 #else
38 #define debug(x)
39 #endif
40 
41 /* Microexon search */
42 #ifdef DEBUG1
43 #define debug1(x) x
44 #else
45 #define debug1(x)
46 #endif
47 
48 /* Getting genomic nt */
49 #ifdef DEBUG8
50 #define debug8(x) x
51 #else
52 #define debug8(x)
53 #endif
54 
55 /* print_vector */
56 #ifdef DEBUG15
57 #define debug15(x) x
58 #else
59 #define debug15(x)
60 #endif
61 
62 /* Homopolymer (e.g., PacBio) */
63 #ifdef DEBUG16
64 #define debug16(x) x
65 #else
66 #define debug16(x)
67 #endif
68 
69 /* Homopolymer details */
70 #ifdef DEBUG16A
71 #define debug16a(x) x
72 #else
73 #define debug16a(x)
74 #endif
75 
76 
77 #define MICROEXON_PVALUE_HIGHQ 0.01
78 #define MICROEXON_PVALUE_MEDQ 0.001
79 #define MICROEXON_PVALUE_LOWQ 0.0001
80 #define ENDSEQUENCE_PVALUE 0.001 /* Have stricter threshold for making end exons */
81 
82 #define MIN_MICROEXON_LENGTH 3
83 #ifdef PMAP
84 #define MAX_MICROEXON_LENGTH 17	/* Should be oligomer length - 1 plus peelback */
85 #else
86 #define MAX_MICROEXON_LENGTH 12	/* Should be oligomer length - 1 plus peelback */
87 #endif
88 #define MICROINTRON_LENGTH 9
89 
90 
91 
92 #define T Dynprog_T
93 
94 bool homopolymerp;
95 
96 void
Dynprog_single_setup(bool homopolymerp_in)97 Dynprog_single_setup (bool homopolymerp_in) {
98 
99   homopolymerp = homopolymerp_in;
100 
101   return;
102 }
103 
104 
105 static char complCode[128] = COMPLEMENT_LC;
106 
107 static char
get_genomic_nt(char * g_alt,int genomicpos,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)108 get_genomic_nt (char *g_alt, int genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
109 		bool watsonp) {
110   char c2, c2_alt;
111   Univcoord_T pos;
112 
113 #if 0
114   /* If the read has a deletion, then we will extend beyond 0 or genomiclength, so do not restrict. */
115   if (genomicpos < 0) {
116     return '*';
117 
118   } else if (genomicpos >= genomiclength) {
119     return '*';
120 
121   }
122 #endif
123 
124   if (watsonp) {
125     if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
126       *g_alt = '*';
127       return '*';
128 
129     } else if (pos >= chrhigh) {
130       *g_alt = '*';
131       return '*';
132 
133 #if 0
134     } else if (genome) {
135       /* Not necessary, because Genome_get_char_blocks should work */
136       debug8(printf("At %u, genomicnt is %c\n",
137 		    genomicpos,Genome_get_char(genome,pos)));
138       return Genome_get_char(genome,pos);
139 #endif
140 
141     } else {
142       /* GMAP with user-supplied genomic segment */
143       debug8(printf("At %u, genomicnt is %c\n",
144 		    genomicpos,Genome_get_char_blocks(pos)));
145       return Genome_get_char_blocks(&(*g_alt),pos);
146     }
147 
148   } else {
149     if ((pos = chrhigh - genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
150       *g_alt = '*';
151       return '*';
152 
153     } else if (pos >= chrhigh) {
154       *g_alt = '*';
155       return '*';
156 
157 #if 0
158     } else if (genome) {
159       /* Not necessary, because Genome_get_char_blocks should work */
160       c2 = Genome_get_char(genome,pos);
161 #endif
162 
163     } else {
164       /* GMAP with user-supplied genomic segment */
165       c2 = Genome_get_char_blocks(&c2_alt,pos);
166     }
167     debug8(printf("At %u, genomicnt is %c\n",genomicpos,complCode[(int) c2]));
168     *g_alt = complCode[(int) c2_alt];
169     return complCode[(int) c2];
170   }
171 }
172 
173 
174 /************************************************************************
175  *  Homopolymer
176  ************************************************************************/
177 
178 static List_T
augment_pairs(List_T pairs,int * rsequence_nreps,int roffset,int * gsequence_nreps,int goffset,Pairpool_T pairpool,int dynprogindex)179 augment_pairs (List_T pairs, int *rsequence_nreps, int roffset,
180 	       int *gsequence_nreps, int goffset,
181 	       Pairpool_T pairpool, int dynprogindex) {
182   List_T augmented = NULL, p;
183   Pair_T pair;
184   int r, c, r_nreps, c_nreps, i;
185   int r_cum = 0, c_cum = 0;
186 
187 
188 #ifdef DEBUG16
189   printf("r_nreps: ");
190   for (i = 0; i < r_uniqlength; i++) {
191     printf("%d",rsequence_nreps[i]);
192   }
193   printf("\n");
194 
195   printf("g_nreps: ");
196   for (i = 0; i < g_uniqlength; i++) {
197     printf("%d",gsequence_nreps[i]);
198   }
199   printf("\n");
200 #endif
201 
202   for (p = pairs; p != NULL; p = p->rest) {
203     pair = (Pair_T) p->first;
204     r = pair->querypos - roffset;
205     c = pair->genomepos - goffset;
206 
207     if (pair->comp == INDEL_COMP) {
208       augmented = Pairpool_push_existing(augmented,pairpool,pair);
209       debug16a(Pair_dump_one(pair,true));
210       pair->querypos += r_cum;
211       pair->genomepos += c_cum;
212 
213       if (pair->cdna == ' ') {
214 	c_nreps = gsequence_nreps[c];
215 	debug16a(printf(" genomepos %u, c_cum %d, nreps %d\n",c,c_cum,c_nreps));
216 	if (c_nreps == 0) {
217 	  /* Do nothing */
218 	} else {
219 	  for (i = 1; i <= c_nreps; i++) {
220 	    augmented = Pairpool_push(augmented,pairpool,r+r_cum+roffset,c+c_cum+i+goffset,
221 				      /*cdna*/' ',INDEL_COMP,pair->genome,pair->genomealt,
222 				      dynprogindex);
223 	  }
224 	  c_cum += c_nreps;
225 	}
226 
227       } else if (pair->genome == ' ') {
228 	r_nreps = rsequence_nreps[r];
229 	debug16a(printf(" querypos %d, r_cum %d, nreps %d\n",r,r_cum,r_nreps));
230 	if (r_nreps == 0) {
231 	  /* Do nothing */
232 	} else {
233 	  for (i = 1; i <= r_nreps; i++) {
234 	    augmented = Pairpool_push(augmented,pairpool,r+r_cum+i+roffset,c+c_cum+goffset,
235 				      pair->cdna,INDEL_COMP,/*genome*/' ',/*genomealt*/' ',
236 				      dynprogindex);
237 	  }
238 	  r_cum += r_nreps;
239 	}
240 
241       } else {
242 	fprintf(stderr,"Indel pair is missing both cdna and genome nts\n");
243 	abort();
244       }
245 
246     } else {
247       r_nreps = rsequence_nreps[r];
248       c_nreps = gsequence_nreps[c];
249       debug16a(printf(" querypos %d, r_cum %d, nreps %d, genomepos %u, c_cum %d, nreps %d\n",
250 		      r,r_cum,r_nreps,c,c_cum,c_nreps));
251       augmented = Pairpool_push_existing(augmented,pairpool,pair);
252       pair->querypos += r_cum;
253       pair->genomepos += c_cum;
254       if (r_nreps == 0 && c_nreps == 0) {
255 	/* Do nothing */
256       } else if (r_nreps == c_nreps) {
257 	for (i = 1; i <= r_nreps; i++) {
258 	  augmented = Pairpool_push_copy(augmented,pairpool,pair);
259 	  ((Pair_T) augmented->first)->querypos += i;
260 	  ((Pair_T) augmented->first)->genomepos += i;
261 	}
262 	r_cum += r_nreps;
263 	c_cum += c_nreps;
264 
265       } else if (r_nreps < c_nreps) {
266 	for (i = 1; i <= r_nreps; i++) {
267 	  augmented = Pairpool_push_copy(augmented,pairpool,pair);
268 	  ((Pair_T) augmented->first)->querypos += i;
269 	  ((Pair_T) augmented->first)->genomepos += i;
270 	}
271 	r_cum += r_nreps;
272 
273 	for ( ; i <= c_nreps; i++) {
274 	  /* Add 1 to r to advance to next coordinate */
275 	  augmented = Pairpool_push(augmented,pairpool,r+r_cum+roffset + 1,c+c_cum+i+goffset,
276 				    /*cdna*/' ',INDEL_COMP,pair->genome,pair->genomealt,
277 				    dynprogindex);
278 	}
279 	c_cum += c_nreps;
280 
281       } else {
282 	for (i = 1; i <= c_nreps; i++) {
283 	  augmented = Pairpool_push_copy(augmented,pairpool,pair);
284 	  ((Pair_T) augmented->first)->querypos += i;
285 	  ((Pair_T) augmented->first)->genomepos += i;
286 	}
287 	c_cum += c_nreps;
288 
289 	for ( ; i <= r_nreps; i++) {
290 	  /* Add 1 to c to advance to next coordinate */
291 	  augmented = Pairpool_push(augmented,pairpool,r+r_cum+i+roffset,c+c_cum+goffset + 1,
292 				    pair->cdna,INDEL_COMP,/*genome*/' ',/*genomealt*/' ',
293 				    dynprogindex);
294 	}
295 	r_cum += r_nreps;
296 
297       }
298     }
299   }
300 
301   debug16(Pair_dump_list(augmented,true));
302 
303   return List_reverse(augmented);
304 }
305 
306 static char *
uniq_string(int ** nreps,int * uniqlength,char * string,int length)307 uniq_string (int **nreps, int *uniqlength, char *string, int length) {
308   char *uniq, *p, nt, lastnt;
309   int i, k, *a;
310 
311   *uniqlength = 1;
312   lastnt = string[0];
313   for (i = 1; i < length; i++) {
314     if ((nt = string[i]) != lastnt) {
315       (*uniqlength)++;
316       lastnt = nt;
317     }
318   }
319 
320   p = uniq = (char *) MALLOC(((*uniqlength) + 1) * sizeof(char));
321   a = *nreps = (int *) MALLOC((*uniqlength) * sizeof(int));
322   k = 0;
323 
324   lastnt = string[0];
325   for (i = 1; i < length; i++) {
326     if ((nt = string[i]) != lastnt) {
327       *p++ = lastnt;
328       *a++ = k;
329       lastnt = nt;
330       k = 0;
331     } else {
332       k++;
333     }
334   }
335   *p = lastnt;
336   *a = k;
337 
338 #ifdef DEBUG16
339   printf("string: %.*s\n",length,string);
340   printf("uniq:   %.*s\n",*uniqlength,uniq);
341   printf("nreps   ");
342   for (i = 0; i < *uniqlength; i++) {
343     printf("%d",(*nreps)[i]);
344   }
345   printf("\n");
346 #endif
347 
348   return uniq;
349 }
350 
351 
352 
353 static List_T
single_gap_simple(int * finalscore,int * nmatches,int * nmismatches,char * rsequence,char * rsequenceuc,int rlength,char * gsequence,char * gsequence_alt,int roffset,int goffset,Pairpool_T pairpool,int mismatchtype,int genestrand,int dynprogindex)354 single_gap_simple (int *finalscore, int *nmatches, int *nmismatches,
355 		   char *rsequence, char *rsequenceuc, int rlength, char *gsequence, char *gsequence_alt,
356 		   int roffset, int goffset, Pairpool_T pairpool,
357 		   int mismatchtype, int genestrand, int dynprogindex) {
358   int score;
359   List_T pairs = NULL;
360   int r;
361   int querycoord, genomecoord;
362   int c1, c1_uc, c2, c2_alt;
363   Pairdistance_T **pairdistance_array_type;
364 
365   debug(printf("Starting single_gap_simple\n"));
366   pairdistance_array_type = pairdistance_array[mismatchtype];
367 
368   *finalscore = 0;
369   *nmatches = *nmismatches = 0;
370 
371   /* Push from left to right, so we don't need to do List_reverse() later */
372   for (r = 1; r <= rlength; r++) {
373     querycoord = genomecoord = r-1;
374 
375     c1 = rsequence[querycoord];
376     c1_uc = rsequenceuc[querycoord];
377     c2 = gsequence[genomecoord];
378     c2_alt = gsequence_alt[genomecoord];
379 
380     if (c2 == '*') {
381       /* Don't push pairs past end of chromosome */
382       debug(printf("Don't push pairs past end of chromosome: genomeoffset %u, genomecoord %u\n",goffset,genomecoord));
383 
384     } else if (c1_uc == c2 || c1_uc == c2_alt) {
385       debug(printf("Pushing simple %d,%d [%d,%d] (%c,%c) - match\n",
386 		   r,/*c*/r,roffset+querycoord,goffset+genomecoord,c1_uc,c2));
387       score = pairdistance_array_type[c1_uc][c2];
388       if (pairdistance_array_type[c1_uc][c2_alt] > score) {
389 	score = pairdistance_array_type[c1_uc][c2_alt];
390       }
391       *finalscore += score;
392       *nmatches += 1;
393       pairs = Pairpool_push(pairs,pairpool,roffset+querycoord,goffset+genomecoord,
394 			    c1,DYNPROG_MATCH_COMP,c2,c2_alt,dynprogindex);
395 
396     } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
397       debug(printf("Pushing simple %d,%d [%d,%d] (%c,%c) - ambiguous\n",
398 		   r,/*c*/r,roffset+querycoord,goffset+genomecoord,c1_uc,c2));
399       score = pairdistance_array_type[c1_uc][c2];
400       if (pairdistance_array_type[c1_uc][c2_alt] > score) {
401 	score = pairdistance_array_type[c1_uc][c2_alt];
402       }
403       *finalscore += score;
404       *nmatches += 1;
405       pairs = Pairpool_push(pairs,pairpool,roffset+querycoord,goffset+genomecoord,
406 			    c1,AMBIGUOUS_COMP,c2,c2_alt,dynprogindex);
407 
408     } else {
409       debug(printf("Pushing simple %d,%d [%d,%d] (%c,%c) - mismatch\n",
410 		   r,/*c*/r,roffset+querycoord,goffset+genomecoord,c1_uc,c2));
411       score = pairdistance_array_type[c1_uc][c2];
412       if (pairdistance_array_type[c1_uc][c2_alt] > score) {
413 	score = pairdistance_array_type[c1_uc][c2_alt];
414       }
415       *finalscore += score;
416       *nmismatches += 1;
417       pairs = Pairpool_push(pairs,pairpool,roffset+querycoord,goffset+genomecoord,
418 			    c1,MISMATCH_COMP,c2,c2_alt,dynprogindex);
419     }
420   }
421 
422   if (*nmismatches > 1) {
423     return (List_T) NULL;
424   } else {
425     return pairs;
426   }
427 }
428 
429 
430 List_T
Dynprog_single_gap(int * dynprogindex,int * finalscore,int * nmatches,int * nmismatches,int * nopens,int * nindels,T dynprog,char * rsequence,char * rsequenceuc,int rlength,int glength,int roffset,int goffset,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,bool jump_late_p,Pairpool_T pairpool,int extraband_single,double defect_rate,bool widebandp)431 Dynprog_single_gap (int *dynprogindex, int *finalscore, int *nmatches, int *nmismatches, int *nopens, int *nindels,
432 		    T dynprog, char *rsequence, char *rsequenceuc,
433 		    int rlength, int glength, int roffset, int goffset,
434 		    Univcoord_T chroffset, Univcoord_T chrhigh,
435 		    bool watsonp, int genestrand, bool jump_late_p, Pairpool_T pairpool,
436 		    int extraband_single, double defect_rate, bool widebandp) {
437   List_T pairs = NULL;
438   char *gsequence, *gsequence_alt;
439 
440   char *gsequence_orig, *rsequence_orig;
441   int *gsequence_nreps, *rsequence_nreps;
442   int glength_orig, rlength_orig;
443 
444   Mismatchtype_T mismatchtype;
445   int lband, uband;
446   int open, extend;
447 #if defined(HAVE_SSE2)
448   Score8_T **matrix8;
449   Direction8_T **directions8_nogap, **directions8_Egap, **directions8_Fgap;
450 
451   Score16_T **matrix16;
452   Direction16_T **directions16_nogap, **directions16_Egap, **directions16_Fgap;
453 #else
454   Score32_T **matrix;
455   Direction32_T **directions_nogap, **directions_Egap, **directions_Fgap;
456 #endif
457   /* bool onesidegapp; */
458 
459   if (defect_rate < DEFECT_HIGHQ) {
460     mismatchtype = HIGHQ;
461     open = SINGLE_OPEN_HIGHQ;
462     extend = SINGLE_EXTEND_HIGHQ;
463     /* onesidegapp = false; */
464   } else if (defect_rate < DEFECT_MEDQ) {
465     mismatchtype = MEDQ;
466     open = SINGLE_OPEN_MEDQ;
467     extend = SINGLE_EXTEND_MEDQ;
468     /* onesidegapp = true; */
469   } else {
470     mismatchtype = LOWQ;
471     open = SINGLE_OPEN_LOWQ;
472     extend = SINGLE_EXTEND_LOWQ;
473     /* onesidegapp = true; */
474   }
475 
476 #if 0
477   if (close_indels_mode == +1) {
478     /* Allow close indels */
479     onesidegapp = false;
480   } else if (close_indels_mode == -1) {
481     /* Disallow close indels */
482     onesidegapp = true;
483   } else {
484     /* Allow close indels for high quality alignments, as determined above */
485   }
486 #endif
487 
488   /* Rlength: maxlookback+MAXPEELBACK.  Glength +EXTRAMATERIAL */
489   debug(printf("%c:  ",*dynprogindex > 0 ? (*dynprogindex-1)%26+'a' : (-(*dynprogindex)-1)%26+'A'));
490   debug(printf("Aligning single gap middle with wideband = %d and extraband %d\n",widebandp,extraband_single));
491 #ifdef EXTRACT_GENOMICSEG
492   debug(printf("At genomic offset %d-%d, %.*s\n",goffset,goffset+glength-1,glength,gsequence));
493 #endif
494   debug(printf("\n"));
495 
496 #if 0
497   /* Can be violated when extension search in GSNAP tries different diagonals */
498   assert(glength > 0);
499 #endif
500 
501   if (rlength <= 0 || glength <= 0 ||
502       rlength > dynprog->max_rlength || glength > dynprog->max_glength) {
503     debug(printf("rlength %d or glength %d is too long.  Returning NULL\n",rlength,glength));
504     *finalscore = NEG_INFINITY_32;
505     *nmatches = *nmismatches = *nopens = *nindels = 0;
506 #if 0
507     /* Don't push a gapholder for single gap, because gapholder already exists */
508     pairs = Pairpool_push_gapholder(NULL,pairpool,rlength,glength,
509 				    /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
510 #endif
511     *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
512     return (List_T) NULL;
513   }
514 
515   debug(printf("At query offset %d-%d, %.*s\n",roffset,roffset+rlength-1,rlength,rsequence));
516 
517     /* If extraband_single is too large, then gaps may be inserted on
518        both sides, like this
519 
520        CACCC   AGAGCAGGCACTGCCT
521        |||||--- ||| ||---|||||
522        CACCCCAGGGAGGAG   CTGCCC
523 
524     */
525 
526 
527   if (homopolymerp == true) {
528     rsequence_orig = rsequence;
529     rlength_orig = rlength;
530     rsequence = uniq_string(&rsequence_nreps,&rlength,rsequence_orig,rlength_orig);
531 
532     gsequence_orig = (char *) MALLOCA((glength+1) * sizeof(char));
533     gsequence_alt = (char *) MALLOCA((glength+1) * sizeof(char));
534 
535     if (watsonp) {
536       Genome_get_segment_blocks_right(gsequence_orig,gsequence_alt,/*left*/chroffset+goffset,
537                                       glength,chrhigh,/*revcomp*/false);
538     } else {
539       Genome_get_segment_blocks_left(gsequence_orig,gsequence_alt,/*right*/chrhigh-goffset+1,
540                                      glength,chroffset,/*revcomp*/true);
541     }
542     if (gsequence_orig[0] == '\0') {
543       *finalscore = NEG_INFINITY_32;
544       *nmatches = *nmismatches = *nopens = *nindels = 0;
545       FREEA(gsequence_alt);
546       FREEA(gsequence_orig);
547       return (List_T) NULL;
548     } else {
549       glength_orig = glength;
550       rsequence = uniq_string(&gsequence_nreps,&glength,gsequence_orig,glength_orig);
551     }
552 
553   } else {
554     gsequence = (char *) MALLOCA((glength+1) * sizeof(char));
555     gsequence_alt = (char *) MALLOCA((glength+1) * sizeof(char));
556 
557     if (watsonp) {
558       Genome_get_segment_blocks_right(gsequence,gsequence_alt,/*left*/chroffset+goffset,
559 				      glength,chrhigh,/*revcomp*/false);
560     } else {
561       Genome_get_segment_blocks_left(gsequence,gsequence_alt,/*right*/chrhigh-goffset+1,
562 				     glength,chroffset,/*revcomp*/true);
563     }
564     if (gsequence[0] == '\0') {
565       *finalscore = NEG_INFINITY_32;
566       *nmatches = *nmismatches = *nopens = *nindels = 0;
567       FREEA(gsequence_alt);
568       FREEA(gsequence);
569       return (List_T) NULL;
570     } else if (glength == rlength &&
571 	       (pairs = single_gap_simple(&(*finalscore),&(*nmatches),&(*nmismatches),
572 					  rsequence,rsequenceuc,rlength,gsequence,gsequence_alt,roffset,goffset,
573 					  pairpool,mismatchtype,genestrand,*dynprogindex)) != NULL) {
574       FREEA(gsequence_alt);
575       FREEA(gsequence);
576 
577       *nopens = *nindels = 0;
578       *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
579       return pairs;
580     }
581   }
582 
583 
584   Dynprog_compute_bands(&lband,&uband,rlength,glength,extraband_single,widebandp);
585 #if defined(HAVE_SSE2)
586   /* Use || because we want the minimum length (which determines the diagonal length) to achieve a score less than 128 */
587   /* Use && because we don't want to overflow in either direction */
588   if (rlength < use8p_size[mismatchtype] && glength < use8p_size[mismatchtype]) {
589     matrix8 = Dynprog_simd_8(&directions8_nogap,&directions8_Egap,&directions8_Fgap,dynprog,
590 			     rsequence,gsequence,gsequence_alt,rlength,glength,
591 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
592 			     goffset,chroffset,chrhigh,watsonp,
593 #endif
594 			     mismatchtype,open,extend,
595 			     lband,uband,jump_late_p,/*revp*/false);
596     *finalscore = (int) matrix8[glength][rlength];
597 
598     *nmatches = *nmismatches = *nopens = *nindels = 0;
599     pairs = Dynprog_traceback_8(NULL,&(*nmatches),&(*nmismatches),&(*nopens),&(*nindels),
600 				directions8_nogap,directions8_Egap,directions8_Fgap,rlength,glength,
601 				rsequence,rsequenceuc,
602 				gsequence,gsequence_alt,roffset,goffset,pairpool,/*revp*/false,
603 				chroffset,chrhigh,watsonp,genestrand,*dynprogindex);
604 
605   } else {
606     matrix16 = Dynprog_simd_16(&directions16_nogap,&directions16_Egap,&directions16_Fgap,dynprog,
607 			       rsequence,gsequence,gsequence_alt,rlength,glength,
608 #if defined(DEBUG_AVX2) || defined(DEBUG_SIMD)
609 			       goffset,chroffset,chrhigh,watsonp,
610 #endif
611 			       mismatchtype,open,extend,
612 			       lband,uband,jump_late_p,/*revp*/false);
613     *finalscore = (int) matrix16[glength][rlength];
614 
615     *nmatches = *nmismatches = *nopens = *nindels = 0;
616     pairs = Dynprog_traceback_16(NULL,&(*nmatches),&(*nmismatches),&(*nopens),&(*nindels),
617 				 directions16_nogap,directions16_Egap,directions16_Fgap,rlength,glength,
618 				 rsequence,rsequenceuc,
619 				 gsequence,gsequence_alt,roffset,goffset,pairpool,/*revp*/false,
620 				 chroffset,chrhigh,watsonp,genestrand,*dynprogindex);
621   }
622 
623 #else
624 
625   matrix = Dynprog_standard(&directions_nogap,&directions_Egap,&directions_Fgap,dynprog,
626 			    rsequence,gsequence,gsequence_alt,rlength,glength,
627 			    goffset,chroffset,chrhigh,watsonp,mismatchtype,open,extend,
628 			    lband,uband,jump_late_p,/*revp*/false,/*saturation*/NEG_INFINITY_INT,
629 			    /*upperp*/true,/*lowerp*/true);
630   *finalscore = (int) matrix[glength][rlength];
631 
632   *nmatches = *nmismatches = *nopens = *nindels = 0;
633   pairs = Dynprog_traceback_std(NULL,&(*nmatches),&(*nmismatches),&(*nopens),&(*nindels),
634 				directions_nogap,directions_Egap,directions_Fgap,rlength,glength,
635 				rsequence,rsequenceuc,
636 				gsequence,gsequence_alt,roffset,goffset,pairpool,/*revp*/false,
637 				chroffset,chrhigh,watsonp,genestrand,*dynprogindex);
638 #endif
639 
640   if (homopolymerp == true) {
641     pairs = augment_pairs(pairs,rsequence_nreps,roffset,
642 			  gsequence_nreps,goffset,pairpool,*dynprogindex);
643     FREE(gsequence_nreps);
644     FREEA(gsequence_orig);
645     FREEA(gsequence);
646 
647     FREE(rsequence_nreps);
648     /* Do not free rsequence_orig */
649     FREE(rsequence);
650 
651   } else {
652     FREEA(gsequence_alt);
653     FREEA(gsequence);
654   }
655 
656   /*
657   Directions_free(directions);
658   Matrix_free(matrix);
659   */
660 
661   debug(printf("End of dynprog single gap\n"));
662 
663   *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
664   return List_reverse(pairs);
665 }
666 
667 
668 
669 static const Except_T microexon_error = {"Microexon error"};
670 
671 static List_T
make_microexon_pairs_double(int roffsetL,int roffsetM,int roffsetR,int goffsetL,int goffsetM,int goffsetR,int lengthL,int lengthM,int lengthR,char * rsequence,char * rsequenceuc,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,Pairpool_T pairpool,char gapchar,int dynprogindex)672 make_microexon_pairs_double (int roffsetL, int roffsetM, int roffsetR,
673 			     int goffsetL, int goffsetM, int goffsetR,
674 			     int lengthL, int lengthM, int lengthR,
675 			     char *rsequence, char *rsequenceuc,
676 			     Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int genestrand,
677 			     Pairpool_T pairpool, char gapchar, int dynprogindex) {
678   List_T pairs = NULL;
679   Pair_T gappair;
680   char c1, c1_uc, c2, c2_alt;
681   int i;
682 
683   /* Left segment */
684   for (i = 0; i < lengthL; i++) {
685     c1 = rsequence[roffsetL+i];
686     c1_uc = rsequenceuc[roffsetL+i];
687 
688     c2 = get_genomic_nt(&c2_alt,goffsetL+i,chroffset,chrhigh,watsonp);
689 #ifdef EXTRACT_GENOMICSEG
690     assert(c2 == genomicseg[goffsetL+i]);
691 #endif
692 
693     if (c1_uc == c2 || c1_uc == c2_alt) {
694       pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
695 			    dynprogindex);
696     } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
697       pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
698 			    dynprogindex);
699     } else {
700       pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,MISMATCH_COMP,c2,c2_alt,
701 			    dynprogindex);
702     }
703   }
704 
705   /* First gap */
706   /* Don't have to adjust querypos/genomepos, since no cdna/genome skips allowed */
707   pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/0,/*genomejump*/goffsetM-(goffsetL+lengthL),
708 				  /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
709 
710   /* Assign pair->comp, because might occur after assign_gap_types */
711   gappair = (Pair_T) List_head(pairs);
712   gappair->comp = gapchar;
713 
714 
715   /* Microexon */
716   for (i = 0; i < lengthM; i++) {
717     c1 = rsequence[roffsetM+i];
718     c1_uc = rsequenceuc[roffsetM+i];
719 
720     c2 = get_genomic_nt(&c2_alt,goffsetM+i,chroffset,chrhigh,watsonp);
721 #ifdef EXTRACT_GENOMICSEG
722     assert(c2 == genomicseg[goffsetM+i]);
723 #endif
724 
725     if (c1_uc == c2 || c1_uc == c2_alt) {
726       pairs = Pairpool_push(pairs,pairpool,roffsetM+i,goffsetM+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
727 			    dynprogindex);
728     } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
729       pairs = Pairpool_push(pairs,pairpool,roffsetM+i,goffsetM+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
730 			    dynprogindex);
731     } else {
732       pairs = Pairpool_push(pairs,pairpool,roffsetM+i,goffsetM+i,c1,MISMATCH_COMP,c2,c2_alt,
733 			    dynprogindex);
734     }
735   }
736 
737   /* Second gap */
738   /* Don't have to adjust querypos/genomepos, since no cdna/genome skips allowed */
739   if (lengthR == 0) {
740     /* If lengthR is zero, then we will have a gap after a gap */
741     Except_raise(&microexon_error,__FILE__,__LINE__);
742   } else {
743     pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/0,/*genomejump*/goffsetR-(goffsetM+lengthM),
744 				    /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
745   }
746 
747   /* Assign pair->comp, because might occur after assign_gap_types */
748   gappair = (Pair_T) List_head(pairs);
749   gappair->comp = gapchar;
750 
751 
752   /* Right segment */
753   for (i = 0; i < lengthR; i++) {
754     c1 = rsequence[roffsetR+i];
755     c1_uc = rsequenceuc[roffsetR+i];
756 
757     c2 = get_genomic_nt(&c2_alt,goffsetR+i,chroffset,chrhigh,watsonp);
758 #ifdef EXTRACT_GENOMICSEG
759     assert(c2 == genomicseg[goffsetR+i]);
760 #endif
761 
762     if (c1_uc == c2 || c1_uc == c2_alt) {
763       pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
764 			    dynprogindex);
765     } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
766       pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
767 			    dynprogindex);
768     } else {
769       pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,MISMATCH_COMP,c2,c2_alt,
770 			    dynprogindex);
771     }
772   }
773 
774   return pairs;
775 }
776 
777 
778 #if 0
779 static List_T
780 make_microexon_pairs_single (int roffsetL, int roffsetR,
781 			     int goffsetL, int goffsetR,
782 			     int lengthL, int lengthR, char *rsequence, char *rsequenceuc,
783 			     Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp,
784 			     Pairpool_T pairpool, char gapchar, int dynprogindex) {
785   List_T pairs = NULL;
786   Pair_T gappair;
787   char c1, c1_uc, c2, c2_alt;
788   int i;
789 
790   /* Microexon */
791   for (i = 0; i < lengthL; i++) {
792     c1 = rsequence[roffsetL+i];
793     c1_uc = rsequenceuc[roffsetL+i];
794 
795     c2 = get_genomic_nt(&c2_alt,goffsetL+i,chroffset,chrhigh,watsonp);
796 #ifdef EXTRACT_GENOMICSEG
797     assert(c2 == genomicseg[goffsetL+i]);
798 #endif
799 
800     if (c1_uc == c2 || c1_uc == c2_alt) {
801       pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
802 			    dynprogindex);
803     } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
804       pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
805 			    dynprogindex);
806     } else {
807       pairs = Pairpool_push(pairs,pairpool,roffsetL+i,goffsetL+i,c1,MISMATCH_COMP,c2,c2_alt,
808 			    dynprogindex);
809     }
810   }
811 
812   /* Gap */
813   /* Don't have to adjust querypos/genomepos, since no cdna/genome skips allowed */
814   pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/0,/*genomejump*/goffsetR-(goffsetL+lengthL),
815 				  /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
816 
817   /* Assign pair->comp, because might occur after assign_gap_types */
818   gappair = (Pair_T) List_head(pairs);
819   gappair->comp = gapchar;
820 
821 
822   /* Right segment */
823   for (i = 0; i < lengthR; i++) {
824     c1 = rsequence[roffsetR+i];
825     c1_uc = rsequenceuc[roffsetR+i];
826 
827     c2 = get_genomic_nt(&c2_alt,goffsetR+i,chroffset,chrhigh,watsonp);
828 #ifdef EXTRACT_GENOMICSEG
829     assert(c2 == genomicseg[goffsetR+i]);
830 #endif
831 
832     if (c1_uc == c2 || c1_uc == c2_alt) {
833       pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,DYNPROG_MATCH_COMP,c2,c2_alt,
834 			    dynprogindex);
835     } else if (Dynprog_consistent_p(c1_uc,/*g*/c2,/*g_alt*/c2_alt,genestrand) == true) {
836       pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,AMBIGUOUS_COMP,c2,c2_alt,
837 			    dynprogindex);
838     } else {
839       pairs = Pairpool_push(pairs,pairpool,roffsetR+i,goffsetR+i,c1,MISMATCH_COMP,c2,c2_alt,
840 			    dynprogindex);
841     }
842   }
843 
844   return pairs;
845 }
846 #endif
847 
848 
849 /************************************************************************/
850 
851 #ifdef PMAP
852 /* Same as in boyer-moore.c */
853 /* Handle only cases in iupac table in sequence.c */
854 static bool matchtable[26][26] =
855 /*  A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z */
856   {{1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0}, /* A */
857    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* B */
858    {0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0}, /* C */
859    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* D */
860    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* E */
861    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* F */
862    {0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0}, /* G */
863    {1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,0,1,0,1,0}, /* H = [ACT] */
864    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* I */
865    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* J */
866    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* K */
867    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* L */
868    {1,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,1,0,1,0}, /* M = [AC] */
869    {1,0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,1,1,0,0,1,0,1,0}, /* N = [ACGT] */
870    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* O */
871    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* P */
872    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* Q */
873    {1,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,1,0,0,0}, /* R = [AG] */
874    {0,0,1,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0,1,0}, /* S = [CG] */
875    {0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0}, /* T */
876    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* U */
877    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* V */
878    {1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,1,0,0,1,0,1,0}, /* W = [AT] */
879    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, /* X */
880    {0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0}, /* Y = [CT] */
881    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}}; /* Z */
882 #endif
883 
884 /************************************************************************/
885 
886 
887 List_T
Dynprog_microexon_int(double * bestprob2,double * bestprob3,int * dynprogindex,int * microintrontype,char * rsequence,char * rsequenceuc,int rlength,int glengthL,int glengthR,int roffset,int goffsetL,int rev_goffsetR,int cdna_direction,char * queryseq,char * queryuc,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int genestrand,Pairpool_T pairpool)888 Dynprog_microexon_int (double *bestprob2, double *bestprob3, int *dynprogindex, int *microintrontype,
889 		       char *rsequence, char *rsequenceuc, int rlength,
890 #ifdef EXTRACT_GENOMICSEG
891 		       int glengthL, int glengthR,
892 #endif
893 		       int roffset, int goffsetL, int rev_goffsetR, int cdna_direction,
894 		       char *queryseq, char *queryuc,
895 		       Univcoord_T chroffset, Univcoord_T chrhigh,
896 		       bool watsonp, int genestrand, Pairpool_T pairpool) {
897   List_T pairs = NULL;
898   Intlist_T hits = NULL, p;
899 #ifdef EXTRACT_GENOMICSEG
900   Intlist_T hits_old;
901 #endif
902   int bestcL = -1, bestcR = -1, best_middlelength;
903   int middlelength, cL, cR, mincR, maxcR, leftbound, rightbound, textleft, textright,
904     best_candidate, candidate, i;
905   int nmismatches;
906   char left1, left2, right2, right1, left1_alt, left2_alt, right2_alt, right1_alt;
907   char c, c_alt;
908   char c1_alt, c2_alt, c3_alt, c4_alt;
909   char intron1, intron2, intron3, intron4, gapchar;
910   float bestprob = 0.0, prob2, prob3;
911   Univcoord_T splicesitepos;
912   /* float pvalue; */
913   /* int span; */
914 
915 
916   *bestprob2 = *bestprob3 = 0.0;
917 
918 #if 0
919   if (defect_rate < DEFECT_HIGHQ) {
920     pvalue = MICROEXON_PVALUE_HIGHQ;
921   } else if (defect_rate < DEFECT_MEDQ) {
922     pvalue = MICROEXON_PVALUE_MEDQ;
923   } else {
924     pvalue = MICROEXON_PVALUE_LOWQ;
925   }
926 #endif
927 
928 #ifdef PMAP
929   intron1 = 'G';
930   intron2 = 'T';
931   intron3 = 'A';
932   intron4 = 'G';
933   gapchar = FWD_CANONICAL_INTRON_COMP;
934   *microintrontype = GTAG_FWD;
935 #else
936   if (cdna_direction > 0) {
937     intron1 = 'G';
938     intron2 = 'T';
939     intron3 = 'A';
940     intron4 = 'G';
941     gapchar = FWD_CANONICAL_INTRON_COMP;
942     *microintrontype = GTAG_FWD;
943   } else if (cdna_direction < 0) {
944     intron1 = 'C';
945     intron2 = 'T';
946     intron3 = 'A';
947     intron4 = 'C';
948     gapchar = REV_CANONICAL_INTRON_COMP;
949     *microintrontype = GTAG_REV;
950   } else {
951     /* Can occur when called by Stage3_merge_local_splice */
952     /* fprintf(stderr,"cdna_direction is 0 in Dynprog_microexon_int\n"); */
953     *microintrontype = NONINTRON;
954     return NULL;
955   }
956 #endif
957 
958 #ifdef EXTRACT_GENOMICSEG
959   debug1(printf("Begin microexon search for %.*s and %.*s\n",
960 	       glengthL,gsequenceL,glengthR,&(rev_gsequenceR[-glengthR+1])));
961 #else
962   debug1(printf("Begin microexon search\n"));
963 #endif
964 
965   debug1(printf("  Query sequence is %.*s\n",rlength,rsequence));
966 
967   /* span = rev_goffsetR-goffsetL; */
968   debug1(printf("  Genomic span is of length %d\n",span));
969 
970 #if 0
971   if (span <= 0) {
972     fprintf(stderr,"Bug in Dynprog_microexon_int.  span %d <= 0.  Please report to twu@gene.com\n",span);
973     abort();
974   } else {
975     min_microexon_length = ceilf(-fasterlog(1.0-powf(1.0-pvalue,1.0/(float) span)) / /*log(4)*/1.386294);
976   }
977   min_microexon_length -= 8;	/* Two donor-acceptor pairs */
978   debug1(printf("  Min microexon length is %d\n",min_microexon_length));
979   if (min_microexon_length > MAX_MICROEXON_LENGTH) {
980     *microintrontype = NONINTRON;
981     return NULL;
982   } else if (min_microexon_length < MIN_MICROEXON_LENGTH) {
983     min_microexon_length = MIN_MICROEXON_LENGTH;
984   }
985 #elif 0
986   min_microexon_length = 6;
987 #endif
988 
989   debug1(printf("\nFinding starting boundary on left\n"));
990   leftbound = 0;
991   nmismatches = 0;
992   while (leftbound < rlength - 1 && nmismatches <= 1) {
993     debug1(printf("  leftbound = %d, nmismatches = %d.",leftbound,nmismatches));
994     c = get_genomic_nt(&c_alt,goffsetL+leftbound,chroffset,chrhigh,watsonp);
995 #ifdef EXTRACT_GENOMICSEG
996     assert(c == gsequence_ucL[leftbound]);
997 #endif
998     debug1(printf("  Comparing %c with %c\n",rsequence[leftbound],c));
999 #ifdef PMAP
1000     if (matchtable[rsequence[leftbound]-'A'][c-'A'] == false) {
1001       nmismatches++;
1002     }
1003 #else
1004     if (rsequenceuc[leftbound] != c) {
1005       nmismatches++;
1006     }
1007 #endif
1008     leftbound++;
1009   }
1010   leftbound--;			/* This is where the leftmost mismatch occurred */
1011 
1012   debug1(printf("\nFinding starting boundary on right\n"));
1013   rightbound = 0;
1014   i = rlength-1;
1015   nmismatches = 0;
1016   while (i >= 0 && nmismatches <= 1) {
1017     debug1(printf("  rightbound = %d, nmismatches = %d.",rightbound,nmismatches));
1018     c = get_genomic_nt(&c_alt,rev_goffsetR-rightbound,chroffset,chrhigh,watsonp);
1019 #ifdef EXTRACT_GENOMICSEG
1020     assert(c == rev_gsequence_ucR[-rightbound]);
1021 #endif
1022     debug1(printf("  Comparing %c with %c\n",rsequence[i],c));
1023 #ifdef PMAP
1024     if (matchtable[rsequence[i]-'A'][c-'A'] == false) {
1025       nmismatches++;
1026     }
1027 #else
1028     if (rsequenceuc[i] != c) {
1029       nmismatches++;
1030     }
1031 #endif
1032     rightbound++;
1033     i--;
1034   }
1035   rightbound--;			/* This is where the rightmost mismatch occurred */
1036 
1037   debug1(printf("  Left must start before %d from left end of query.  Right must start after %d from right end of query\n",
1038 	       leftbound,rightbound));
1039 
1040   /* We require that cL >= 1 and cR >= 1 so that lengthL and lengthR are >= 1 */
1041   for (cL = 1; cL <= leftbound; cL++) {
1042     left1 = get_genomic_nt(&left1_alt,goffsetL+cL,chroffset,chrhigh,watsonp);
1043     left2 = get_genomic_nt(&left2_alt,goffsetL+cL+1,chroffset,chrhigh,watsonp);
1044 #ifdef EXTRACT_GENOMICSEG
1045     assert(left1 == gsequence_ucL[cL]);
1046     assert(left2 == gsequence_ucL[cL+1]);
1047 #endif
1048 
1049     debug1(printf("  %d: %c%c\n",cL,left1,left2));
1050     if (left1 == intron1 && left2 == intron2) {
1051       mincR = rlength - MAX_MICROEXON_LENGTH - cL;
1052       debug1(printf("  mincR %d = rlength %d - MAX_MICROEXON_LENGTH %d - cL %d\n",
1053 		    mincR,rlength,MAX_MICROEXON_LENGTH,cL));
1054       if (mincR < 1) {
1055 	mincR = 1;
1056       }
1057       maxcR = rlength - MIN_MICROEXON_LENGTH - cL;
1058       debug1(printf("  maxcR %d = rlength %d - MIN_MICROEXON_LENGTH %d - cL %d\n",
1059 		    maxcR,rlength,MIN_MICROEXON_LENGTH,cL));
1060       if (maxcR > rightbound) {
1061 	maxcR = rightbound;
1062       }
1063       debug1(printf("  Found left GT at %d.  Scanning from %d - cL - (1-7), or %d to %d\n",
1064 		   cL,rlength,mincR,maxcR));
1065       for (cR = mincR; cR <= maxcR; cR++) {
1066 	right2 = get_genomic_nt(&right2_alt,rev_goffsetR-cR-1,chroffset,chrhigh,watsonp);
1067 	right1 = get_genomic_nt(&right1_alt,rev_goffsetR-cR,chroffset,chrhigh,watsonp);
1068 #ifdef EXTRACT_GENOMICSEG
1069 	assert(right2 == rev_gsequence_ucR[-cR-1]);
1070 	assert(right1 == rev_gsequence_ucR[-cR]);
1071 #endif
1072 	debug1(printf("   Checking %d: %c%c\n",cR,right2,right1));
1073 	if (right2 == intron3 && right1 == intron4) {
1074 	  middlelength = rlength - cL - cR;
1075 	  debug1(printf("  Found pair at %d to %d, length %d.  Middle sequence is %.*s\n",
1076 		       cL,cR,middlelength,middlelength,&(rsequence[cL])));
1077 
1078 	  textleft = goffsetL + cL + MICROINTRON_LENGTH;
1079 	  textright = rev_goffsetR - cR - MICROINTRON_LENGTH;
1080 
1081 	  if (textright >= textleft + middlelength) {
1082 	    hits = BoyerMoore_nt(&(rsequence[cL]),middlelength,textleft,textright-textleft,
1083 				 chroffset,chrhigh,watsonp);
1084 #ifdef EXTRACT_GENOMICSEG
1085 	    hits_old = BoyerMoore(&(rsequenceuc[cL]),middlelength,&(genomicuc[textleft]),textright-textleft);
1086 	    assert(Intlist_equal(hits,hits_old));
1087 	    Intlist_free(&hits_old);
1088 #endif
1089 	    for (p = hits; p != NULL; p = Intlist_next(p)) {
1090 	      candidate = textleft + Intlist_head(p);
1091 #ifdef EXTRACT_GENOMICSEG
1092 	      assert(get_genomic_nt(candidate-2,chroffset,chrhigh,watsonp) == genomicuc[candidate - 2]);
1093 	      assert(get_genomic_nt(candidate-1,chroffset,chrhigh,watsonp) == genomicuc[candidate - 1]);
1094 	      assert(get_genomic_nt(candidate+middlelength,chroffset,chrhigh,watsonp) == genomicuc[candidate + middlelength]);
1095 	      assert(get_genomic_nt(candidate+middlelength+1,chroffset,chrhigh,watsonp) == genomicuc[candidate + middlelength+1]);
1096 #endif
1097 	      if (/*genomicuc[candidate - 2]*/ get_genomic_nt(&c3_alt,candidate-2,chroffset,chrhigh,watsonp) == intron3 &&
1098 		  /*genomicuc[candidate - 1]*/ get_genomic_nt(&c4_alt,candidate-1,chroffset,chrhigh,watsonp)  == intron4 &&
1099 		  /*genomicuc[candidate + middlelength]*/ get_genomic_nt(&c1_alt,candidate+middlelength,chroffset,chrhigh,watsonp) == intron1 &&
1100 		  /*genomicuc[candidate + middlelength + 1]*/ get_genomic_nt(&c2_alt,candidate+middlelength+1,chroffset,chrhigh,watsonp) == intron2) {
1101 		debug1(printf("  Successful microexon at %d >>> %d..%d >>> %d\n",goffsetL+cL,candidate,candidate+middlelength,rev_goffsetR-cR));
1102 
1103 		/* Not handling known splice sites yet */
1104 		if (watsonp == true) {
1105 		  if (cdna_direction > 0) {
1106 		    splicesitepos = chroffset + (candidate-1) + 1;
1107 		    prob2 = Maxent_hr_acceptor_prob(splicesitepos,chroffset);
1108 		    splicesitepos = chroffset + candidate+middlelength;
1109 		    prob3 = Maxent_hr_donor_prob(splicesitepos,chroffset);
1110 		  } else {
1111 		    splicesitepos = chroffset + (candidate-1) + 1;
1112 		    prob2 = Maxent_hr_antidonor_prob(splicesitepos,chroffset);
1113 		    splicesitepos = chroffset + candidate+middlelength;
1114 		    prob3 = Maxent_hr_antiacceptor_prob(splicesitepos,chroffset);
1115 		  }
1116 		} else {
1117 		  if (cdna_direction > 0) {
1118 		    splicesitepos = chrhigh - (candidate-1);
1119 		    prob2 = Maxent_hr_antiacceptor_prob(splicesitepos,chroffset);
1120 		    splicesitepos = chrhigh - (candidate+middlelength) + 1;
1121 		    prob3 = Maxent_hr_antidonor_prob(splicesitepos,chroffset);
1122 		  } else {
1123 		    splicesitepos = chrhigh - (candidate-1);
1124 		    prob2 = Maxent_hr_donor_prob(splicesitepos,chroffset);
1125 		    splicesitepos = chrhigh - (candidate+middlelength) + 1;
1126 		    prob3 = Maxent_hr_acceptor_prob(splicesitepos,chroffset);
1127 		  }
1128 		}
1129 
1130 		debug1(printf("microexon probabilities: prob2 = %f, prob3 = %f\n",prob2,prob3));
1131 		if (prob2 + prob3 > bestprob) {
1132 		  bestcL = cL;
1133 		  bestcR = cR;
1134 		  best_candidate = candidate;
1135 		  best_middlelength = middlelength;
1136 		  *bestprob2 = prob2;
1137 		  *bestprob3 = prob3;
1138 		  bestprob = prob2 + prob3;
1139 		}
1140 	      }
1141 	    }
1142 	    Intlist_free(&hits);
1143 	  }
1144 
1145 	}
1146       }
1147     }
1148   }
1149 
1150   if (bestcL < 0 || bestcR < 0) {
1151     debug1(printf("End of dynprog microexon int\n"));
1152 
1153     *microintrontype = NONINTRON;
1154     return NULL;
1155 
1156   } else {
1157     debug1(printf("Making microexon pairs with candidate %u\n",best_candidate));
1158     pairs = make_microexon_pairs_double(roffset,/*roffsetM*/roffset+bestcL,/*roffsetR*/roffset+bestcL+best_middlelength,
1159 					goffsetL,/*candidate*/best_candidate,/*goffsetR*/rev_goffsetR-bestcR+1,
1160 					/*lengthL*/bestcL,/*lengthM*/best_middlelength,/*lengthR*/bestcR,
1161 					queryseq,queryuc,
1162 					chroffset,chrhigh,watsonp,genestrand,pairpool,gapchar,*dynprogindex);
1163     *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
1164     return pairs;
1165   }
1166 }
1167 
1168 
1169 #if 0
1170 /* Based on probability of seeing a pattern of length n in L is
1171    1-(1-p1)^L, where p1 is 4^n.  We determine L so chance probability
1172    is less than ENDSEQUENCE_PVALUE */
1173 static int
1174 search_length (int endlength, int maxlength, bool end_microexons_p) {
1175   double p1;
1176   int effective_maxlength, extrant, result;
1177 
1178   if (end_microexons_p == true) {
1179     extrant = 4;		/* Count the four nucleotides at the intron bounds */
1180     effective_maxlength = maxlength;
1181   } else {
1182     extrant = 0;		/* Don't count the four nucleotides */
1183     effective_maxlength = 5000;
1184     if (maxlength < effective_maxlength) {
1185       effective_maxlength = maxlength;
1186     }
1187   }
1188 
1189   if (endlength + extrant > 12) {
1190     debug(printf("  Search length for endlength of %d is maxlength %d\n",endlength,effective_maxlength));
1191     return effective_maxlength;
1192   } else {
1193     p1 = 1.0/pow(4.0,(double) (endlength + extrant));
1194     result = (int) (fasterlog(1.0-ENDSEQUENCE_PVALUE)/fasterlog(1-p1));
1195     debug(printf("  Search length for endlength of %d plus extra nt of %d is %d\n",endlength,extrant,result));
1196     if (result > effective_maxlength) {
1197       return effective_maxlength;
1198     } else {
1199       return result;
1200     }
1201   }
1202 }
1203 #endif
1204 
1205 
1206 #if 0
1207 /* Not currently used */
1208 List_T
1209 Dynprog_microexon_5 (int *dynprogindex, int *microintrontype, int *microexonlength,
1210 		     char *rev_rsequence, char *rev_rsequenceuc, char *rev_gsequence, char *rev_gsequence_uc,
1211 		     int rlength, int glength, int rev_roffset, int rev_goffset, int cdna_direction,
1212 		     char *queryseq, char *queryuc, char *genomicseg, char *genomicuc,
1213 		     Pairpool_T pairpool, bool end_microexons_p) {
1214   List_T pairs = NULL;
1215   Intlist_T hits = NULL, p;
1216   int endlength, maxc, c, textleft, textright, candidate, nmismatches = 0;
1217   char right2, right1;
1218   char intron1, intron2, intron3, intron4, gapchar;
1219 
1220 #ifdef PMAP
1221   intron1 = 'G';
1222   intron2 = 'T';
1223   intron3 = 'A';
1224   intron4 = 'G';
1225   gapchar = FWD_CANONICAL_INTRON_COMP;
1226   *microintrontype = GTAG_FWD;
1227 #else
1228   if (cdna_direction > 0) {
1229     intron1 = 'G';
1230     intron2 = 'T';
1231     intron3 = 'A';
1232     intron4 = 'G';
1233     gapchar = FWD_CANONICAL_INTRON_COMP;
1234     *microintrontype = GTAG_FWD;
1235   } else if (cdna_direction < 0) {
1236     intron1 = 'C';
1237     intron2 = 'T';
1238     intron3 = 'A';
1239     intron4 = 'C';
1240     gapchar = REV_CANONICAL_INTRON_COMP;
1241     *microintrontype = GTAG_REV;
1242   } else {
1243     *microintrontype = NONINTRON;
1244     return (List_T) NULL;
1245     abort();
1246   }
1247 #endif
1248 
1249 #ifdef EXTRACT_GENOMICSEG
1250   debug(printf("Begin microexon search at 5' for %.*s\n",
1251 	       glength,&(rev_gsequence[-glength+1])));
1252 #else
1253   debug(printf("Begin microexon search at 5'\n"));
1254 #endif
1255 
1256   debug(printf("  Query sequence is %.*s\n",rlength,&(rev_rsequence[-rlength+1])));
1257 
1258   *microexonlength = 0;
1259   if (glength < rlength) {
1260     maxc = glength - MIN_MICROEXON_LENGTH;
1261   } else {
1262     maxc = rlength - MIN_MICROEXON_LENGTH;
1263   }
1264   for (c = 0; c < maxc; c++) {
1265     right2 = rev_gsequence_uc[-c-1];
1266     right1 = rev_gsequence_uc[-c];
1267     debug(printf("   Checking %c%c\n",right2,right1));
1268 #ifdef PMAP
1269     if (c > 0 && matchtable[rev_rsequence[-c+1]-'A'][rev_gsequence_uc[-c+1]-'A'] == false) {
1270       nmismatches++;
1271     }
1272 #else
1273     if (c > 0 && rev_rsequenceuc[-c+1] != rev_gsequence_uc[-c+1]) {
1274       nmismatches++;
1275     }
1276 #endif
1277     if (nmismatches > 1) {
1278       debug(printf("   Aborting at %c != %c\n",rev_rsequence[-c+1],rev_gsequence[-c+1]));
1279       *microintrontype = NONINTRON;
1280       return NULL;
1281     }
1282     if (right2 == intron3 && right1 == intron4) {
1283       endlength = rlength - c;
1284       debug(printf("  Found acceptor at %d, length %d.  End sequence is %.*s\n",
1285 		       c,endlength,endlength,&(rev_rsequence[-endlength+1])));
1286 
1287       textright = rev_goffset - c - MICROINTRON_LENGTH;
1288       textleft = textright - search_length(endlength,textright,end_microexons_p) + MICROINTRON_LENGTH;
1289 
1290       if (textright >= textleft + endlength) {
1291 	hits = BoyerMoore_nt(&(rev_rsequence[-c-endlength+1]),endlength,textleft,textright-textleft,
1292 			     chroffset,chrhigh,watsonp);
1293 	for (p = hits; p != NULL; p = Intlist_next(p)) {
1294 	  candidate = textleft + Intlist_head(p);
1295 	  if (genomicseg[candidate + endlength] == intron1 &&
1296 	      genomicseg[candidate + endlength + 1] == intron2) {
1297 	    debug(printf("  Successful microexon at %d\n",candidate));
1298 
1299 	    Intlist_free(&hits);
1300 	    *microexonlength = endlength;
1301 	    pairs = make_microexon_pairs_single(rev_roffset-c-endlength+1,rev_roffset-c+1,
1302 						candidate,rev_goffset-c+1,endlength,c,
1303 						queryseq,queryuc,chroffset,watsonp,pairpool,gapchar,*dynprogindex);
1304 	    *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
1305 	    return pairs;
1306 	  }
1307 	}
1308 	Intlist_free(&hits);
1309       }
1310 
1311     }
1312   }
1313 
1314   debug(printf("End of dynprog microexon 5\n"));
1315 
1316   *microintrontype = NONINTRON;
1317   return NULL;
1318 }
1319 #endif
1320 
1321 
1322 #if 0
1323 /* Not currently used */
1324 List_T
1325 Dynprog_microexon_3 (int *dynprogindex, int *microintrontype, int *microexonlength,
1326 		     char *rsequence, char *rsequenceuc, char *gsequence, char *gsequence_uc,
1327 		     int rlength, int glength, int roffset, int goffset, int cdna_direction,
1328 		     char *queryseq, char *queryuc, char *genomicseg, char *genomicuc,
1329 		     int genomiclength, Pairpool_T pairpool, bool end_microexons_p) {
1330   List_T pairs = NULL;
1331   Intlist_T hits = NULL, p;
1332   int endlength, maxc, c, textleft, textright, candidate, nmismatches = 0;
1333   char left1, left2;
1334   char intron1, intron2, intron3, intron4, gapchar;
1335 
1336 #ifdef PMAP
1337   intron1 = 'G';
1338   intron2 = 'T';
1339   intron3 = 'A';
1340   intron4 = 'G';
1341   gapchar = FWD_CANONICAL_INTRON_COMP;
1342   *microintrontype = GTAG_FWD;
1343 #else
1344   if (cdna_direction > 0) {
1345     intron1 = 'G';
1346     intron2 = 'T';
1347     intron3 = 'A';
1348     intron4 = 'G';
1349     gapchar = FWD_CANONICAL_INTRON_COMP;
1350     *microintrontype = GTAG_FWD;
1351   } else if (cdna_direction < 0) {
1352     intron1 = 'C';
1353     intron2 = 'T';
1354     intron3 = 'A';
1355     intron4 = 'C';
1356     gapchar = REV_CANONICAL_INTRON_COMP;
1357     *microintrontype = GTAG_REV;
1358   } else {
1359     *microintrontype = NONINTRON;
1360     return (List_T) NULL;
1361     abort();
1362   }
1363 #endif
1364 
1365 #ifdef EXTRACT_GENOMICSEG
1366   debug(printf("Begin microexon search at 3' for %.*s\n",glength,gsequence));
1367 #else
1368   debug(printf("Begin microexon search at 3'\n"));
1369 #endif
1370 
1371   debug(printf("  Query sequence is %.*s\n",rlength,rsequence));
1372 
1373   *microexonlength = 0;
1374   if (glength < rlength) {
1375     maxc = glength - MIN_MICROEXON_LENGTH;
1376   } else {
1377     maxc = rlength - MIN_MICROEXON_LENGTH;
1378   }
1379   for (c = 0; c < maxc; c++) {
1380     left1 = gsequence_uc[c];
1381     left2 = gsequence_uc[c+1];
1382     debug(printf("   Checking %c%c\n",left1,left2));
1383 #ifdef PMAP
1384     if (c > 0 && matchtable[rsequence[c-1]-'A'][gsequence_uc[c-1]-'A'] == false) {
1385       nmismatches++;
1386     }
1387 #else
1388     if (c > 0 && rsequenceuc[c-1] != gsequence_uc[c-1]) {
1389       nmismatches++;
1390     }
1391 #endif
1392     if (nmismatches > 1) {
1393       debug(printf("   Aborting at %c != %c\n",rsequence[c-1],gsequence[c-1]));
1394       *microintrontype = NONINTRON;
1395       return NULL;
1396     }
1397     if (left1 == intron1 && left2 == intron2) {
1398       endlength = rlength - c;
1399       debug(printf("  Found donor at %d, length %d.  End sequence is %.*s\n",
1400 		   c,endlength,endlength,&(rsequence[c])));
1401 
1402       textleft = goffset + c;
1403       textright = textleft + search_length(endlength,genomiclength-textleft,end_microexons_p);
1404 
1405       if (textright >= textleft + endlength) {
1406 	hits = BoyerMoore_nt(&(rsequence[c]),endlength,textleft,textright-textleft,
1407 			     chroffset,chrhigh,watsonp);
1408 	for (p = hits; p != NULL; p = Intlist_next(p)) {
1409 	  candidate = textleft + Intlist_head(p);
1410 	  if (genomicseg[candidate - 2] == intron3 &&
1411 	      genomicseg[candidate - 1] == intron4) {
1412 	    debug(printf("  Successful microexon at %d\n",candidate));
1413 
1414 	    Intlist_free(&hits);
1415 	    *microexonlength = endlength;
1416 	    pairs = make_microexon_pairs_single(roffset,roffset+c,
1417 						goffset,candidate,c,endlength,
1418 						queryseq,queryuc,genomicseg,genomicuc,
1419 						pairpool,gapchar,*dynprogindex);
1420 	    *dynprogindex += (*dynprogindex > 0 ? +1 : -1);
1421 	    return pairs;
1422 	  }
1423 	}
1424 	Intlist_free(&hits);
1425       }
1426 
1427     }
1428   }
1429 
1430   debug(printf("End of dynprog microexon 3\n"));
1431 
1432   *microintrontype = NONINTRON;
1433   return NULL;
1434 }
1435 #endif
1436 
1437