1 static char rcsid[] = "$Id: stage1.c 218185 2019-01-17 13:05:01Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "stage1.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>		/* For memset */
10 
11 #include "bool.h"
12 #include "mem.h"
13 #include "reader.h"
14 #include "block.h"
15 #ifdef PMAP
16 #include "oligop.h"
17 #else
18 #include "oligo.h"
19 #endif
20 #include "intlist.h"
21 #include "matchdef.h"
22 #include "match.h"
23 #include "gregion.h"
24 #include "orderstat.h"
25 
26 
27 /* Restoring old scan_ends algorithm, removed on 2012-05-14, to get
28    ends correctly (e.g., NM_004449) */
29 #define SCAN_ENDS 1
30 
31 
32 /* Need to limit ninrange in find_range, or else we get bogged down in
33    repeats */
34 #define MAX_NINRANGE 100
35 
36 
37 #define MAX_INDELS 15
38 #define MIN_REPEAT 6
39 #define MAXENTRIES 100
40 #define MIN_MATCH_WEIGHT 0.05	/* For connectablep */
41 
42 #define SUFFICIENT_FIRST_WEIGHT 0.50
43 
44 #define PCT_MAX_SUPPORT 0.70
45 #define DIFF_MAX_SUPPORT 200
46 #define BOUNDARY_SUPPORT 666 	/* DIFF_MAXSUPPORT/(1-PCT_MAX_SUPPORT) */
47 
48 #ifdef PMAP
49 #define VERYSHORTSEQLEN 30
50 #define SHORTSEQLEN 90
51 #define SINGLEEXONLENGTH 30	/* per nt */
52 #define SLOPE 2400		/* genomic bp per aa */
53 #define NOEXTENDLEN 6		/* in aa */
54 #else
55 #define VERYSHORTSEQLEN 60
56 #define SHORTSEQLEN 90
57 #define SINGLEEXONLENGTH 90	/* per nt */
58 #define SLOPE 800		/* genomic bp per nt */
59 #define NOEXTENDLEN 6		/* in nt */
60 #endif
61 
62 #define HIGHINTRONLENGTH 100000U /* Used in find_best_path */
63 
64 #define INTRONLEN_PENALTY 10000	/* Used in find_best_path on diagonal segments.  1 point for each length */
65 
66 #define MAX_ATTEMPTS_UNIT 100 /* This parameter is used to decide when to switch from 24-mers to 18-mers */
67 
68 #define MAX_FILL_IN 200
69 #define MAX_DANGLING_PCT 0.33
70 
71 #define SAMPLERUN 50
72 
73 #define COLLAPSE_DISTANCE 12
74 #define SUBOPTIMAL 36
75 
76 #define MAX_GREGIONS_PRE_UNIQUE 1000
77 #define MAX_GREGIONS_POST_UNIQUE 100
78 
79 #define BINARY_FOLDDIFF 8
80 
81 #define MAXEXONS 3
82 
83 /* Once a match at a genomic location has PROMISCUOUS matches locally,
84    it is unlikely that further matches will help define that candidate
85    segment.  Allowing PROMISCUOUS to be greater than 1 allows more
86    candidate segments to be passed to stage 2.  However, if
87    PROMISCUOUS is set too high, then a single pair of matches may make
88    the algorithm miss the correct spot.  For example, PROMISCUOUS of 9
89    or greater will miss the spliced form of BM994949, on chr4, and get
90    a pseudogene that contains a poly-T on chr15. */
91 
92 #define PROMISCUOUS 4
93 
94 /* Debugging of scanning for 24-mers */
95 #ifdef DEBUG
96 #define debug(x) x
97 static Univ_IIT_T global_chromosome_iit;
98 static char *queryuc_ptr;
99 #else
100 #define debug(x)
101 #endif
102 
103 /* final results */
104 #ifdef DEBUG0
105 #define debug0(x) x
106 #else
107 #define debug0(x)
108 #endif
109 
110 /* show positions.  Should also specify DEBUG */
111 #ifdef DEBUG1
112 #define debug1(x) x
113 #else
114 #define debug1(x)
115 #endif
116 
117 /* Debugging of 12-mer extension at ends */
118 #ifdef DEBUG2
119 #define debug2(x) x
120 #else
121 #define debug2(x)
122 #endif
123 
124 /* Detailed view of scanning for 24-mers */
125 #ifdef DEBUG3
126 #define debug3(x) x
127 #else
128 #define debug3(x)
129 #endif
130 
131 /* Double/triple matching, including binary search */
132 #ifdef DEBUG4
133 #define debug4(x) x
134 #else
135 #define debug4(x)
136 #endif
137 
138 /* connectable_p */
139 #ifdef DEBUG5
140 #define debug5(x) x
141 #else
142 #define debug5(x)
143 #endif
144 
145 
146 /* collapse_diagonals and find_segments */
147 #ifdef DEBUG6
148 #define debug6(x) x
149 #else
150 #define debug6(x)
151 #endif
152 
153 /* find_segments_heap */
154 #ifdef DEBUG7
155 #define debug7(x) x
156 #else
157 #define debug7(x)
158 #endif
159 
160 /* compute_paths */
161 #ifdef DEBUG8
162 #define debug8(x) x
163 #else
164 #define debug8(x)
165 #endif
166 
167 /* find_good_paths */
168 #ifdef DEBUG9
169 #define debug9(x) x
170 #else
171 #define debug9(x)
172 #endif
173 
174 
175 #ifdef PMAP
176 static Width_T index1part_aa;
177 static int leftreadshift;
178 static Chrpos_T maxextension;
179 static Chrpos_T maxtotallen_bound;
180 static int circular_typeint;
181 
182 void
Stage1_setup(Width_T index1part_aa_in,Chrpos_T maxextension_in,Chrpos_T maxtotallen_bound_in,int circular_typeint_in)183 Stage1_setup (Width_T index1part_aa_in, Chrpos_T maxextension_in, Chrpos_T maxtotallen_bound_in,
184 	      int circular_typeint_in) {
185   index1part_aa = index1part_aa_in;
186   leftreadshift = 32 - 2*index1part_aa_in; /* chars are shifted into left of a 32 bit word */
187   maxextension = maxextension_in;
188   maxtotallen_bound = maxtotallen_bound_in;
189   circular_typeint = circular_typeint_in;
190   return;
191 }
192 
193 #else
194 static Width_T index1part;
195 static int leftreadshift;
196 static Chrpos_T maxextension;
197 static Chrpos_T maxtotallen_bound;
198 static Oligospace_T oligobase_mask;
199 static int circular_typeint;
200 
201 void
Stage1_setup(Width_T index1part_in,Chrpos_T maxextension_in,Chrpos_T maxtotallen_bound_in,int circular_typeint_in)202 Stage1_setup (Width_T index1part_in, Chrpos_T maxextension_in, Chrpos_T maxtotallen_bound_in,
203 	      int circular_typeint_in) {
204   index1part = index1part_in;
205 #ifdef HAVE_64_BIT
206   leftreadshift = 64 - 2*index1part_in; /* chars are shifted into left of a 64-bit word */
207   oligobase_mask = ~(~ (Oligospace_T) 0 << 2*index1part_in);
208 #else
209   leftreadshift = 32 - 2*index1part_in; /* chars are shifted into left of a 32-bit word */
210   oligobase_mask = ~(~ (Oligospace_T) 0 << 2*index1part_in);
211 #endif
212   maxextension = maxextension_in;
213   maxtotallen_bound = maxtotallen_bound_in;
214   circular_typeint = circular_typeint_in;
215   return;
216 }
217 #endif
218 
219 
220 
221 #define T Stage1_T
222 struct T {
223   int trimstart;
224   int trimend;
225   int trimlength;
226   int maxtotallen;
227   int querylength;
228   int maxentries;
229 
230   Width_T oligosize;
231 
232   Block_T block5;
233   Block_T block3;
234   List_T matches5;
235   List_T matches3;
236   Univcoord_T **plus_positions;
237   Univcoord_T **minus_positions;
238   int *plus_npositions;
239   int *minus_npositions;
240   bool *plus_matchedp;		/* For identify_matches */
241   bool *minus_matchedp;
242   bool *processedp;		/* For Block_process_oligo */
243   bool *validp;			/* For Block_process_oligo */
244   int querystart;		/* Changes during computation */
245   int queryend;			/* Changes during computation */
246 
247 #ifdef PMAP
248   Oligospace_T *oligos;
249 #else
250   Oligospace_T *forward_oligos;
251   Oligospace_T *revcomp_oligos;
252 #endif
253 };
254 
255 
256 #ifdef CHECK
257 static void
Stage1_check(T this)258 Stage1_check (T this) {
259   int i, npositions, j;
260 
261   for (i = 0; i < this->querylength; i++) {
262     npositions = this->plus_npositions[i];
263     if (npositions == 0 && this->plus_positions[i] != NULL) {
264       fprintf(stderr,"npositions = 0, this->plus_positions[i] != NULL\n");
265       abort();
266     } else if (npositions != 0 && this->plus_positions[i] == NULL) {
267       fprintf(stderr,"npositions != 0, this->plus_positions[i] == NULL\n");
268       abort();
269     } else {
270       for (j = 0; j < npositions; j++) {
271 	if (this->plus_positions[i][j] == 0U) {
272 	  fprintf(stderr,"this->plus_positions[i][i] == 0\n");
273 	  abort();
274 	}
275       }
276     }
277 
278     npositions = this->minus_npositions[i];
279     if (npositions == 0 && this->minus_positions[i] != NULL) {
280       fprintf(stderr,"npositions = 0, this->minus_positions[i] != NULL\n");
281       abort();
282     } else if (npositions != 0 && this->minus_positions[i] == NULL) {
283       fprintf(stderr,"npositions != 0, this->minus_positions[i] == NULL\n");
284       abort();
285     } else {
286       for (j = 0; j < npositions; j++) {
287 	if (this->minus_positions[i][j] == 0U) {
288 	  fprintf(stderr,"this->minus_positions[i][i] == 0\n");
289 	  abort();
290 	}
291       }
292     }
293   }
294 
295   return;
296 
297 }
298 #endif
299 
300 
301 
302 static T
Stage1_new(Sequence_T queryuc,int maxtotallen,int maxentries)303 Stage1_new (Sequence_T queryuc, int maxtotallen, int maxentries) {
304   T new = (T) MALLOC(sizeof(*new));
305   Reader_T reader;
306 
307 #ifdef PMAP
308   new->querylength = Sequence_fulllength_given(queryuc);
309 #else
310   new->querylength = Sequence_fulllength(queryuc);
311 #endif
312 
313   new->querystart = Sequence_trim_start(queryuc);
314   new->queryend = Sequence_trim_end(queryuc);
315   new->trimstart = new->querystart;
316   new->trimend = new->queryend;
317   new->trimlength = Sequence_trimlength(queryuc);
318 
319   new->maxtotallen = maxtotallen;
320   new->maxentries = maxentries;
321 #ifdef PMAP
322   new->oligosize = index1part_aa;
323 #else
324   new->oligosize = index1part;
325 #endif
326 
327 #if 0
328   debug(Sequence_print(stdout,queryuc,/*uppercasep*/true,/*wraplength*/50,/*trimmedp*/true));
329 #endif
330 #ifdef PMAP
331   reader = Reader_new(Sequence_fullpointer(queryuc),new->querystart,new->queryend);
332   new->block5 = Block_new(FIVE,/*oligosize*/index1part_aa,reader,3*new->querylength);
333   new->block3 = Block_new(THREE,/*oligosize*/index1part_aa,reader,3*new->querylength);
334 #else
335   reader = Reader_new(Sequence_fullpointer(queryuc),new->querystart,new->queryend);
336   new->block5 = Block_new(FIVE,/*oligosize*/index1part,leftreadshift,reader,new->querylength);
337   new->block3 = Block_new(THREE,/*oligosize*/index1part,leftreadshift,reader,new->querylength);
338 #endif
339   new->matches5 = NULL;
340   new->matches3 = NULL;
341 
342   new->plus_positions = (Univcoord_T **) CALLOC(new->querylength,sizeof(Univcoord_T *));
343   new->minus_positions = (Univcoord_T **) CALLOC(new->querylength,sizeof(Univcoord_T *));
344   new->plus_npositions = (int *) CALLOC(new->querylength,sizeof(int));
345   new->minus_npositions = (int *) CALLOC(new->querylength,sizeof(int));
346   new->processedp = (bool *) CALLOC(new->querylength,sizeof(bool));
347 
348   new->plus_matchedp = (bool *) CALLOC(new->querylength,sizeof(bool));
349   new->minus_matchedp = (bool *) CALLOC(new->querylength,sizeof(bool));
350 
351   new->validp = (bool *) NULL;
352 #ifdef PMAP
353   new->oligos = (Oligospace_T *) NULL;
354 #else
355   new->forward_oligos = (Oligospace_T *) NULL;
356   new->revcomp_oligos = (Oligospace_T *) NULL;
357 #endif
358 
359   return new;
360 }
361 
362 static void
Stage1_free(T * old)363 Stage1_free (T *old) {
364   Reader_T reader;
365   int i;
366 
367   /* Stage1_check(*old); */
368 
369   if ((*old)->validp != NULL) {
370     FREE((*old)->validp);
371 #ifdef PMAP
372     FREE((*old)->oligos);
373 #else
374     FREE((*old)->revcomp_oligos);
375     FREE((*old)->forward_oligos);
376 #endif
377   }
378 
379   for (i = 0; i < (*old)->querylength; i++) {
380     if ((*old)->plus_positions[i] != NULL) {
381       FREE((*old)->plus_positions[i]);
382     }
383     if ((*old)->minus_positions[i] != NULL) {
384       FREE((*old)->minus_positions[i]);
385     }
386   }
387 
388   FREE((*old)->plus_positions);
389   FREE((*old)->minus_positions);
390   FREE((*old)->plus_npositions);
391   FREE((*old)->minus_npositions);
392   FREE((*old)->processedp);
393 
394   FREE((*old)->plus_matchedp);
395   FREE((*old)->minus_matchedp);
396 
397   reader = Block_reader((*old)->block5);
398   Reader_free(&reader);
399 
400   Block_free(&(*old)->block3);
401   Block_free(&(*old)->block5);
402 
403   FREE(*old);
404   return;
405 }
406 
407 
408 
409 /* This procedure is called a lot.  Replacing calls to Match_chrnum and so on with direct calls to Match_T object */
410 static bool
connectable_p(Match_T match5,Match_T match3,int maxtotallen)411 connectable_p (Match_T match5, Match_T match3, int maxtotallen) {
412   Univcoord_T position5, position3;
413   int querypos5, querypos3, exonlen;
414   bool forwardp5, forwardp3;
415 
416   debug5(printf("Comparing #%d:%u at querypos %d with #%d:%u at querypos %d\n",
417 		Match_chrnum(match5),Match_chrpos(match5),Match_querypos(match5),
418 		Match_chrnum(match3),Match_chrpos(match3),Match_querypos(match3)));
419 
420   if (match5->chrnum != match3->chrnum) {
421     debug5(printf("No, different chromosomes\n\n"));
422     return false;
423   } else {
424     querypos5 = match5->querypos;
425     querypos3 = match3->querypos;
426     exonlen = querypos3 - querypos5;
427 #if 0
428     if (exonlen < matchinterval) {
429       debug5(printf("No, exonlen == 0\n\n"));
430       return false;
431     } else {
432 #endif
433       position5 = match5->position;
434       position3 = match3->position;
435       /* intronlen = abs(position3 - position5) - exonlen; -- shouldn't subtract unsigned ints */
436       if (position3 > position5) {
437 	/* intronlen = position3 - position5 - exonlen; -- Don't subtract into a signed int */
438 	/* The check below is equivalent to intronlen > maxtotallen */
439 	if (position3 > (Univcoord_T) maxtotallen + position5 + (Univcoord_T) exonlen) {
440 	  debug5(printf("No, intron too long (%u > %u + %u + %u)\n\n",
441 			position3,maxtotallen,position5,exonlen));
442 	  return false;
443 	}
444       } else {
445 	/* intronlen = position5 - position3 - exonlen; -- Don't subtract into a signed int */
446 	/* The check below is equivalent to intronlen > maxtotallen */
447 	if (position5 > (Univcoord_T) maxtotallen + position3 + (Univcoord_T) exonlen) {
448 	  debug5(printf("No, intron too long (%u > %u + %u + %u)\n\n",
449 			position5,maxtotallen,position3,exonlen));
450 	  return false;
451 	}
452       }
453       forwardp5 = match5->forwardp;
454       forwardp3 = match3->forwardp;
455 
456       if (forwardp5 != forwardp3) {
457 	debug5(printf("No, forwardp different\n\n"));
458 	return false;
459       } else if (forwardp5 == true && position3 < position5) {
460 	debug5(printf("No, forwardp true, but positions wrong\n\n"));
461 	return false;
462       } else if (forwardp5 == false && position5 < position3) {
463 	debug5(printf("No, forwardp false, but positions wrong\n\n"));
464 	return false;
465       } else if (match5->weight < MIN_MATCH_WEIGHT || match3->weight < MIN_MATCH_WEIGHT) {
466 	debug5(printf("Yes, but weights are too small: %.1f %.1f\n\n",
467 		      match5->weight,match3->weight));
468 	return false;
469       } else {
470 	debug5(printf("Yes\n\n"));
471 	return true;
472       }
473 #if 0
474     }
475 #endif
476   }
477 }
478 
479 
480 
481 /* This procedure is called a lot.  Replacing calls to Match_npairings and so on with direct calls to Match_T object */
482 /* Updates a list of Stage1_T objects */
483 static List_T
pair_up(bool * foundpairp,List_T gregionlist,Width_T matchsize,List_T newmatches5,List_T newmatches3,List_T matches5,List_T matches3,int genestrand,Univ_IIT_T chromosome_iit,int querylength,int maxtotallen,int trimstart,int trimend,int trimlength)484 pair_up (bool *foundpairp, List_T gregionlist, Width_T matchsize,
485 	 List_T newmatches5, List_T newmatches3, List_T matches5, List_T matches3,
486 	 int genestrand, Univ_IIT_T chromosome_iit, int querylength,
487 	 int maxtotallen, int trimstart, int trimend, int trimlength) {
488   List_T p, q, s, new_gregions = NULL;
489   Match_T match5, match3;
490   /* Width_T matchinterval; */
491 #ifdef DEBUG
492   Gregion_T gregion;
493 #endif
494 
495   /* matchinterval = matchsize - oligosize; */
496 
497   /* Do N vs N */
498   for (q = newmatches5; q != NULL; q = q->rest) {
499     match5 = (Match_T) q->first;
500     if (match5->npairings < PROMISCUOUS) {
501       for (s = newmatches3; s != NULL; s = s->rest) {
502 	match3 = (Match_T) s->first;
503 	if (match3->npairings < PROMISCUOUS) {
504 	  if (connectable_p(match5,match3,maxtotallen)) {
505 #if 1
506 	    if (Match_acceptable_pair(match5,match3,trimlength,matchsize) == true) {
507 #endif
508 	      new_gregions = List_push(new_gregions,Gregion_new_from_matches(match5,match3,genestrand,chromosome_iit,
509 									     querylength,matchsize,trimstart,trimend,
510 									     circular_typeint));
511 #if 1
512 	    }
513 #endif
514 	  }
515 	}
516       }
517     }
518   }
519 
520   /* Do N vs (N-1..1) */
521   for (q = newmatches5; q != NULL; q = q->rest) {
522     match5 = (Match_T) q->first;
523     if (match5->npairings < PROMISCUOUS) {
524       for (s = matches3; s != NULL; s = s->rest) {
525 	match3 = (Match_T) s->first;
526 	if (match3->npairings < PROMISCUOUS) {
527 	  if (connectable_p(match5,match3,maxtotallen)) {
528 #if 1
529 	    if (Match_acceptable_pair(match5,match3,trimlength,matchsize) == true) {
530 #endif
531 	      new_gregions = List_push(new_gregions,Gregion_new_from_matches(match5,match3,genestrand,chromosome_iit,
532 									     querylength,matchsize,trimstart,trimend,
533 									     circular_typeint));
534 #if 1
535 	    }
536 #endif
537 	  }
538 	}
539       }
540     }
541   }
542 
543   /* Do (N-1..1) vs N */
544   for (q = matches5; q != NULL; q = q->rest) {
545     match5 = (Match_T) q->first;
546     if (match5->npairings < PROMISCUOUS) {
547       for (s = newmatches3; s != NULL; s = s->rest) {
548 	match3 = (Match_T) s->first;
549 	if (match3->npairings < PROMISCUOUS) {
550 	  if (connectable_p(match5,match3,maxtotallen)) {
551 #if 1
552 	    if (Match_acceptable_pair(match5,match3,trimlength,matchsize) == true) {
553 #endif
554 	      new_gregions = List_push(new_gregions,Gregion_new_from_matches(match5,match3,genestrand,chromosome_iit,
555 									     querylength,matchsize,trimstart,trimend,
556 									     circular_typeint));
557 #if 1
558 	    }
559 #endif
560 	  }
561 	}
562       }
563     }
564   }
565 
566   if (new_gregions == NULL) {
567     debug(printf("--No new gregions found\n"));
568     *foundpairp = false;
569 
570   } else {
571     debug(printf("--%d new gregions found before uniq\n",List_length(new_gregions)));
572     debug(
573 	  for (p = new_gregions; p != NULL; p = p->rest) {
574 	    gregion = (Gregion_T) List_head(p);
575 	    printf("Before uniq: ");
576 	    Gregion_print(gregion);
577 	  }
578 	  );
579 
580 #if 0
581     new_gregions = Gregion_filter_unique(new_gregions);
582 #endif
583 
584     debug(printf("--%d new gregions found after uniq\n",List_length(new_gregions)));
585     *foundpairp = true;
586     for (p = new_gregions; p != NULL; p = p->rest) {
587 	debug(
588 	      gregion = (Gregion_T) List_head(p);
589 	      printf("After uniq: ");
590 	      Gregion_print(gregion);
591 	      );
592 	gregionlist = List_push(gregionlist,(Gregion_T) List_head(p));
593     }
594     List_free(&new_gregions);
595   }
596 
597   return gregionlist;
598 }
599 
600 
601 #if 0
602 static bool
603 repetitivep (Oligospace_T oligo, int oligosize) {
604   int i;
605 
606   oligos = (Oligospace_T *) CALLOC(this->querylength,sizeof(Oligospace_T));
607   for (querypos = 0; querypos <= this->querylength - this->oligobase; querypos++) {
608     if (this->validp[querypos] == true) {
609       oligos[k++] = this->forward_oligos[querypos];
610     }
611   }
612   qsort(oligos,k,sizeof(Oligospace_T),Oligospace_compare);
613 
614   for (i = 0, j = 1; j < k; i++, j++) {
615     if (oligos[j] == oligos[i]) {
616       debug(printf("Found repetition of oligo %06X == %06X\n",oligos[i],oligos[j]));
617       FREE(oligos);
618       return true;
619     }
620   }
621 
622   FREE(oligos);
623   return false;
624 }
625 #endif
626 
627 
628 static List_T
identify_singles(int * nnew,bool * overflowp,List_T matches,int merstart,Univcoord_T positionadj,int querylength,Univcoord_T * positions,int npositions,Width_T matchsize,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,bool forwardp,bool fivep,int maxentries)629 identify_singles (int *nnew, bool *overflowp, List_T matches, int merstart, Univcoord_T positionadj,
630 		  int querylength, Univcoord_T *positions, int npositions,
631 #ifdef DEBUG
632 		  Width_T matchsize,
633 #endif
634 		  Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
635 		  Matchpool_T matchpool, bool forwardp, bool fivep, int maxentries) {
636   List_T newmatches = NULL, p;
637   Match_T match;
638   Univcoord_T position;
639   int i = 0, nentries = 0;
640   bool donep = false;
641   double weight;
642 
643   if (npositions == 0) {
644     *nnew = 0;
645     *overflowp = false;
646     return matches;
647   } else {
648     position = positions[0];
649   }
650 
651   Matchpool_save(matchpool);
652 
653   while (donep == false) {
654     if (position >= chrsubset_start && position < chrsubset_end) {
655       if (++nentries > maxentries) {
656 	donep = true;
657       } else {
658 	newmatches = Matchpool_push(newmatches,matchpool,merstart,querylength,forwardp,fivep,
659 				    position+positionadj,chromosome_iit);
660       }
661     }
662 
663     /* Advance */
664     if (++i >= npositions) {
665       donep = true;
666     } else {
667       position = positions[i];
668     }
669   }
670 
671   if (nentries > maxentries) {
672     /* Too many entries.  Not unique enough in genome */
673     *nnew = 0;
674     *overflowp = true;
675     debug(printf("  Singles overflow at %d\n",merstart));
676 
677     /* Not necessary to free */
678     Matchpool_restore(matchpool);
679 
680   } else {
681     *nnew = nentries;
682     *overflowp = false;
683     if (nentries > 0) {
684       weight = 1.0/(double) nentries;
685       for (p = newmatches; p != NULL; p = p->rest) {
686 	match = (Match_T) p->first;
687 	Match_set_weight(match,weight);
688 
689 	debug(
690 	      Match_print(match,chromosome_iit);
691 	      printf("\n");
692 	      );
693       }
694     }
695     matches = Matchpool_transfer(matches,newmatches);
696   }
697 
698   return matches;
699 }
700 
701 
702 static int
binary_search(int lowi,int highi,Univcoord_T * positions,Univcoord_T goal)703 binary_search (int lowi, int highi, Univcoord_T *positions, Univcoord_T goal) {
704   bool foundp = false;
705   int middlei;
706 
707 #ifdef NOBINARY
708   return lowi;
709 #endif
710 
711   if (goal == 0U) {
712     return lowi;
713   }
714 
715   while (!foundp && lowi < highi) {
716     middlei = lowi + ((highi - lowi) / 2);
717     debug4(printf("  binary: %d:%u %d:%u %d:%u   vs. %u\n",
718 		      lowi,positions[lowi],middlei,positions[middlei],
719 		      highi,positions[highi],goal));
720     if (goal < positions[middlei]) {
721       highi = middlei;
722     } else if (goal > positions[middlei]) {
723       lowi = middlei + 1;
724     } else {
725       foundp = true;
726     }
727   }
728 
729   if (foundp == true) {
730     debug4(printf("returning %d\n\n",middlei));
731     return middlei;
732   } else {
733     debug4(printf("returning %d\n\n",highi));
734     return highi;
735   }
736 }
737 
738 
739 /* positions0 should be left of positions1, and expecteddist should be
740  * positive.  This procedure assumes that the entries in position0 and
741  * positions1 are in order.  They are assumed not to have duplicates, which
742  * are removed by Indexdb_write_positions and Indexdb_read. */
743 static List_T
identify_doubles(int * nnew,bool * overflowp,List_T matches,int merstart,Univcoord_T positionadj,int querylength,Univcoord_T * positions0,int npositions0,Univcoord_T * positions1,int npositions1,Width_T matchsize,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,bool forwardp,bool fivep,int maxentries)744 identify_doubles (int *nnew, bool *overflowp, List_T matches, int merstart, Univcoord_T positionadj,
745 		  int querylength, Univcoord_T *positions0, int npositions0,
746 		  Univcoord_T *positions1, int npositions1,
747 #ifdef DEBUG
748 		  Width_T matchsize,
749 #endif
750 		  Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
751 		  Matchpool_T matchpool, bool forwardp, bool fivep, int maxentries) {
752   List_T newmatches = NULL, p;
753   Match_T match;
754   Univcoord_T expected0, position0, expected1, position1;
755   int i = 0, j = 0, nentries = 0;
756   bool donep = false;
757   double weight;
758 
759   /*
760   debug(printf("Entering identify_doubles with merstart = %d, positionadj = %u, expecteddist = %d\n",
761 	       merstart,positionadj,expecteddist));
762   */
763   if (npositions0 == 0) {
764     /* debug(printf("Leaving identify_doubles because npositions0 == 0\n\n")); */
765     *nnew = 0;
766     *overflowp = false;
767     return matches;
768   } else {
769     position0 = positions0[0];
770     expected1 = position0 /*+ expecteddist*/;
771     if (npositions1 == 0) {
772       /* debug(printf("Leaving identify_doubles because npositions1 == 0\n\n")); */
773       *nnew = 0;
774       *overflowp = false;
775       return matches;
776     } else {
777       position1 = positions1[0];
778 #if 0
779       if (position1 < expecteddist) {
780 	expected0 = 0U;
781       } else {
782 #endif
783 	expected0 = position1 /*- expecteddist*/;
784 #if 0
785       }
786 #endif
787     }
788   }
789 
790   Matchpool_save(matchpool);
791 
792   while (donep == false) {
793     debug4(printf("  %d:%u %d:%u\n",i,position0,j,position1));
794     debug4(
795 	   if (abs(position1-position0) < 100) {
796 	     printf("Close: %u %u\n",position0,position1);
797 	   }
798 	  );
799 
800     if (expected1 < position1) {
801       /* Advance position0 */
802       if (npositions0 - i < BINARY_FOLDDIFF*(npositions1 - j)) {
803 	i++;
804       } else {
805 	debug4(printf("  remaining positions: %d and %d => binary\n",npositions0-i,npositions1-j));
806 	i = binary_search(i+1,npositions0,positions0,expected0);
807       }
808       if (i >= npositions0) {
809 	donep = true;
810       } else {
811 	position0 = positions0[i];
812 	expected1 = position0 /*+ expecteddist*/;
813       }
814 
815     } else if (expected1 > position1) {
816       /* Advance position1 */
817       if (npositions1 - j < BINARY_FOLDDIFF*(npositions0 - i)) {
818 	j++;
819       } else {
820 	debug4(printf("  remaining positions: %d and %d => binary\n",npositions0-i,npositions1-j));
821 	j = binary_search(j+1,npositions1,positions1,expected1);
822       }
823       if (j >= npositions1) {
824 	donep = true;
825       } else {
826 	position1 = positions1[j];
827 #if 0
828 	if (position1 < expecteddist) {
829 	  expected0 = 0U;
830 	} else {
831 #endif
832 	  expected0 = position1 /*- expecteddist*/;
833 #if 0
834 	}
835 #endif
836       }
837 
838     } else {
839       if (position0 >= chrsubset_start && position0 < chrsubset_end) {
840 	if (++nentries > maxentries) {
841 	  donep = true;
842 	} else {
843 	  newmatches = Matchpool_push(newmatches,matchpool,merstart,querylength,forwardp,fivep,
844 				      position0+positionadj,chromosome_iit);
845 	}
846       }
847 
848       /* Advance both */
849       if (++i >= npositions0) {
850 	donep = true;
851       } else {
852 	position0 = positions0[i];
853 	expected1 = position0 /*+ expecteddist*/;
854 	if (++j >= npositions1) {
855 	  donep = true;
856 	} else {
857 	  position1 = positions1[j];
858 	  expected0 = position1 /*- expecteddist*/;
859 	}
860       }
861     }
862   }
863 
864   if (nentries > maxentries) {
865     /* Too many entries.  Not unique enough in genome */
866     *nnew = 0;
867     *overflowp = true;
868     /* debug(printf("  Doubles overflow at %d\n\n",querypos1)); */
869 
870     /* Not necessary to free */
871     Matchpool_restore(matchpool);
872 
873   } else {
874     /* debug(printf("Leaving identify_doubles with %d matches\n\n",List_length(newmatches))); */
875     *nnew = nentries;
876     *overflowp = false;
877     if (nentries > 0) {
878       weight = 1.0/(double) nentries;
879 
880       debug(if (newmatches != NULL) {
881 	  printf("Entering identify_doubles with merstart = %d, positionadj = %u\n",
882 		 merstart,positionadj);
883 	});
884 
885       for (p = newmatches; p != NULL; p = p->rest) {
886 	match = (Match_T) p->first;
887 	Match_set_weight(match,weight);
888 	debug(
889 	      Match_print(match,chromosome_iit);
890 	      printf("\n");
891 	      );
892       }
893     }
894     matches = Matchpool_transfer(matches,newmatches);
895   }
896 
897   return matches;
898 }
899 
900 
901 /* Involves search of three lists.  This procedure assumes that the
902  * entries in positions0, positions1, and positions2 are in order.  They are assumed
903  * not to have duplicates, which are removed by
904  * Indexdb_write_positions and Indexdb_read. */
905 static List_T
identify_triples(int * nnew,bool * overflowp,List_T matches,int merstart,Univcoord_T positionadj,int querylength,Univcoord_T * positions0,int npositions0,Univcoord_T * positions1,int npositions1,Univcoord_T * positions2,int npositions2,Width_T matchsize,int expecteddist1,int expecteddist2,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,bool forwardp,bool fivep,int maxentries)906 identify_triples (int *nnew, bool *overflowp, List_T matches, int merstart, Univcoord_T positionadj,
907 		  int querylength, Univcoord_T *positions0, int npositions0,
908 		  Univcoord_T *positions1, int npositions1, Univcoord_T *positions2, int npositions2,
909 #ifdef DEBUG
910 		  Width_T matchsize,
911 #endif
912 		  int expecteddist1, int expecteddist2, Univ_IIT_T chromosome_iit,
913 		  Univcoord_T chrsubset_start, Univcoord_T chrsubset_end, Matchpool_T matchpool,
914 		  bool forwardp, bool fivep, int maxentries) {
915   List_T newmatches = NULL, p;
916   Match_T match;
917   Univcoord_T position0, expected1, position1, expected2, position2;
918   int i = 0, j = 0, k = 0, nentries = 0;
919   int low2, middle2, high2;
920   bool donep = false, foundp;
921   double weight;
922   expecteddist1 = expecteddist2 = 0;
923 
924   if (npositions0 == 0) {
925     *nnew = 0;
926     *overflowp = false;
927     return matches;
928   } else {
929     position0 = positions0[0];
930     expected1 = position0 + expecteddist1;
931     if (npositions1 == 0) {
932       *nnew = 0;
933       *overflowp = false;
934       return matches;
935     } else {
936       position1 = positions1[0];
937       expected2 = position1 + expecteddist2;
938       if (npositions2 == 0) {
939 	*nnew = 0;
940 	*overflowp = false;
941 	return matches;
942       } else {
943 	position2 = positions2[0];
944       }
945     }
946   }
947 
948 
949   Matchpool_save(matchpool);
950 
951   while (donep == false) {
952     debug4(printf("  %d:%u %d:%u %d:%u\n",
953 		  i,position0,j,position1,k,position2));
954     if (expected1 < position1) {
955       /* Advance position0 */
956       if (++i >= npositions0) {
957 	donep = true;
958       } else {
959 	position0 = positions0[i];
960 	expected1 = position0 + expecteddist1;
961       }
962     } else if (expected1 > position1) {
963       /* Advance position1 */
964       if (++j >= npositions1) {
965 	donep = true;
966       } else {
967 	position1 = positions1[j];
968 	expected2 = position1 + expecteddist2;
969       }
970     } else if (expected2 < position2) {
971       /* Advance position1 */
972       if (++j >= npositions1) {
973 	donep = true;
974       } else {
975 	position1 = positions1[j];
976 	expected2 = position1 + expecteddist2;
977       }
978     } else if (expected2 > position2) {
979       /* Binary search */
980       low2 = k+1;
981       high2 = npositions2;
982       foundp = false;
983       debug4(
984 	     for (middle2 = low2; middle2 < high2; middle2++) {
985 	       printf("  --%d:%u %d:%u %d:%u\n",
986 		      i,position0,j,position1,middle2,
987 		      positions2[middle2]);
988 	     }
989 	     );
990 
991       while (!foundp && low2 < high2) {
992 	middle2 = (low2+high2)/2;
993 	debug4(printf("  **%d:%u %d:%u %d:%u   vs. %u\n",
994 		      low2,positions2[low2],
995 		      middle2,positions2[middle2],
996 		      high2,positions2[high2],
997 		      expected2));
998 	if (expected2 < positions2[middle2]) {
999 	  high2 = middle2;
1000 	} else if (expected2 > positions2[middle2]) {
1001 	  low2 = middle2 + 1;
1002 	} else {
1003 	  foundp = true;
1004 	}
1005       }
1006       if (foundp == true) {
1007 	k = middle2;
1008 	position2 = positions2[k];
1009       } else if ((k = high2) >= npositions2) {
1010 	donep = true;
1011       } else {
1012 	position2 = positions2[k];
1013       }
1014 
1015     } else {
1016       debug4(printf("Success at position %u\n",positions0[i]));
1017       if (position0 >= chrsubset_start && position0 < chrsubset_end) {
1018 	if (++nentries > maxentries) {
1019 	  donep = true;
1020 	} else {
1021 	  newmatches = Matchpool_push(newmatches,matchpool,merstart,querylength,forwardp,fivep,
1022 				      position0+positionadj,chromosome_iit);
1023 	}
1024       }
1025 
1026       /* Advance all */
1027       if (++i >= npositions0) {
1028 	donep = true;
1029       } else {
1030 	position0 = positions0[i];
1031 	expected1 = position0 + expecteddist1;
1032 	if (++j >= npositions1) {
1033 	  donep = true;
1034 	} else {
1035 	  position1 = positions1[j];
1036 	  expected2 = position1 + expecteddist2;
1037 	  if (++k >= npositions2) {
1038 	    donep = true;
1039 	  } else {
1040 	    position2 = positions2[k];
1041 	  }
1042 	}
1043       }
1044     }
1045   }
1046 
1047   if (nentries > maxentries) {
1048     *nnew = 0;
1049     *overflowp = true;
1050     debug(printf("  Triples overflow at %d\n",merstart));
1051     Matchpool_restore(matchpool);
1052   } else {
1053     *nnew = nentries;
1054     *overflowp = false;
1055     if (nentries > 0) {
1056       weight = 1.0/(double) nentries;
1057       for (p = newmatches; p != NULL; p = p->rest) {
1058 	match = (Match_T) p->first;
1059 	Match_set_weight(match,weight);
1060       }
1061     }
1062     matches = Matchpool_transfer(matches,newmatches);
1063   }
1064 
1065   return matches;
1066 }
1067 
1068 
1069 /************************************************************************
1070 
1071               merstart
1072 5', forward:  prevpos (pos0)     querypos (pos1)
1073 5', revcomp:  prevpos (pos1)     querypos (pos0)
1074 
1075 3', forward:  querypos (pos0)    prevpos (pos1)
1076 3', revcomp:  querypos (pos1)    prevpos (pos0)
1077 
1078 ************************************************************************/
1079 
1080 static List_T
identify_matches(int * nnew,bool * overflowp,List_T matches,int querypos,int querylength,Width_T oligosize,Width_T matchinterval,Univcoord_T ** plus_positions,int * plus_npositions,Univcoord_T ** minus_positions,int * minus_npositions,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,bool forwardp,bool fivep,int maxentries)1081 identify_matches (int *nnew, bool *overflowp, List_T matches, int querypos, int querylength,
1082 		  Width_T oligosize, Width_T matchinterval,
1083 		  Univcoord_T **plus_positions, int *plus_npositions,
1084 		  Univcoord_T **minus_positions, int *minus_npositions,
1085 		  Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1086 		  Matchpool_T matchpool, bool forwardp, bool fivep, int maxentries) {
1087   int prevpos, middlepos;
1088   int pos0, pos1;
1089   Univcoord_T **positions, positionadj = 0U;
1090   int *npositions;
1091   Width_T matchsize, merstart;
1092 
1093 #ifdef PMAP
1094   matchsize = matchinterval + index1part_aa;
1095 #else
1096   matchsize = matchinterval + index1part;
1097 #endif
1098 
1099   if (fivep == true) {
1100     prevpos = querypos - matchinterval;
1101     middlepos = querypos - oligosize;
1102     merstart = prevpos;
1103   } else {
1104     prevpos = querypos + matchinterval;
1105     middlepos = querypos + oligosize;
1106     merstart = querypos;
1107   }
1108   if (forwardp == fivep) {
1109     pos0 = prevpos;
1110     pos1 = querypos;
1111   } else {
1112     pos0 = querypos;
1113     pos1 = prevpos;
1114   }
1115 
1116   if (forwardp == true) {
1117     positions = plus_positions;
1118     npositions = plus_npositions;
1119   } else {
1120     positions = minus_positions;
1121     npositions = minus_npositions;
1122 #ifndef PMAP
1123     /* Need this adjustment because gmapindex has only one
1124        idxpositions file for both strands, where position always
1125        points to lowest coordinate of the 12-mer */
1126     positionadj = matchsize - 1U;
1127 #endif
1128   }
1129 
1130   if (matchsize == oligosize) {
1131     return identify_singles(&(*nnew),&(*overflowp),matches,merstart,positionadj,querylength,
1132 			    positions[pos0],npositions[pos0],
1133 #ifdef DEBUG
1134 			    matchsize,
1135 #endif
1136 			    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1137 			    forwardp,fivep,maxentries);
1138   } else if (matchsize <= 2*oligosize) {
1139     /* debug(printf("Using positions at %d and %d\n",pos0,pos1)); */
1140     return identify_doubles(&(*nnew),&(*overflowp),matches,merstart,positionadj,querylength,
1141 			    positions[pos0],npositions[pos0],
1142 			    positions[pos1],npositions[pos1],
1143 #ifdef DEBUG
1144 			    matchsize,
1145 #endif
1146 			    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1147 			    forwardp,fivep,maxentries);
1148   } else if (matchsize == 3*oligosize) {
1149     /* This branch hasn't been tested very well */
1150     debug(printf("Using positions at %d and %d and %d\n",pos0,middlepos,pos1));
1151     return identify_triples(&(*nnew),&(*overflowp),matches,merstart,positionadj,querylength,
1152 			    positions[pos0],npositions[pos0],
1153 			    positions[middlepos],npositions[middlepos],
1154 			    positions[pos1],npositions[pos1],
1155 #ifdef DEBUG
1156 			    matchsize,
1157 #endif
1158 #ifdef PMAP
1159 			    oligosize*3,oligosize*3,
1160 #else
1161 			    oligosize,oligosize,
1162 #endif
1163 			    chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1164 			    forwardp,fivep,maxentries);
1165   } else {
1166     abort();
1167   }
1168 }
1169 
1170 
1171 /* Need to change these procedures from n*m to n+m */
1172 static List_T
find_5prime_matches(int * nnew,List_T matches5,T this,Width_T matchsize,Univcoord_T ** plus_positions,int * plus_npositions,Univcoord_T ** minus_positions,int * minus_npositions,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int querystart,int querylength,int maxentries,bool pairedp)1173 find_5prime_matches (int *nnew, List_T matches5, T this, Width_T matchsize,
1174 		     Univcoord_T **plus_positions, int *plus_npositions,
1175 		     Univcoord_T **minus_positions, int *minus_npositions,
1176 		     Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1177 		     Matchpool_T matchpool, int querystart, int querylength,
1178 		     int maxentries, bool pairedp) {
1179   Width_T merstart;	   /* Note: negative values must be allowed */
1180   int nnewplus = 0, nnewminus = 0;
1181   bool overflowp;
1182   Width_T matchinterval;
1183 
1184 #ifdef PMAP
1185   matchinterval = matchsize - index1part_aa;
1186   /*matchinterval_nt = matchinterval*3; */
1187 #else
1188   matchinterval = matchsize - index1part;
1189   /* matchinterval_nt = matchinterval; */
1190 #endif
1191 
1192   if ((merstart = querystart - matchinterval) < 0) {
1193     debug3(printf("Quitting because %d - %d < 0\n",querystart,matchinterval));
1194     *nnew = 0;
1195   } else {
1196     debug3(printf("Identifying 5' matches for %d-mer at %d and %d.  maxentries=%d.\n",
1197 		  matchsize,merstart,querystart,maxentries));
1198 
1199     debug3(printf("  Previous status of plus_matchedp at %d is %d\n",merstart,this->plus_matchedp[merstart]));
1200     if (pairedp == true || this->plus_matchedp[merstart] == false) {
1201       matches5 = identify_matches(&nnewplus,&overflowp,matches5,querystart,querylength,
1202 				  this->oligosize,matchinterval,
1203 				  plus_positions,plus_npositions,minus_positions,minus_npositions,
1204 				  chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1205 				  /*forwardp*/true,/*fivep*/true,maxentries);
1206       if (overflowp == false) {
1207 	debug3(printf("  No overflow so setting plus_matchedp at %d to true\n",merstart));
1208 	this->plus_matchedp[merstart] = true;
1209       }
1210     }
1211 
1212     debug3(printf("  Previous status of minus_matchedp at %d is %d\n",merstart,this->minus_matchedp[merstart]));
1213     if (pairedp == true || this->minus_matchedp[merstart] == false) {
1214       matches5 = identify_matches(&nnewminus,&overflowp,matches5,querystart,querylength,
1215 				  this->oligosize,matchinterval,
1216 				  plus_positions,plus_npositions,minus_positions,minus_npositions,
1217 				  chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1218 				  /*forwardp*/false,/*fivep*/true,maxentries);
1219       if (overflowp == false) {
1220 	debug3(printf("  No overflow so setting minus_matchedp at %d to true\n",merstart));
1221 	this->minus_matchedp[merstart] = true;
1222       }
1223     }
1224 
1225     *nnew = nnewplus + nnewminus;
1226   }
1227 
1228   return matches5;
1229 }
1230 
1231 static List_T
find_3prime_matches(int * nnew,List_T matches3,T this,Width_T matchsize,Univcoord_T ** plus_positions,int * plus_npositions,Univcoord_T ** minus_positions,int * minus_npositions,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int queryend,int querylength,int maxentries,bool pairedp)1232 find_3prime_matches (int *nnew, List_T matches3, T this, Width_T matchsize,
1233 		     Univcoord_T **plus_positions, int *plus_npositions,
1234 		     Univcoord_T **minus_positions, int *minus_npositions,
1235 		     Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1236 		     Matchpool_T matchpool, int queryend, int querylength, int maxentries, bool pairedp) {
1237   Width_T merstart;
1238   int nnewplus = 0, nnewminus = 0;
1239   bool overflowp;
1240   Width_T matchinterval;
1241 
1242 #ifdef PMAP
1243   matchinterval = matchsize - index1part_aa;
1244   /* matchinterval_nt = matchinterval*3; */
1245 #else
1246   matchinterval = matchsize - index1part;
1247   /* matchinterval_nt = matchinterval; */
1248 #endif
1249 
1250   if (queryend + matchsize > querylength) {
1251     debug3(printf("Quitting because %d + %d > %d\n",queryend,matchsize,querylength));
1252     *nnew = 0;
1253   } else {
1254     merstart = queryend;
1255     debug3(printf("Identifying 3' matches for %d-mer at %d and %d.  maxentries=%d\n",
1256 		  matchsize,queryend,queryend+matchinterval,maxentries));
1257 
1258     /* If maxentries > 0, then we are working on pairs.  Otherwise, check for a successful previous check */
1259     debug3(printf("  Previous status of plus_matchedp at %d is %d\n",merstart,this->plus_matchedp[merstart]));
1260     if (pairedp == true || this->plus_matchedp[merstart] == false) {
1261 
1262       matches3 = identify_matches(&nnewplus,&overflowp,matches3,queryend,querylength,
1263 				  this->oligosize,matchinterval,
1264 				  plus_positions,plus_npositions,minus_positions,minus_npositions,
1265 				  chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1266 				  /*forwardp*/true,/*fivep*/false,maxentries);
1267       if (overflowp == false) {
1268 	debug3(printf("  No overflow so setting plus_matchedp at %d to true\n",merstart));
1269 	this->plus_matchedp[merstart] = true;
1270       }
1271     }
1272 
1273     debug3(printf("  Previous status of minus_matchedp at %d is %d\n",merstart,this->minus_matchedp[merstart]));
1274     if (pairedp == true || this->minus_matchedp[merstart] == false) {
1275       matches3 = identify_matches(&nnewminus,&overflowp,matches3,queryend,querylength,
1276 				  this->oligosize,matchinterval,
1277 				  plus_positions,plus_npositions,minus_positions,minus_npositions,
1278 				  chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1279 				  /*forwardp*/false,/*fivep*/false,maxentries);
1280       if (overflowp == false) {
1281 	debug3(printf("  No overflow so setting minus_matchedp at %d to true\n",merstart));
1282 	this->minus_matchedp[merstart] = true;
1283       }
1284     }
1285 
1286     *nnew = nnewplus + nnewminus;
1287   }
1288 
1289   return matches3;
1290 }
1291 
1292 
1293 /*
1294 static bool
1295 check_fraction_paired (List_T matches5, List_T matches3) {
1296   List_T p;
1297   int npaired5 = 0, nsingle5 = 0, npaired3 = 0, nsingle3 = 0;
1298   double fracpaired5, fracpaired3;
1299 
1300   for (p = matches5; p != NULL; p = p->rest) {
1301     if (Match_pairedp((Match_T) List_head(p)) == true) {
1302       npaired5++;
1303       } else {
1304 	nsingle5++;
1305       }
1306   }
1307   for (p = matches3; p != NULL; p = p->rest) {
1308     if (Match_pairedp((Match_T) List_head(p)) == true) {
1309       npaired3++;
1310     } else {
1311       nsingle3++;
1312     }
1313   }
1314 
1315   debug(printf("npaired5: %d, nsingle5: %d, npaired3: %d, nsingle3: %d\n",
1316 	       npaired5,nsingle5,npaired3,nsingle3);
1317 	);
1318   if (npaired5+nsingle5 == 0) {
1319     fracpaired5 = (double) (npaired5+1)/(double) (npaired5+nsingle5+1);
1320   } else {
1321     fracpaired5 = (double) npaired5/(double) (npaired5+nsingle5);
1322   }
1323   if (npaired3+nsingle3 == 0) {
1324     fracpaired3 = (double) npaired3/(double) (npaired3+nsingle3+1);
1325   } else {
1326     fracpaired3 = (double) npaired3/(double) (npaired3+nsingle3);
1327   }
1328   if (fracpaired5 > 0.75 || fracpaired3 > 0.75) {
1329     return true;
1330   } else {
1331     return false;
1332   }
1333 }
1334 */
1335 
1336 /* Previously had parameter stage1_stutter_sizelimit */
1337 static List_T
stutter(List_T gregionlist,T this,Width_T matchsize,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int stutterhits)1338 stutter (List_T gregionlist, T this, Width_T matchsize,
1339 	 Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
1340 	 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1341 	 Matchpool_T matchpool, int stutterhits) {
1342   List_T newmatches5 = NULL, newmatches3 = NULL;
1343   int stutterdist5 = 0, stutterdist3 = 0, maxbases, start5, start3;
1344   bool foundpairp;
1345   double n5hits = 0.0, n3hits = 0.0;
1346   int nnew;
1347 #ifdef DEBUG1
1348   int i;
1349 #endif
1350 
1351   start5 = Block_querypos(this->block5);
1352   start3 = Block_querypos(this->block3);
1353 
1354 #if 0
1355   maxbases = stuttercycles*this->matchsize;  /* (matchsize = matchinterval + INDEX1PART_AA/INDEX1PART); */
1356   if (maxbases > (start3 - start5)/2) {
1357     maxbases = (start3 - start5)/2;
1358   }
1359 #else
1360   maxbases = (start3 - start5)/2;
1361 #endif
1362 
1363 
1364   debug(printf("*** Beginning stutter.  maxbases = %d, stutterhits = %d ***\n",maxbases,stutterhits));
1365 
1366   while (Block_next_5(this->block5,genestrand) == true &&
1367 	 stutterdist5 < maxbases && n5hits < stutterhits) {
1368     this->querystart = Block_querypos(this->block5);
1369     if (this->processedp[this->querystart] == false) {
1370       Block_process_oligo_5(&(this->plus_positions[this->querystart]),&(this->plus_npositions[this->querystart]),
1371 			    &(this->minus_positions[this->querystart]),&(this->minus_npositions[this->querystart]),
1372 			    this->block5,indexdb_fwd,indexdb_rev/*,stage1_stutter_sizelimit*/);
1373       debug(printf("stutter: Processing 5' position %d: %d plus, %d minus",
1374 		   this->querystart,this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1375       debug1(
1376 	     for (i = 0; i < this->plus_npositions[this->querystart]; i++) {
1377 	       printf(" +%u",this->plus_positions[this->querystart][i]);
1378 	     }
1379 	     for (i = 0; i < this->minus_npositions[this->querystart]; i++) {
1380 	       printf(" -%u",this->minus_positions[this->querystart][i]);
1381 	     }
1382 	     );
1383       debug(printf("\n"));
1384 
1385       this->processedp[this->querystart] = true;
1386     }
1387     newmatches5 = find_5prime_matches(&nnew,newmatches5,this,matchsize,
1388 				      this->plus_positions,this->plus_npositions,
1389 				      this->minus_positions,this->minus_npositions,
1390 				      chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1391 				      this->querystart,this->querylength,this->maxentries,true);
1392 
1393     stutterdist5 = Block_querypos(this->block5) - start5;
1394     if (nnew > 0) {
1395       n5hits += 1.0/(double) (1 + nnew);
1396       debug(printf("n5hits = %.1f, stutterdist5 = %d\n",n5hits,stutterdist5));
1397     }
1398   }
1399 
1400   while (Block_next_3(this->block3,genestrand) == true &&
1401 	 stutterdist3 < maxbases && n3hits < stutterhits) {
1402     this->queryend = Block_querypos(this->block3);
1403     if (this->processedp[this->queryend] == false) {
1404       Block_process_oligo_3(&(this->plus_positions[this->queryend]),&(this->plus_npositions[this->queryend]),
1405 			    &(this->minus_positions[this->queryend]),&(this->minus_npositions[this->queryend]),
1406 			    this->block3,indexdb_fwd,indexdb_rev/*,stage1_stutter_sizelimit*/);
1407       debug(printf("stutter: Processing 3' position %d: %d plus, %d minus",
1408 		   this->queryend,this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
1409       debug1(
1410 	     for (i = 0; i < this->plus_npositions[this->queryend]; i++) {
1411 	       printf(" +%u",this->plus_positions[this->queryend][i]);
1412 	     }
1413 	     for (i = 0; i < this->minus_npositions[this->queryend]; i++) {
1414 	       printf(" -%u",this->minus_positions[this->queryend][i]);
1415 	     }
1416 	     );
1417       debug(printf("\n"));
1418       this->processedp[this->queryend] = true;
1419     }
1420     newmatches3 = find_3prime_matches(&nnew,newmatches3,this,matchsize,
1421 				      this->plus_positions,this->plus_npositions,
1422 				      this->minus_positions,this->minus_npositions,
1423 				      chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1424 				      this->queryend,this->querylength,this->maxentries,true);
1425 
1426     stutterdist3 = start3 - Block_querypos(this->block3);
1427     if (nnew > 0) {
1428       n3hits += 1.0/(double) (1 + nnew);
1429       debug(printf("n3hits = %.1f, stutterdist3 = %d\n",n3hits,stutterdist3));
1430     }
1431   }
1432 
1433   debug(printf("*** Ending stutter ***\n"));
1434 
1435   gregionlist = pair_up(&foundpairp,gregionlist,matchsize,newmatches5,newmatches3,
1436 			this->matches5,this->matches3,genestrand,chromosome_iit,
1437 			this->querylength,this->maxtotallen,
1438 			this->trimstart,this->trimend,this->trimlength);
1439 
1440   this->matches5 = Matchpool_transfer(this->matches5,newmatches5);
1441   this->matches3 = Matchpool_transfer(this->matches3,newmatches3);
1442 
1443   return gregionlist;
1444 }
1445 
1446 
1447 /* Tries to find matches on the 5' end to unpaired hits from the 3' end */
1448 /* Previously had parameer stage1_fillin_sizelimit */
1449 static List_T
fill_in_5(List_T gregionlist,T this,Width_T matchsize,List_T dangling3,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool)1450 fill_in_5 (List_T gregionlist, T this, Width_T matchsize, List_T dangling3,
1451 	   Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
1452 	   Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1453 	   Matchpool_T matchpool) {
1454   List_T newmatches5 = NULL;
1455   int fillindist5 = 0, maxbases, start5;
1456   bool foundpairp = false;
1457   int nnew;
1458 #ifdef DEBUG1
1459   int i;
1460 #endif
1461 
1462   start5 = Block_querypos(this->block5);
1463   maxbases = MAX_FILL_IN;
1464   if (maxbases > this->querylength/2 - start5) {
1465     maxbases = this->querylength/2 - start5;
1466   }
1467   debug(printf("*** Beginning fill_in_5.  maxbases = %d ***\n",maxbases));
1468 
1469   while (Block_next_5(this->block5,genestrand) == true &&
1470 	 fillindist5 < maxbases && foundpairp == false) {
1471     this->querystart = Block_querypos(this->block5);
1472     if (this->processedp[this->querystart] == false) {
1473       Block_process_oligo_5(&(this->plus_positions[this->querystart]),&(this->plus_npositions[this->querystart]),
1474 			    &(this->minus_positions[this->querystart]),&(this->minus_npositions[this->querystart]),
1475 			    this->block5,indexdb_fwd,indexdb_rev/*,stage1_fillin_sizelimit*/);
1476       debug(printf("fill_in_5: Processing 5' position %d: %d plus, %d minus",
1477 		   this->querystart,this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1478       debug1(
1479 	     for (i = 0; i < this->plus_npositions[this->querystart]; i++) {
1480 	       printf(" +%u",this->plus_positions[this->querystart][i]);
1481 	     }
1482 	     for (i = 0; i < this->minus_npositions[this->querystart]; i++) {
1483 	       printf(" -%u",this->minus_positions[this->querystart][i]);
1484 	     }
1485 	     );
1486       debug(printf("\n"));
1487       this->processedp[this->querystart] = true;
1488     }
1489     newmatches5 = find_5prime_matches(&nnew,newmatches5,this,matchsize,
1490 				      this->plus_positions,this->plus_npositions,
1491 				      this->minus_positions,this->minus_npositions,
1492 				      chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1493 				      this->querystart,this->querylength,this->maxentries,true);
1494     fillindist5 = Block_querypos(this->block5) - start5;
1495 
1496     if (nnew > 0) {
1497       gregionlist = pair_up(&foundpairp,gregionlist,matchsize,
1498 			    newmatches5,/*newmatches3*/(List_T) NULL,
1499 			    (List_T) NULL,dangling3,genestrand,chromosome_iit,
1500 			    this->querylength,this->maxtotallen,
1501 			    this->trimstart,this->trimend,this->trimlength);
1502       debug(printf("   Foundpairp = %d\n",foundpairp));
1503     }
1504   }
1505 
1506   /* Mark newmatches5 as being pairedp if they match non-dangling matches3 */
1507   gregionlist = pair_up(&foundpairp,gregionlist,matchsize,
1508 			newmatches5,/*newmatches3*/(List_T) NULL,
1509 			(List_T) NULL,this->matches3,genestrand,chromosome_iit,
1510 			this->querylength,this->maxtotallen,
1511 			this->trimstart,this->trimend,this->trimlength);
1512 
1513   this->matches5 = Matchpool_transfer(this->matches5,newmatches5);
1514 
1515   debug(printf("*** Ending fill_in_5 ***\n"));
1516   return gregionlist;
1517 }
1518 
1519 
1520 /* Tries to find matches on the 5' end to unpaired hits from the 3' end */
1521 /* Previously had parameer stage1_fillin_sizelimit */
1522 static List_T
fill_in_3(List_T gregionlist,T this,Width_T matchsize,List_T dangling5,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool)1523 fill_in_3 (List_T gregionlist, T this, Width_T matchsize, List_T dangling5,
1524 	   Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
1525 	   Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1526 	   Matchpool_T matchpool) {
1527   List_T newmatches3 = NULL;
1528   int fillindist3 = 0, maxbases, start3;
1529   bool foundpairp = false;
1530   int nnew;
1531 #ifdef DEBUG1
1532   int i;
1533 #endif
1534 
1535   start3 = Block_querypos(this->block3);
1536   maxbases = MAX_FILL_IN;
1537   if (maxbases > start3 - this->querylength/2) {
1538     maxbases = start3 - this->querylength/2;
1539   }
1540   debug(printf("*** Beginning fill_in_3.  maxbases = %d ***\n",maxbases));
1541 
1542   while (Block_next_3(this->block3,genestrand) == true &&
1543 	 fillindist3 < maxbases && foundpairp == false) {
1544     this->queryend = Block_querypos(this->block3);
1545     if (this->processedp[this->queryend] == false) {
1546       Block_process_oligo_3(&(this->plus_positions[this->queryend]),&(this->plus_npositions[this->queryend]),
1547 			    &(this->minus_positions[this->queryend]),&(this->minus_npositions[this->queryend]),
1548 			    this->block3,indexdb_fwd,indexdb_rev/*,stage1_fillin_sizelimit*/);
1549       debug(printf("fill_in_3: Processing 3' position %d: %d plus, %d minus",
1550 		   this->queryend,this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
1551       debug1(
1552 	     for (i = 0; i < this->plus_npositions[this->queryend]; i++) {
1553 	       printf(" +%u",this->plus_positions[this->queryend][i]);
1554 	     }
1555 	     for (i = 0; i < this->minus_npositions[this->queryend]; i++) {
1556 	       printf(" -%u",this->minus_positions[this->queryend][i]);
1557 	     }
1558 	     );
1559       debug(printf("\n"));
1560       this->processedp[this->queryend] = true;
1561     }
1562     newmatches3 = find_3prime_matches(&nnew,newmatches3,this,matchsize,
1563 				      this->plus_positions,this->plus_npositions,
1564 				      this->minus_positions,this->minus_npositions,
1565 				      chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
1566 				      this->queryend,this->querylength,this->maxentries,true);
1567     fillindist3 = start3 - Block_querypos(this->block3);
1568 
1569     if (nnew > 0) {
1570       gregionlist = pair_up(&foundpairp,gregionlist,matchsize,
1571 			    /*newmatches5*/(List_T) NULL,newmatches3,
1572 			    dangling5,(List_T) NULL,genestrand,chromosome_iit,
1573 			    this->querylength,this->maxtotallen,
1574 			    this->trimstart,this->trimend,this->trimlength);
1575       debug(printf("   Foundpairp = %d\n",foundpairp));
1576     }
1577   }
1578 
1579   /* Mark newmatches3 as being pairedp if they match non-dangling matches5 */
1580   gregionlist = pair_up(&foundpairp,gregionlist,matchsize,
1581 			/*newmatches5*/(List_T) NULL,newmatches3,
1582 			this->matches5,(List_T) NULL,genestrand,chromosome_iit,
1583 			this->querylength,this->maxtotallen,
1584 			this->trimstart,this->trimend,this->trimlength);
1585 
1586   this->matches3 = Matchpool_transfer(this->matches3,newmatches3);
1587 
1588   debug(printf("*** Ending fill_in_3 ***\n"));
1589   return gregionlist;
1590 }
1591 
1592 
1593 #if 0
1594 static void
1595 sample (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int nskip) {
1596   int mod5 = 0, mod3 = 0;
1597 #ifdef DEBUG1
1598   int i;
1599 #endif
1600 
1601   Block_restore(this->block5);
1602   Block_restore(this->block3);
1603 
1604   while (Block_next_5(this->block5,genestrand) == true) {
1605     this->querystart = Block_querypos(this->block5);
1606     if (this->processedp[this->querystart] == false) {
1607       Block_process_oligo_5(&(this->plus_positions[this->querystart]),&(this->plus_npositions[this->querystart]),
1608 			    &(this->minus_positions[this->querystart]),&(this->minus_npositions[this->querystart]),
1609 			    this->block5,indexdb_fwd,indexdb_rev/*,stage1_sample_sizelimit*/);
1610       debug(printf("sample: Processing 5' position %d: %d plus, %d minus",
1611 		   this->querystart,this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1612       debug1(
1613 	     for (i = 0; i < this->plus_npositions[this->querystart]; i++) {
1614 	       printf(" +%u",this->plus_positions[this->querystart][i]);
1615 	     }
1616 	     for (i = 0; i < this->minus_npositions[this->querystart]; i++) {
1617 	       printf(" -%u",this->minus_positions[this->querystart][i]);
1618 	     }
1619 	     );
1620       debug(printf("\n"));
1621       this->processedp[this->querystart] = true;
1622 
1623       if (++mod5 % SAMPLERUN == 0) {
1624 	mod5 = 0;
1625 	Block_skip(this->block5,nskip);
1626       }
1627     }
1628   }
1629 
1630   while (Block_next_3(this->block3,genestrand) == true) {
1631     this->queryend = Block_querypos(this->block3);
1632     if (this->processedp[this->queryend] == false) {
1633       Block_process_oligo_3(&(this->plus_positions[this->queryend]),&(this->plus_npositions[this->queryend]),
1634 			    &(this->minus_positions[this->queryend]),&(this->minus_npositions[this->queryend]),
1635 			    this->block3,indexdb_fwd,indexdb_rev/*,stage1_sample_sizelimit*/);
1636       debug(printf("sample: Processing 3' position %d: %d plus, %d minus",
1637 		   this->queryend,this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
1638       debug1(
1639 	     for (i = 0; i < this->plus_npositions[this->queryend]; i++) {
1640 	       printf(" +%u",this->plus_positions[this->queryend][i]);
1641 	     }
1642 	     for (i = 0; i < this->minus_npositions[this->queryend]; i++) {
1643 	       printf(" -%u",this->minus_positions[this->queryend][i]);
1644 	     }
1645 	     );
1646       debug(printf("\n"));
1647       this->processedp[this->queryend] = true;
1648 
1649       if (++mod3 % SAMPLERUN == 0) {
1650 	mod3 = 0;
1651 	Block_skip(this->block3,nskip);
1652       }
1653     }
1654   }
1655 
1656   return;
1657 }
1658 #endif
1659 
1660 
1661 static Univcoord_T *
find_range(int ** querypositions,int * ninrange,int starti,int endi,Univcoord_T ** positions,int * npositions,Univcoord_T leftbound,Univcoord_T rightbound)1662 find_range (int **querypositions, int *ninrange, int starti, int endi,
1663 	    Univcoord_T **positions, int *npositions, Univcoord_T leftbound, Univcoord_T rightbound) {
1664   Univcoord_T *range;
1665   int i;
1666   int querypos;
1667 
1668   if (rightbound < leftbound) {
1669     abort();
1670   }
1671 
1672   *ninrange = 0;
1673   for (querypos = starti; *ninrange < MAX_NINRANGE && querypos <= endi; querypos++) {
1674     i = binary_search(0,npositions[querypos],positions[querypos],leftbound);
1675     while (i < npositions[querypos] && positions[querypos][i] < rightbound) {
1676       debug2(printf("At querypos %d, found position %u in (%u,%u)\n",querypos,positions[querypos][i],leftbound,rightbound));
1677       (*ninrange)++;
1678       i++;
1679     }
1680   }
1681 
1682   if (*ninrange == 0) {
1683     *querypositions = (int *) NULL;
1684     return (Univcoord_T *) NULL;
1685   } else {
1686     *querypositions = (int *) CALLOC(*ninrange,sizeof(int));
1687     range = (Univcoord_T *) CALLOC(*ninrange,sizeof(Univcoord_T));
1688   }
1689 
1690   *ninrange = 0;
1691   for (querypos = starti; *ninrange < MAX_NINRANGE && querypos <= endi; querypos++) {
1692     i = binary_search(0,npositions[querypos],positions[querypos],leftbound);
1693     while (i < npositions[querypos] && positions[querypos][i] < rightbound) {
1694       (*querypositions)[*ninrange] = querypos;
1695       range[*ninrange] = positions[querypos][i];
1696       (*ninrange)++;
1697       i++;
1698     }
1699   }
1700 
1701   return range;
1702 }
1703 
1704 static void
find_extensions(Univcoord_T * extension5,Univcoord_T * extension3,T this,Gregion_T gregion,Chrpos_T maxtotallen,bool continuousp)1705 find_extensions (Univcoord_T *extension5, Univcoord_T *extension3, T this,
1706 		 Gregion_T gregion, Chrpos_T maxtotallen, bool continuousp) {
1707   int *querypositions, querystart, queryend, ninrange, i, j, lastj;
1708   Univcoord_T *range, leftbound, rightbound, best_start, best_end, expectedi, expectedj;
1709   int best_concentration, concentration;
1710   Chrpos_T maxintronlen5, maxintronlen3;
1711 #ifdef DEBUG2
1712   int best_querystart, best_queryend;
1713 #endif
1714 
1715   querystart = Gregion_querystart(gregion);
1716   queryend = Gregion_queryend(gregion);
1717   debug2(printf("Entering find_extensions with querystart = %d, queryend = %d, querylength %d\n",
1718 		querystart,queryend,this->querylength));
1719 
1720   debug2(printf("continuousp = %d, this->trimlength %d (vs %d), querystart %d (vs %d)\n",
1721 		continuousp,this->trimlength,SINGLEEXONLENGTH,querystart,NOEXTENDLEN));
1722   if (continuousp == true || this->trimlength < SINGLEEXONLENGTH || querystart < NOEXTENDLEN) {
1723     maxintronlen5 = querystart + 20;
1724   } else {
1725     maxintronlen5 = maxextension;
1726   }
1727 
1728   debug2(printf("continuousp = %d, this->trimlength %d (vs %d), this->trimlength - queryend %d (vs %d)\n",
1729 		continuousp,this->trimlength,SINGLEEXONLENGTH,this->trimlength - queryend,NOEXTENDLEN));
1730   if (continuousp == true || this->trimlength < SINGLEEXONLENGTH || this->trimlength - queryend < NOEXTENDLEN) {
1731     maxintronlen3 = this->querylength - queryend + 20;
1732   } else {
1733     maxintronlen3 = maxextension;
1734   }
1735   debug2(printf("Set maxintronlen5 = %u, maxintronlen3 = %u\n",maxintronlen5,maxintronlen3));
1736 
1737   if (Gregion_plusp(gregion) == true) {
1738     rightbound = Gregion_genomicstart(gregion); /* was Match_position(match5)  */
1739     if (rightbound < maxintronlen5) {
1740       leftbound = 0U;
1741     } else {
1742       leftbound = rightbound - maxintronlen5;
1743     }
1744     range = find_range(&querypositions,&ninrange,0,querystart-1,
1745 		       this->plus_positions,this->plus_npositions,
1746 		       leftbound,rightbound);
1747 
1748     best_concentration = 0;
1749     best_start = Gregion_genomicstart(gregion);	/* was Match_position(match5) */
1750     debug2(best_querystart = querystart);
1751     for (i = 0; i < ninrange; i++) {
1752       debug2(printf("Looking at range %d = %u@%d (distance = %u)\n",
1753 		    i,range[i],querypositions[i],Gregion_genomicstart(gregion)-range[i]));
1754       if (Gregion_genomicstart(gregion) > range[i] + maxtotallen) {
1755 	debug2(printf("  => Too far away, so skipping\n"));
1756       } else {
1757 	concentration = 1;
1758 	lastj = i;
1759 	for (j = i+1; j < ninrange; j++) {
1760 	  debug2(printf("  %u@%d",range[j],querypositions[j]));
1761 	  expectedj = range[i] + querypositions[j] - querypositions[i];
1762 	  if (range[j] + 20 > expectedj && range[j] < expectedj + 20) {
1763 	    concentration++;
1764 	    lastj = j;
1765 	    debug2(printf("*"));
1766 	  }
1767 	}
1768 	debug2(printf("\nConcentration is %d\n\n",concentration));
1769 	if (concentration > best_concentration ||
1770 	    (concentration == best_concentration && range[i] > best_start)) {
1771 	  best_concentration = concentration;
1772 	  best_start = range[i];
1773 	  debug2(best_querystart = querypositions[i]);
1774 	}
1775       }
1776     }
1777     FREE(querypositions);
1778     FREE(range);
1779 
1780     *extension5 = Gregion_genomicstart(gregion) - best_start;  /* was Match_position(match5) */
1781 
1782   } else {
1783     leftbound = Gregion_genomicend(gregion); /* was Match_position(match5) */
1784     rightbound = leftbound + maxintronlen5;
1785     range = find_range(&querypositions,&ninrange,0,querystart-1,
1786 		       this->minus_positions,this->minus_npositions,
1787 		       leftbound,rightbound);
1788 
1789     best_concentration = 0;
1790     best_start = Gregion_genomicend(gregion); /* was Match_position(match5) */
1791     debug2(best_querystart = querystart);
1792     for (i = 0; i < ninrange; i++) {
1793       debug2(printf("Looking at range %d = %u@%d (distance = %u)\n",
1794 		    i,range[i],querypositions[i],range[i]-Gregion_genomicend(gregion)));
1795       if (range[i] > Gregion_genomicend(gregion) + maxtotallen) {
1796 	debug2(printf("  => Too far away, so skipping\n"));
1797       } else {
1798 	concentration = 1;
1799 	lastj = i;
1800 	for (j = i+1; j < ninrange; j++) {
1801 	  debug2(printf("  %u@%d",range[j],querypositions[j]));
1802 	  expectedi = range[j] + querypositions[j] - querypositions[i];
1803 	  if (range[i] + 20 > expectedi && range[i] < expectedi + 20) {
1804 	    concentration++;
1805 	    lastj = j;
1806 	    debug2(printf("*"));
1807 	  }
1808 	}
1809 	debug2(printf("\nConcentration is %d\n\n",concentration));
1810 	if (concentration > best_concentration ||
1811 	  (concentration == best_concentration && range[i] < best_start)) {
1812 	  best_concentration = concentration;
1813 	  best_start = range[i];
1814 	  debug2(best_querystart = querypositions[i]);
1815 	}
1816       }
1817     }
1818     FREE(querypositions);
1819     FREE(range);
1820 
1821     *extension5 = best_start - Gregion_genomicend(gregion); /* was - Match_position(match5) */
1822   }
1823 
1824   debug2(printf("5' extension, from positions, best_querystart = %d\n",best_querystart));
1825   debug2(printf("5' extension, From positions, extension5 = %d\n",*extension5));
1826 
1827   if (Gregion_plusp(gregion) == true) {
1828     leftbound = Gregion_genomicend(gregion); /* was Match_position(match3) */
1829     rightbound = leftbound + maxintronlen3;
1830     range = find_range(&querypositions,&ninrange,queryend+this->oligosize+1,this->querylength-1,
1831 		       this->plus_positions,this->plus_npositions,
1832 		       leftbound,rightbound);
1833 
1834     best_concentration = 0;
1835     best_end = Gregion_genomicend(gregion);  /* was Match_position(match3); */
1836     debug2(best_queryend = queryend);
1837     for (i = 0; i < ninrange; i++) {
1838       debug2(printf("Looking at range %d = %u@%d (distance = %u)\n",
1839 		    i,range[i],querypositions[i],range[i]-Gregion_genomicend(gregion)));
1840       if (range[i] > Gregion_genomicend(gregion) + maxtotallen) {
1841 	debug2(printf("  => Too far away, so skipping\n"));
1842       } else {
1843 	concentration = 1;
1844 	lastj = i;
1845 	for (j = i+1; j < ninrange; j++) {
1846 	  debug2(printf("  %u@%d",range[j],querypositions[j]));
1847 	  expectedj = range[i] + querypositions[j] - querypositions[i];
1848 	  if (range[j] + 20 > expectedj && range[j] < expectedj + 20) {
1849 	    concentration++;
1850 	    lastj = j;
1851 	    debug2(printf("*"));
1852 	  }
1853 	}
1854 	debug2(printf("\nConcentration is %d\n\n",concentration));
1855 	if (concentration > best_concentration ||
1856 	    (concentration == best_concentration && range[lastj] < best_end)) {
1857 	  best_concentration = concentration;
1858 	  best_end = range[lastj];
1859 	  debug2(best_queryend = querypositions[lastj]);
1860 	}
1861       }
1862     }
1863     FREE(querypositions);
1864     FREE(range);
1865 
1866     *extension3 = best_end - Gregion_genomicend(gregion); /* was - Match_position(match3); */
1867 
1868   } else {
1869     rightbound = Gregion_genomicstart(gregion);  /* was Match_position(match3); */
1870     if (rightbound < maxintronlen3) {
1871       leftbound = 0U;
1872     } else {
1873       leftbound = rightbound - maxintronlen3;
1874     }
1875     range = find_range(&querypositions,&ninrange,queryend+this->oligosize+1,this->querylength-1,
1876 		       this->minus_positions,this->minus_npositions,
1877 		       leftbound,rightbound);
1878 
1879     best_concentration = 0;
1880     best_end = Gregion_genomicstart(gregion);  /* was Match_position(match3); */
1881     debug2(best_queryend = queryend);
1882     for (i = 0; i < ninrange; i++) {
1883       debug2(printf("Looking at range %d = %u@%d (distance = %u)\n",
1884 		    i,range[i],querypositions[i],Gregion_genomicstart(gregion)-range[i]));
1885       if (Gregion_genomicstart(gregion) > range[i] + maxtotallen) {
1886 	debug2(printf("  => Too far away, so skipping\n"));
1887       } else {
1888 	concentration = 1;
1889 	lastj = i;
1890 	for (j = i+1; j < ninrange; j++) {
1891 	  debug2(printf("  %u@%d",range[j],querypositions[j]));
1892 	  expectedi = range[j] + querypositions[j] - querypositions[i];
1893 	  if (range[i] + 20 > expectedi && range[i] < expectedi + 20) {
1894 	    concentration++;
1895 	    lastj = j;
1896 	    debug2(printf("*"));
1897 	  }
1898 	}
1899 	debug2(printf("\nConcentration is %d\n\n",concentration));
1900 	if (concentration > best_concentration ||
1901 	    (concentration == best_concentration && range[lastj] > best_end)) {
1902 	  best_concentration = concentration;
1903 	  best_end = range[lastj];
1904 	  debug2(best_queryend = querypositions[lastj]);
1905 	}
1906       }
1907     }
1908     FREE(querypositions);
1909     FREE(range);
1910 
1911     *extension3 = Gregion_genomicstart(gregion) - best_end;  /* was Match_position(match3) */
1912   }
1913 
1914   debug2(printf("3' extension, from positions, best_queryend = %d\n",best_queryend));
1915   debug2(printf("3' extension, from positions, extension3 = %d\n",*extension3));
1916 
1917   return;
1918 }
1919 
1920 
1921 /* Previously had parameter stage1_firstpair_sizelimit */
1922 static List_T
find_first_pair(bool * foundpairp,List_T gregionlist,T this,Width_T matchsize,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool)1923 find_first_pair (bool *foundpairp, List_T gregionlist, T this, Width_T matchsize,
1924 		 Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
1925 		 Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
1926 		 Matchpool_T matchpool) {
1927   List_T newmatches5 = NULL, newmatches3 = NULL;
1928   bool donep = false;
1929   int nnew;
1930   double n5hits = 0.0, n3hits = 0.0;
1931   int nblocks5 = 0, nblocks3 = 0;
1932 #ifdef DEBUG1
1933   int i;
1934 #endif
1935 
1936   Block_reset_ends(this->block5);
1937   Block_reset_ends(this->block3);
1938 
1939   debug(printf("*** Starting stage1 with matchsize %d ***\n",matchsize));
1940   debug(printf("    querystart = %d, queryend = %d\n",Block_querypos(this->block5),Block_querypos(this->block3)));
1941 
1942   *foundpairp = false;
1943   while (!donep && (*foundpairp) == false) {
1944     if (n5hits <= n3hits) {
1945       if (Block_next_5(this->block5,genestrand) == false) {
1946 	donep = true;
1947       } else {
1948 	this->querystart = Block_querypos(this->block5);
1949 	if (this->processedp[this->querystart] == false) {
1950 	  Block_process_oligo_5(&(this->plus_positions[this->querystart]),&(this->plus_npositions[this->querystart]),
1951 				&(this->minus_positions[this->querystart]),&(this->minus_npositions[this->querystart]),
1952 				this->block5,indexdb_fwd,indexdb_rev/*,stage1_firstpair_sizelimit*/);
1953 #ifdef PMAP
1954 	  debug(printf("find_first_pair: Processing 5' position %d (aaindex %u): %d plus, %d minus",
1955 		       this->querystart,Block_aaindex(this->block5),
1956 		       this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1957 #else
1958 	  debug(printf("find_first_pair: Processing 5' position %d (forward %06X, revcomp %06X): %d plus, %d minus",
1959 		       this->querystart,Block_forward(this->block5),Block_revcomp(this->block5),
1960 		       this->plus_npositions[this->querystart],this->minus_npositions[this->querystart]));
1961 #endif
1962 	  debug1(
1963 		 for (i = 0; i < this->plus_npositions[this->querystart]; i++) {
1964 		   printf(" +%u",this->plus_positions[this->querystart][i]);
1965 		 }
1966 		 for (i = 0; i < this->minus_npositions[this->querystart]; i++) {
1967 		   printf(" -%u",this->minus_positions[this->querystart][i]);
1968 		 }
1969 		 );
1970 	  debug(printf("\n"));
1971 	  this->processedp[this->querystart] = true;
1972 	  nblocks5++;
1973 	}
1974 	newmatches5 = find_5prime_matches(&nnew,NULL,this,matchsize,
1975 					  this->plus_positions,this->plus_npositions,
1976 					  this->minus_positions,this->minus_npositions,chromosome_iit,
1977 					  chrsubset_start,chrsubset_end,matchpool,this->querystart,
1978 					  this->querylength,this->maxentries,true);
1979 	if (nnew > 0) {
1980 	  n5hits += 1.0/(double) (1 + nnew);
1981 	  debug(printf("    n5hits: %.1f, n3hits: %.1f\n",n5hits,n3hits));
1982 	  gregionlist = pair_up(&(*foundpairp),gregionlist,matchsize,
1983 				newmatches5,/*newmatches3*/NULL,
1984 				this->matches5,this->matches3,genestrand,chromosome_iit,
1985 				this->querylength,this->maxtotallen,
1986 				this->trimstart,this->trimend,this->trimlength);
1987 	  this->matches5 = Matchpool_transfer(this->matches5,newmatches5);
1988 	  newmatches5 = NULL;
1989 	}
1990       }
1991 
1992     } else {
1993       if (Block_next_3(this->block3,genestrand) == false) {
1994 	donep = true;
1995       } else {
1996 	this->queryend = Block_querypos(this->block3);
1997 	if (this->processedp[this->queryend] == false) {
1998 	  Block_process_oligo_3(&(this->plus_positions[this->queryend]),&(this->plus_npositions[this->queryend]),
1999 				&(this->minus_positions[this->queryend]),&(this->minus_npositions[this->queryend]),
2000 				this->block3,indexdb_fwd,indexdb_rev/*,stage1_firstpair_sizelimit*/);
2001 #ifdef PMAP
2002 	  debug(printf("find_first_pair: Processing 3' position %d (aaindex %u): %d plus, %d minus",
2003 		       this->queryend,Block_aaindex(this->block3),
2004 		       this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
2005 #else
2006 	  debug(printf("find_first_pair: Processing 3' position %d (forward %06X, revcomp %06X): %d plus, %d minus",
2007 		       this->queryend,Block_forward(this->block3),Block_revcomp(this->block3),
2008 		       this->plus_npositions[this->queryend],this->minus_npositions[this->queryend]));
2009 #endif
2010 	  debug1(
2011 		 for (i = 0; i < this->plus_npositions[this->queryend]; i++) {
2012 		   printf(" +%u",this->plus_positions[this->queryend][i]);
2013 		 }
2014 		 for (i = 0; i < this->minus_npositions[this->queryend]; i++) {
2015 		   printf(" -%u",this->minus_positions[this->queryend][i]);
2016 		 }
2017 		 );
2018 	  debug(printf("\n"));
2019 	  this->processedp[this->queryend] = true;
2020 	  nblocks3++;
2021 	}
2022 	newmatches3 = find_3prime_matches(&nnew,NULL,this,matchsize,
2023 					  this->plus_positions,this->plus_npositions,
2024 					  this->minus_positions,this->minus_npositions,
2025 					  chromosome_iit,chrsubset_start,chrsubset_end,matchpool,
2026 					  this->queryend,this->querylength,this->maxentries,true);
2027 	if (nnew > 0) {
2028 	  n3hits += 1.0/(double) (1 + nnew);
2029 	  debug(printf("    n5hits: %.1f, n3hits: %.1f\n",n5hits,n3hits));
2030 	  gregionlist = pair_up(&(*foundpairp),gregionlist,matchsize,
2031 				/*newmatches5*/NULL,newmatches3,
2032 				this->matches5,this->matches3,genestrand,chromosome_iit,
2033 				this->querylength,this->maxtotallen,
2034 				this->trimstart,this->trimend,this->trimlength);
2035 	  this->matches3 = Matchpool_transfer(this->matches3,newmatches3);
2036 	  newmatches3 = NULL;
2037 	}
2038       }
2039     }
2040   }
2041 
2042   return gregionlist;
2043 }
2044 
2045 
2046 /************************************************************************/
2047 
2048 
2049 /* A dangling match is one that has not been paired up with a match
2050    from the other end of the cDNA sequence.  A significant fraction of
2051    dangling matches may indicate the need to continue finding more
2052    matches from the other end.  Conversely, the absence of dangling
2053    matches on both ends indicates that no other possibilities exist in
2054    the genome. */
2055 
2056 static int
count_dangling(List_T matches)2057 count_dangling (List_T matches) {
2058   int ndangling = 0;
2059   Match_T match;
2060   List_T p;
2061 
2062   for (p = matches; p != NULL; p = p->rest) {
2063     match = (Match_T) p->first;
2064     if (Match_npairings(match) == 0) {
2065       ndangling++;
2066     }
2067   }
2068   return ndangling;
2069 }
2070 
2071 static double
dangling_pct(List_T matches)2072 dangling_pct (List_T matches) {
2073   double ndangling = 0.0, denominator = 0.0;
2074   Match_T match;
2075   List_T p;
2076   bool weightp = false;
2077 
2078   for (p = matches; p != NULL; p = p->rest) {
2079     match = (Match_T) p->first;
2080     if (Match_npairings(match) == 0) {
2081       ndangling += Match_weight(match);
2082     }
2083     if (Match_has_weight_p(match) == true) {
2084       denominator += Match_weight(match);
2085       weightp = true;
2086     }
2087   }
2088   if (weightp == false) {
2089     return 0.0;
2090   } else {
2091     debug(printf("(%f/%f) ",ndangling,denominator));
2092     return ndangling/denominator;
2093   }
2094 }
2095 
2096 static List_T
get_dangling(List_T matches,Matchpool_T matchpool)2097 get_dangling (List_T matches, Matchpool_T matchpool) {
2098   List_T dangling = NULL, p;
2099   Match_T match;
2100 
2101   for (p = matches; p != NULL; p = p->rest) {
2102     match = (Match_T) p->first;
2103     if (Match_npairings(match) == 0) {
2104       dangling = Matchpool_push_existing(dangling,matchpool,match);
2105     }
2106   }
2107   return dangling;
2108 }
2109 
2110 
2111 /************************************************************************
2112  *   Procedures for solving using a heap
2113  ************************************************************************/
2114 
2115 static void
read_oligos(T this,Sequence_T queryuc,int genestrand)2116 read_oligos (T this, Sequence_T queryuc, int genestrand) {
2117   Reader_T reader;
2118   int querypos;
2119   Oligostate_T last_state = INIT;
2120 #ifdef PMAP
2121   Oligospace_T aaindex = 0U;
2122 #else
2123   Oligospace_T forward = 0U, revcomp = 0U;
2124 #endif
2125 
2126   this->validp = (bool *) CALLOC(this->querylength,sizeof(bool));
2127 #ifdef PMAP
2128   this->oligos = (Oligospace_T *) CALLOC(this->querylength,sizeof(Oligospace_T));
2129   reader = Reader_new(Sequence_fullpointer(queryuc),/*querystart*/0,/*queryend*/this->querylength);
2130   /* Prevents us from processing invalid query 12-mers */
2131   for (querypos = 0; querypos <= this->querylength - index1part_aa; querypos++) {
2132     this->validp[querypos] = false;
2133   }
2134 #else
2135   this->forward_oligos = (Oligospace_T *) CALLOC(this->querylength,sizeof(Oligospace_T));
2136   this->revcomp_oligos = (Oligospace_T *) CALLOC(this->querylength,sizeof(Oligospace_T));
2137   reader = Reader_new(Sequence_fullpointer(queryuc),/*querystart*/0,/*queryend*/this->querylength);
2138   debug(printf("oligobase is %d, oligobase_mask is %08X\n",index1part,oligobase_mask));
2139   /* Prevents us from processing invalid query 12-mers */
2140   for (querypos = 0; querypos <= this->querylength - index1part; querypos++) {
2141     this->validp[querypos] = false;
2142   }
2143 #endif
2144 
2145   /* Note: leftshifting is done here, rather than in Oligo_lookup */
2146   while ((last_state = Oligo_next_5(last_state,&querypos,
2147 #ifdef PMAP
2148 				    &aaindex,
2149 #else
2150 				    &forward,&revcomp,
2151 #endif
2152 				    reader,genestrand)) != DONE) {
2153 
2154     this->validp[querypos] = true;
2155 #ifdef PMAP
2156     this->oligos[querypos] = aaindex;
2157 #else
2158     this->forward_oligos[querypos] = forward & oligobase_mask;
2159     this->revcomp_oligos[querypos] = (revcomp >> leftreadshift) & oligobase_mask;
2160 #endif
2161 
2162 #if 0
2163     if (this->forward_oligos[querypos] == 0U || this->revcomp_oligos[querypos] == 0U) {
2164       debug(printf("Position %d is poly-A or poly-T\n",querypos));
2165       this->polyat[querypos] = true;
2166     }
2167 #endif
2168 
2169   }
2170 
2171   Reader_free(&reader);
2172   return;
2173 }
2174 
2175 
2176 struct Rep_T {
2177   Oligospace_T oligo;
2178   int querypos;
2179 };
2180 
2181 static int
Rep_compare(const void * a,const void * b)2182 Rep_compare (const void *a, const void *b) {
2183   struct Rep_T x = * (struct Rep_T *) a;
2184   struct Rep_T y = * (struct Rep_T *) b;
2185 
2186   if (x.oligo < y.oligo) {
2187     return -1;
2188   } else if (y.oligo < x.oligo) {
2189     return +1;
2190   } else if (x.querypos < y.querypos) {
2191     return -1;
2192   } else if (y.querypos < x.querypos) {
2193     return +1;
2194   } else {
2195     return 0;
2196   }
2197 }
2198 
2199 static void
identify_repeated_oligos(T this,int oligobase,int querylength)2200 identify_repeated_oligos (T this, int oligobase, int querylength) {
2201   struct Rep_T *reps;
2202   int querypos, i, j, k, n = 0, min, max;
2203 
2204   debug(printf("Starting identify_repeated_oligos\n"));
2205 
2206   if (querylength < MAX_QUERYLENGTH_STACK) {
2207     reps = (struct Rep_T *) MALLOCA(this->querylength * sizeof(struct Rep_T));
2208   } else {
2209     reps = (struct Rep_T *) MALLOC(this->querylength * sizeof(struct Rep_T));
2210   }
2211 
2212   for (querypos = 0; querypos <= this->querylength - oligobase; querypos++) {
2213     if (this->validp[querypos] == true) {
2214 #ifdef PMAP
2215       reps[n].oligo = this->oligos[querypos];
2216 #else
2217       reps[n].oligo = this->forward_oligos[querypos];
2218 #endif
2219       reps[n].querypos = querypos;
2220       n++;
2221     }
2222   }
2223 
2224   qsort(reps,n,sizeof(struct Rep_T),Rep_compare);
2225 #if 0
2226   /* Test for thread safety */
2227   for (i = 0; i < n-1; i++) {
2228     if (Rep_compare(&(reps[i]),&(reps[i+1])) > 0) {
2229       abort();
2230     }
2231   }
2232 #endif
2233 
2234   for (i = 0, j = 1; j < n; i++, j++) {
2235     if (reps[j].oligo == reps[i].oligo) {
2236 #ifdef PMAP
2237       debug(printf("Found repetition of oligo %u at %d and %d\n",
2238 		   reps[i].oligo,reps[i].querypos,reps[j].querypos));
2239 #else
2240       debug(printf("Found repetition of oligo %06X at %d and %d\n",
2241 		   reps[i].oligo,reps[i].querypos,reps[j].querypos));
2242 #endif
2243 
2244       if (reps[j].querypos - reps[i].querypos <= MIN_REPEAT) {
2245 #if 0
2246 	this->validp[reps[j].querypos] = false;
2247 	this->validp[reps[i].querypos] = false;
2248 	this->processedp[reps[j].querypos] = true;
2249 	this->processedp[reps[i].querypos] = true;
2250 #else
2251 	/* Wiping out everything around the repeated oligos */
2252 	if ((min = reps[i].querypos - oligobase) < 0) {
2253 	  min = 0;
2254 	}
2255 	if ((max = reps[i].querypos + oligobase) > querylength) {
2256 	  max = querylength;
2257 	}
2258 	for (k = min; k < max; k++) {
2259 	  this->validp[k] = false;
2260 	  this->processedp[k] = true;
2261 	}
2262 
2263 	if ((min = reps[j].querypos - oligobase) < 0) {
2264 	  min = 0;
2265 	}
2266 	if ((max = reps[j].querypos + oligobase) > querylength) {
2267 	  max = querylength;
2268 	}
2269 	for (k = min; k < max; k++) {
2270 	  this->validp[k] = false;
2271 	  this->processedp[k] = true;
2272 	}
2273 #endif
2274 
2275       }
2276     }
2277   }
2278 
2279   if (querylength < MAX_QUERYLENGTH_STACK) {
2280     FREEA(reps);
2281   } else {
2282     FREE(reps);
2283   }
2284 
2285   return;
2286 }
2287 
2288 
2289 #if 0
2290 static void
2291 check_old_new (Univcoord_T *old_positions, int old_npositions, Univcoord_T *new_positions, int new_npositions) {
2292   int i;
2293 
2294   if (new_npositions != old_npositions) {
2295     abort();
2296   } else {
2297     for (i = 0; i < new_npositions; i++) {
2298       if (new_positions[i] != old_positions[i]) {
2299 	abort();
2300       }
2301     }
2302   }
2303   return;
2304 }
2305 #endif
2306 
2307 
2308 /* Whether to use indexdb_size_threshold */
2309 /* #define SIZELIMIT 1 */
2310 
2311 #ifdef PMAP
2312 static void
sample_oligos(T this,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int querylength,int oligobase)2313 sample_oligos (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase) {
2314   int querypos;
2315 
2316   for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2317 #if 0
2318     if (this->validp[querypos] == true && this->processedp[querypos] == true) {
2319       printf("sample_oligos: Querypos %d, oligo is %06X\n",querypos,this->oligos[querypos]);
2320       plus_positions = Indexdb_read_with_diagterm(&plus_npositions,indexdb_fwd,this->oligos[querypos],
2321 						  /*diagterm*/querylength-querypos);
2322       check_old_new(this->plus_positions[querypos],this->plus_npositions[querypos],plus_positions,plus_npositions);
2323 
2324       minus_positions = Indexdb_read_with_diagterm(&minus_npositions,indexdb_rev,this->oligos[querypos],
2325 						   /*diagterm*/querypos + index1part_aa*3);
2326       check_old_new(this->minus_positions[querypos],this->minus_npositions[querypos],minus_positions,minus_npositions);
2327 
2328     }
2329 #endif
2330 
2331     /* FORMULA */
2332     if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2333       this->plus_positions[querypos] =
2334 	Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->oligos[querypos],
2335 				   /*diagterm*/querylength-querypos);
2336       this->minus_positions[querypos] =
2337 	Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->oligos[querypos],
2338 				   /*diagterm*/querypos + index1part_aa*3);
2339       debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2340 		   querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2341       this->processedp[querypos] = true;
2342     }
2343   }
2344 
2345   return;
2346 }
2347 
2348 /* This procedure is equivalent to sample_oligos */
2349 static void
sample_oligos_nolimit(T this,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int querylength,int oligobase)2350 sample_oligos_nolimit (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase) {
2351   int querypos;
2352 
2353   for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2354     /* FORMULA */
2355     if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2356       this->plus_positions[querypos] =
2357 	Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->oligos[querypos],
2358 				   /*diagterm*/querylength-querypos);
2359 
2360       this->minus_positions[querypos] =
2361 	Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->oligos[querypos],
2362 				   /*diagterm*/querypos + index1part_aa*3);
2363 
2364       debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2365 		   querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2366       this->processedp[querypos] = true;
2367     }
2368   }
2369 
2370   return;
2371 }
2372 
2373 #else
2374 
2375 #if 0
2376 /* Needed to limit number of nregions */
2377 static void
2378 sample_oligos_sizelimit (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase,
2379 			 int stage1_sample_sizelimit) {
2380   int querypos;
2381 
2382   for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2383     /* FORMULA */
2384     if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2385       this->plus_positions[querypos] =
2386 	Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->forward_oligos[querypos],
2387 				   /*diagterm*/querylength-querypos);
2388       this->minus_positions[querypos] =
2389 	Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->revcomp_oligos[querypos],
2390 				   /*diagterm*/querypos + index1part);
2391       debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2392 		   querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2393       this->processedp[querypos] = true;
2394     }
2395   }
2396 
2397   return;
2398 }
2399 #endif
2400 
2401 
2402 static void
sample_oligos_nolimit(T this,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int querylength,int oligobase)2403 sample_oligos_nolimit (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase) {
2404   int querypos;
2405 
2406   for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2407     /* FORMULA */
2408     if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2409       this->plus_positions[querypos] =
2410 	Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->forward_oligos[querypos],
2411 				   /*diagterm*/querylength-querypos);
2412 
2413       this->minus_positions[querypos] =
2414 	Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->revcomp_oligos[querypos],
2415 				   /*diagterm*/querypos + index1part);
2416 
2417       debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2418 		   querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2419       this->processedp[querypos] = true;
2420     }
2421   }
2422 
2423   return;
2424 }
2425 #endif
2426 
2427 
2428 #if 0
2429 static void
2430 sample_oligos_findlimit (T this, Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int querylength, int oligobase) {
2431   int querypos;
2432 
2433   int *counts_plus, *counts_minus, threshold;
2434   int n;
2435 
2436   n = querylength-oligobase;
2437   counts_plus = (int *) CALLOC(n,sizeof(int));
2438   counts_minus = (int *) CALLOC(n,sizeof(int));
2439 
2440   for (querypos = 0; querypos < querylength - oligobase; querypos++) {
2441     /* FORMULA */
2442     if (this->validp[querypos] == true && this->processedp[querypos] == false) {
2443       this->plus_positions[querypos] =
2444 	Indexdb_read_with_diagterm(&(this->plus_npositions[querypos]),indexdb_fwd,this->forward_oligos[querypos],
2445 				   /*diagterm*/querylength-querypos);
2446       counts_plus[querypos] = this->plus_npositions[querypos];
2447 
2448       this->minus_positions[querypos] =
2449 	Indexdb_read_with_diagterm(&(this->minus_npositions[querypos]),indexdb_rev,this->revcomp_oligos[querypos],
2450 				   /*diagterm*/querypos + index1part);
2451       counts_minus[querypos] = this->minus_npositions[querypos];
2452 
2453       debug(printf("Sampling at querypos %d, plus_npositions = %d, minus_npositions = %d\n",
2454 		   querypos,this->plus_npositions[querypos],this->minus_npositions[querypos]));
2455       this->processedp[querypos] = true;
2456     }
2457   }
2458 
2459   /* Compute own size thresholds here */
2460   threshold = Orderstat_int_pct_inplace(counts_plus,n,/*percentile*/0.90);
2461   for (querypos = 0; querypos < n; querypos++) {
2462     if (counts_plus[querypos] > threshold) {
2463       this->plus_npositions[querypos] = 0;
2464     }
2465   }
2466 
2467   threshold = Orderstat_int_pct_inplace(counts_minus,n,/*percentile*/0.90);
2468   for (querypos = 0; querypos < n; querypos++) {
2469     if (counts_minus[querypos] > threshold) {
2470       this->minus_npositions[querypos] = 0;
2471     }
2472   }
2473 
2474   return;
2475 }
2476 #endif
2477 
2478 
2479 
2480 
2481 #define PARENT(i) (i >> 1)
2482 #define LEFT(i) (i << 1)
2483 #define RIGHT(i) ((i << 1) | 1)
2484 
2485 
2486 typedef struct Batch_T *Batch_T;
2487 struct Batch_T {
2488   int querypos;
2489   int ndiagonals;
2490   Univcoord_T diagonal;
2491   Univcoord_T *diagonals;
2492 };
2493 
2494 static void
Batch_init(Batch_T batch,int querypos,Univcoord_T * diagonals,int ndiagonals)2495 Batch_init (Batch_T batch, int querypos, Univcoord_T *diagonals, int ndiagonals) {
2496 
2497   batch->querypos = querypos;
2498   batch->diagonals = diagonals;
2499   batch->diagonal = *diagonals;
2500   batch->ndiagonals = ndiagonals;
2501 
2502   return;
2503 }
2504 
2505 
2506 static void
min_heap_insert(Batch_T * heap,int * heapsize,Batch_T batch)2507 min_heap_insert (Batch_T *heap, int *heapsize, Batch_T batch) {
2508   int querypos, i;
2509   Univcoord_T diagonal;
2510 
2511   /* debug0(printf("Inserting into heap: diagonal %u at querypos %d\n",diagonal,querypos)); */
2512 
2513   querypos = batch->querypos;
2514   diagonal = batch->diagonal;
2515 
2516   i = ++(*heapsize);
2517   /* sort primarily by diagonal, then by querypos */
2518   while (i > 1 && (heap[PARENT(i)]->diagonal > diagonal ||
2519 		   (heap[PARENT(i)]->diagonal == diagonal && heap[PARENT(i)]->querypos > querypos))) {
2520     heap[i] = heap[PARENT(i)];
2521     i = PARENT(i);
2522   }
2523   heap[i] = batch;
2524 
2525   return;
2526 }
2527 
2528 /* A diagonal segment */
2529 typedef struct Segment_T *Segment_T;
2530 struct Segment_T {
2531   Univcoord_T diagonal;
2532   int chrnum;
2533   int querypos5;
2534   int querypos3;
2535   int median;
2536   int score;
2537   int noligomers;	    /* Needed to speed up single_mismatches */
2538 };
2539 
2540 
2541 
2542 /* Reduces diagonals to handle small indels */
2543 static void
collapse_diagonals(Univcoord_T ** diagonals,int * ndiagonals,int oligobase,int querylength)2544 collapse_diagonals (Univcoord_T **diagonals, int *ndiagonals, int oligobase, int querylength) {
2545   Batch_T batch, *heap, sentinel;
2546   struct Batch_T sentinel_struct, *batchpool;
2547   int maxsegments, heapsize = 0, i;
2548   int parenti, smallesti, righti;
2549   int querypos;
2550   Univcoord_T diagonal, base_diagonal, last_diagonal;
2551 #ifdef DEBUG6
2552   int j;
2553 #endif
2554 
2555   debug6(printf("*** Starting collapse_diagonals ***\n"));
2556 
2557   /* Set up batches */
2558   maxsegments = 0;
2559 
2560   /* Batch_T can be large, so don't use alloca */
2561   batchpool = (struct Batch_T *) MALLOC((querylength-oligobase+1) * sizeof(struct Batch_T));
2562   heap = (Batch_T *) MALLOC((2*querylength+1+1) * sizeof(Batch_T));
2563 
2564   for (querypos = 0, i = 0; querypos <= querylength - oligobase; querypos++) {
2565     if (ndiagonals[querypos] > 0) {
2566       maxsegments += ndiagonals[querypos];
2567 #ifdef PMAP
2568       debug6(printf("Adding batch for querypos %d with %d diagonals and aaindex %u\n",
2569 		    querypos,ndiagonals[querypos],oligos[querypos]));
2570 #else
2571       debug6(printf("Adding batch for querypos %d with %d diagonals and oligo %06X\n",
2572 		    querypos,ndiagonals[querypos],oligos[querypos]));
2573 #endif
2574       batch = &(batchpool[i++]);
2575       Batch_init(batch,querypos,diagonals[querypos],ndiagonals[querypos]);
2576       min_heap_insert(heap,&heapsize,batch);
2577     }
2578   }
2579 
2580   if (maxsegments == 0) {
2581     FREE(heap);
2582     FREE(batchpool);
2583     return;
2584   }
2585 
2586   /* Set up rest of heap */
2587   sentinel_struct.querypos = querylength; /* infinity */
2588   sentinel_struct.diagonal = (Univcoord_T) -1; /* infinity */
2589   sentinel = &sentinel_struct;
2590   for (i = heapsize+1; i <= 2*heapsize+1; i++) {
2591     heap[i] = sentinel;
2592   }
2593 
2594 
2595   /* Initialize loop */
2596   batch = heap[1];
2597   base_diagonal = last_diagonal = diagonal = batch->diagonal;
2598   debug6(printf("diagonal %u\n",diagonal));
2599 
2600   if (--batch->ndiagonals <= 0) {
2601     /* Use last entry in heap for insertion */
2602     batch = heap[heapsize];
2603     heap[heapsize--] = sentinel;
2604 
2605   } else {
2606     /* Use this batch for insertion (same querypos) */
2607     batch->diagonal = *(++batch->diagonals);
2608   }
2609 
2610   /* heapify */
2611   diagonal = batch->diagonal;
2612   parenti = 1;
2613   smallesti = 2 + (heap[3]->diagonal < heap[2]->diagonal);
2614   /* Note that diagonal will never exceed a sentinel diagonal */
2615   while (diagonal > heap[smallesti]->diagonal) {
2616     heap[parenti] = heap[smallesti];
2617     parenti = smallesti;
2618     smallesti = LEFT(parenti);
2619     righti = smallesti+1;
2620     smallesti += (heap[righti]->diagonal < heap[smallesti]->diagonal);
2621   }
2622   heap[parenti] = batch;
2623 
2624 
2625   /* Continue after initialization */
2626   while (heapsize > 0) {
2627     batch = heap[1];
2628     diagonal = batch->diagonal;
2629     debug6(printf("diagonal %u\n",diagonal));
2630 
2631 #if 0
2632     /* Compare against last_diagonal, not base_diagonal to allow a chain */
2633     if (diagonal <= last_diagonal + COLLAPSE_DISTANCE) {
2634       /* Compress.  Modifies diagonals. */
2635       debug6(printf("  collapsing to %u\n",base_diagonal));
2636       *batch->diagonals = base_diagonal;
2637     } else {
2638       /* New base diagonal */
2639       base_diagonal = diagonal;
2640     }
2641     last_diagonal = diagonal;
2642 #else
2643     /* Compare against base_diagonal to ignore chains */
2644     if (diagonal == base_diagonal) {
2645       /* Do nothing */
2646     } else if (diagonal <= base_diagonal + COLLAPSE_DISTANCE) {
2647       /* Compress.  Modifies diagonals. */
2648       debug6(printf("  collapsing to %u\n",base_diagonal));
2649       *batch->diagonals = base_diagonal;
2650     } else {
2651       /* New base diagonal */
2652       base_diagonal = diagonal;
2653     }
2654 #endif
2655 
2656     if (--batch->ndiagonals <= 0) {
2657       /* Use last entry in heap for insertion */
2658       batch = heap[heapsize];
2659       heap[heapsize--] = sentinel;
2660 
2661     } else {
2662       /* Use this batch for insertion (same querypos) */
2663       batch->diagonal = *(++batch->diagonals);
2664     }
2665 
2666     /* heapify */
2667     diagonal = batch->diagonal;
2668     parenti = 1;
2669     smallesti = 2 + (heap[3]->diagonal < heap[2]->diagonal);
2670     /* Note that diagonal/querypos will never exceed a sentinel diagonal/querypos */
2671     while (diagonal > heap[smallesti]->diagonal) {
2672       heap[parenti] = heap[smallesti];
2673       parenti = smallesti;
2674       smallesti = LEFT(parenti);
2675       righti = smallesti+1;
2676       smallesti += (heap[righti]->diagonal < heap[smallesti]->diagonal);
2677     }
2678     heap[parenti] = batch;
2679 
2680   }
2681 
2682   /* Terminate loop. */
2683   FREE(heap);
2684   FREE(batchpool);
2685 
2686   return;
2687 }
2688 
2689 
2690 static struct Segment_T *
find_segments(int * nsegments,Univcoord_T ** diagonals,int * ndiagonals,int oligobase,int querylength,int threshold_score)2691 find_segments (int *nsegments, Univcoord_T **diagonals, int *ndiagonals,
2692 	       int oligobase, int querylength, int threshold_score) {
2693   struct Segment_T *segments, *ptr;
2694   Batch_T batch, *heap, sentinel;
2695   struct Batch_T sentinel_struct, *batchpool;
2696   int maxsegments, heapsize = 0, i;
2697   int parenti, smallesti, righti;
2698   int querypos, last_querypos;
2699   Univcoord_T diagonal, last_diagonal;
2700   int *ordered;
2701 #ifdef DEBUG6
2702   int j;
2703 #endif
2704 
2705   debug6(printf("*** Starting find_segments ***\n"));
2706 
2707   /* Set up batches */
2708   maxsegments = 0;
2709 
2710   /* Batch_T can be large, so don't use alloca */
2711   batchpool = (struct Batch_T *) MALLOC((querylength-oligobase+1) * sizeof(struct Batch_T));
2712   heap = (Batch_T *) MALLOC((2*querylength+1+1) * sizeof(Batch_T));
2713 
2714   for (querypos = 0, i = 0; querypos <= querylength - oligobase; querypos++) {
2715     if (ndiagonals[querypos] > 0) {
2716       maxsegments += ndiagonals[querypos];
2717 #ifdef PMAP
2718       debug6(printf("Adding batch for querypos %d with %d diagonals and aaindex %u\n",
2719 		    querypos,ndiagonals[querypos],oligos[querypos]));
2720 #else
2721       debug6(printf("Adding batch for querypos %d with %d diagonals and oligo %06X\n",
2722 		    querypos,ndiagonals[querypos],oligos[querypos]));
2723 #endif
2724       batch = &(batchpool[i++]);
2725       Batch_init(batch,querypos,diagonals[querypos],ndiagonals[querypos]);
2726       min_heap_insert(heap,&heapsize,batch);
2727     }
2728   }
2729 
2730   if (maxsegments == 0) {
2731     FREE(heap);
2732     FREE(batchpool);
2733     *nsegments = 0;
2734     return (struct Segment_T *) NULL;
2735   } else {
2736     ptr = segments = (struct Segment_T *) CALLOC(maxsegments,sizeof(struct Segment_T));
2737     ordered = (int *) CALLOC(i,sizeof(int));
2738   }
2739 
2740   /* Set up rest of heap */
2741   sentinel_struct.querypos = querylength; /* infinity */
2742   sentinel_struct.diagonal = (Univcoord_T) -1; /* infinity */
2743   sentinel = &sentinel_struct;
2744   for (i = heapsize+1; i <= 2*heapsize+1; i++) {
2745     heap[i] = sentinel;
2746   }
2747 
2748 
2749   /* Initialize loop */
2750   batch = heap[1];
2751   last_querypos = querypos = batch->querypos;
2752   last_diagonal = diagonal = batch->diagonal;
2753   debug6(printf("diagonal %u, querypos %d\n",diagonal,querypos));
2754 #ifdef PMAP
2755   ptr->score = 3*oligobase;
2756 #else
2757   ptr->score = oligobase;
2758 #endif
2759   ptr->querypos5 = ptr->querypos3 = querypos;
2760   ordered[0] = querypos;
2761   ptr->noligomers = 1;
2762 
2763   if (--batch->ndiagonals <= 0) {
2764     /* Use last entry in heap for insertion */
2765     batch = heap[heapsize];
2766     querypos = batch->querypos;
2767     heap[heapsize--] = sentinel;
2768 
2769   } else {
2770     /* Use this batch for insertion (same querypos) */
2771     batch->diagonal = *(++batch->diagonals);
2772   }
2773 
2774   /* heapify */
2775   diagonal = batch->diagonal;
2776   parenti = 1;
2777   smallesti = 2 + ((heap[3]->diagonal < heap[2]->diagonal) |
2778 		   ((heap[3]->diagonal == heap[2]->diagonal) &
2779 		    (heap[3]->querypos < heap[2]->querypos)));
2780   /* Note that diagonal/querypos will never exceed a sentinel diagonal/querypos */
2781   while (diagonal > heap[smallesti]->diagonal ||
2782 	 (diagonal == heap[smallesti]->diagonal &&
2783 	  querypos > heap[smallesti]->querypos)) {
2784     heap[parenti] = heap[smallesti];
2785     parenti = smallesti;
2786     smallesti = LEFT(parenti);
2787     righti = smallesti+1;
2788     smallesti += ((heap[righti]->diagonal < heap[smallesti]->diagonal) |
2789 		  ((heap[righti]->diagonal == heap[smallesti]->diagonal) &
2790 		   (heap[righti]->querypos < heap[smallesti]->querypos)));
2791   }
2792   heap[parenti] = batch;
2793 
2794 
2795   /* Continue after initializaation */
2796   while (heapsize > 0) {
2797     batch = heap[1];
2798     querypos = batch->querypos;
2799     diagonal = batch->diagonal;
2800     debug6(printf("diagonal %u, querypos %d\n",diagonal,querypos));
2801 
2802     if (diagonal == last_diagonal) {
2803       /* Continuation of exact match or substitution */
2804 #ifdef PMAP
2805       if (querypos <= last_querypos + oligobase) {
2806 	ptr->score += 3*(querypos - last_querypos);
2807       } else {
2808 	ptr->score += 3*oligobase;
2809       }
2810 #else
2811       if (querypos <= last_querypos + oligobase) {
2812 	ptr->score += querypos - last_querypos;
2813       } else {
2814 	ptr->score += oligobase;
2815       }
2816 #endif
2817 
2818       if (querypos > last_querypos) {
2819 	ordered[ptr->noligomers++] = querypos;
2820       }
2821       ptr->querypos3 = querypos;
2822 
2823     } else {
2824       /* End of diagonal */
2825       ptr->diagonal = last_diagonal;
2826 
2827       if (ptr->score >= threshold_score) {
2828 	/* The above test allows shorter matches on either end to find splicing, and requires longer matches internally */
2829 	debug6(printf("  => %c, score %d, noligomers %d, querypos %d..%d, median %d, position %u..%u\n",
2830 		      plusp ? '+' : '-',ptr->score,ptr->noligomers,ptr->querypos5,ptr->querypos3,
2831 		      (ordered[ptr->noligomers/2] + ordered[ptr->noligomers/2 - ((ptr->noligomers+1) % 2)])/2,
2832 		      plusp ? ptr->diagonal + ptr->querypos5 - querylength : ptr->diagonal - ptr->querypos5,
2833 		      plusp ? ptr->diagonal + ptr->querypos3 - querylength : ptr->diagonal - ptr->querypos3));
2834 	ptr->median = (ordered[ptr->noligomers/2] + ordered[ptr->noligomers/2 - ((ptr->noligomers+1) % 2)])/2;
2835 	ptr->querypos3 += oligobase;
2836 	ptr++;
2837       }
2838 
2839       ptr->querypos5 = ptr->querypos3 = querypos;
2840       ordered[0] = querypos;
2841       last_diagonal = diagonal;
2842 
2843 #ifdef PMAP
2844       ptr->score = 3*oligobase;
2845 #else
2846       ptr->score = oligobase;
2847 #endif
2848       ptr->noligomers = 1;
2849     }
2850     last_querypos = querypos;
2851 
2852 
2853     if (--batch->ndiagonals <= 0) {
2854       /* Use last entry in heap for insertion */
2855       batch = heap[heapsize];
2856       querypos = batch->querypos;
2857       heap[heapsize--] = sentinel;
2858 
2859     } else {
2860       /* Use this batch for insertion (same querypos) */
2861       batch->diagonal = *(++batch->diagonals);
2862     }
2863 
2864     /* heapify */
2865     diagonal = batch->diagonal;
2866     parenti = 1;
2867     debug7(printf("Comparing left %d/right %d: %u @ %d and %u @ %d\n",
2868 		  2,3,heap[3]->diagonal,heap[3]->querypos,heap[4]->diagonal,heap[4]->querypos));
2869     smallesti = 2 + ((heap[3]->diagonal < heap[2]->diagonal) |
2870 		     ((heap[3]->diagonal == heap[2]->diagonal) &
2871 		      (heap[3]->querypos < heap[2]->querypos)));
2872     /* Note that diagonal/querypos will never exceed a sentinel diagonal/querypos */
2873     while (diagonal > heap[smallesti]->diagonal ||
2874 	   (diagonal == heap[smallesti]->diagonal &&
2875 	    querypos > heap[smallesti]->querypos)) {
2876       heap[parenti] = heap[smallesti];
2877       parenti = smallesti;
2878       smallesti = LEFT(parenti);
2879       righti = smallesti+1;
2880       debug7(printf("Comparing left %d/right %d (heapsize %d): %u @ %d and %u @ %d\n",
2881 		    righti,smallesti,heapsize,heap[righti]->diagonal,
2882 		    heap[righti]->querypos,heap[smallesti]->diagonal,
2883 		    heap[smallesti]->querypos));
2884       smallesti += ((heap[righti]->diagonal < heap[smallesti]->diagonal) |
2885 		    ((heap[righti]->diagonal == heap[smallesti]->diagonal) &
2886 		     (heap[righti]->querypos < heap[smallesti]->querypos)));
2887     }
2888     heap[parenti] = batch;
2889 
2890   }
2891 
2892   /* Terminate loop. */
2893   ptr->diagonal = last_diagonal;
2894   if (ptr->score >= threshold_score) {
2895     /* The above test allows shorter matches on either end to find splicing, and requires longer matches internally */
2896     debug6(printf("  => %c, score %d, noligomers %d, querypos %d..%d, median %d, position %u..%u\n",
2897 		  plusp ? '+' : '-',ptr->score,ptr->noligomers,ptr->querypos5,ptr->querypos3,
2898 		  (ordered[ptr->noligomers/2] + ordered[ptr->noligomers/2 - ((ptr->noligomers+1) % 2)])/2,
2899 		  plusp ? ptr->diagonal + ptr->querypos5 - querylength : ptr->diagonal - ptr->querypos5,
2900 		  plusp ? ptr->diagonal + ptr->querypos3 - querylength : ptr->diagonal - ptr->querypos3));
2901     ptr->median = (ordered[ptr->noligomers/2] + ordered[ptr->noligomers/2 - ((ptr->noligomers+1) % 2)])/2;
2902     ptr->querypos3 += oligobase;
2903     ptr++;			/* Needed to get correct value for nsegments below */
2904   }
2905 
2906   FREE(heap);
2907   FREE(batchpool);
2908 
2909   *nsegments = ptr - segments;
2910   debug6(for (j = 0; j < *nsegments; j++) {
2911 	   printf("diagonal %u\tquerypos %d..%d\tscore %d\tnoligomers %d\n",
2912 		  segments[j].diagonal,segments[j].querypos5,segments[j].querypos3,
2913 		  segments[j].score,segments[j].noligomers);
2914 	   if (segments[j].querypos3 < segments[j].querypos5) {
2915 	     abort();
2916 	   }
2917 	 });
2918 
2919   FREE(ordered);
2920   return segments;
2921 }
2922 
2923 
2924 static void
compute_paths(int ** prev,int ** scores,struct Segment_T * segments,int nsegments,int maxexons,bool plusp)2925 compute_paths (int **prev, int **scores, struct Segment_T *segments, int nsegments, int maxexons, bool plusp) {
2926   int *bestscore, score, intronadj, overlapadj;
2927   struct Segment_T *segmentj;
2928   int k, j, i, *besti;
2929   int medianj;
2930   Univcoord_T diagonalj, intronlength;
2931 
2932   if (nsegments == 0) {
2933     return;
2934   } else {
2935     for (i = 1; i <= maxexons; i++) {
2936       prev[i] = (int *) CALLOC(nsegments,sizeof(int)); /* Return value */
2937       scores[i] = (int *) CALLOC(nsegments,sizeof(int)); /* Return value */
2938     }
2939     bestscore = (int *) MALLOCA((maxexons+1) * sizeof(int));
2940     besti = (int *) MALLOCA((maxexons+1) * sizeof(int));
2941   }
2942 
2943   if (plusp == true) {
2944     for (j = 0; j < nsegments; j++) {
2945       segmentj = &(segments[j]);
2946       diagonalj = segmentj->diagonal;
2947       medianj = segmentj->median;
2948 
2949       /* k = 1 */
2950       scores[1][j] = segmentj->score;
2951       prev[1][j] = -1;
2952 
2953       for (k = 2; k <= maxexons; k++) {
2954 	bestscore[k] = 0;
2955 	besti[k] = -1;
2956       }
2957 
2958       i = j-1;
2959       while (i >= 0 && segments[i].diagonal + HIGHINTRONLENGTH > diagonalj) {
2960 	/* for plus, median should be ascending */
2961 	if (segments[i].median < medianj) {
2962 	  intronlength = diagonalj - segments[i].diagonal;
2963 	  intronadj = intronlength/INTRONLEN_PENALTY;
2964 	  overlapadj = 0;
2965 	  if (segmentj->querypos5 < segments[i].querypos3) {
2966 #ifdef PMAP
2967 	    overlapadj = 3*(segments[i].querypos3 - segmentj->querypos5);
2968 #else
2969 	    overlapadj = (segments[i].querypos3 - segmentj->querypos5);
2970 #endif
2971 	  }
2972 
2973 	  if (segmentj->score > intronadj + overlapadj) {
2974 	    /* Require that score be monotonically increasing */
2975 	    for (k = 2; k <= maxexons; k++) {
2976 	      if ((score = scores[k-1][i] - intronadj - overlapadj) > bestscore[k]) {
2977 		bestscore[k] = score;
2978 		besti[k] = i;
2979 	      }
2980 	    }
2981 
2982 	    /* For k = maxexons, also check against the same level */
2983 	    if ((score = scores[maxexons][i] - intronadj - overlapadj) > bestscore[maxexons]) {
2984 	      bestscore[maxexons] = score;
2985 	      besti[maxexons] = i;
2986 	    }
2987 	  }
2988 
2989 	}
2990 	i--;
2991       }
2992 
2993       for (k = 2; k <= maxexons; k++) {
2994 	scores[k][j] = bestscore[k] + segmentj->score;
2995 	prev[k][j] = besti[k];
2996 	debug8(printf("+#%d, nexons %d: diagonal %u, querypos %d..%d, median %d, score %u from previous #%d\n",
2997 		      j,k,diagonalj,segmentj->querypos5,segmentj->querypos3,medianj,scores[k][j],besti[k]));
2998       }
2999     }
3000 
3001   } else {
3002     for (j = 0; j < nsegments; j++) {
3003       segmentj = &(segments[j]);
3004       diagonalj = segmentj->diagonal;
3005       medianj = segmentj->median;
3006 
3007       /* k = 1 */
3008       scores[1][j] = segmentj->score;
3009       prev[1][j] = -1;
3010 
3011       for (k = 2; k <= maxexons; k++) {
3012 	bestscore[k] = 0;
3013 	besti[k] = -1;
3014       }
3015 
3016       i = j-1;
3017       while (i >= 0 && segments[i].diagonal + HIGHINTRONLENGTH > diagonalj) {
3018 	/* for minus, median should be descending */
3019 	if (segments[i].median > medianj) {
3020 	  intronlength = diagonalj - segments[i].diagonal;
3021 	  intronadj = intronlength/INTRONLEN_PENALTY;
3022 	  overlapadj = 0;
3023 	  if (segments[i].querypos5 < segmentj->querypos3) {
3024 #ifdef PMAP
3025 	    overlapadj = 3*(segmentj->querypos3 - segments[i].querypos5);
3026 #else
3027 	    overlapadj = (segmentj->querypos3 - segments[i].querypos5);
3028 #endif
3029 	  }
3030 
3031 	  if (segmentj->score > intronadj + overlapadj) {
3032 	    /* Require that score be monotonically increasing */
3033 	    for (k = 2; k <= maxexons; k++) {
3034 	      if ((score = scores[k-1][i] - intronadj - overlapadj) > bestscore[k]) {
3035 		bestscore[k] = score;
3036 		besti[k] = i;
3037 	      }
3038 	    }
3039 
3040 	    /* For k = maxexons, also check against the same level */
3041 	    if ((score = scores[maxexons][i] - intronadj - overlapadj) > bestscore[maxexons]) {
3042 	      bestscore[maxexons] = score;
3043 	      besti[maxexons] = i;
3044 	    }
3045 	  }
3046 
3047 	}
3048 	i--;
3049       }
3050 
3051       for (k = 2; k <= maxexons; k++) {
3052 	scores[k][j] = bestscore[k] + segmentj->score;
3053 	prev[k][j] = besti[k];
3054 	debug8(printf("-#%d, nexons %d: diagonal %u, querypos %d..%d, median %d, score %u from previous #%d\n",
3055 		      j,k,diagonalj,segmentj->querypos5,segmentj->querypos3,medianj,scores[k][j],besti[k]));
3056       }
3057     }
3058   }
3059 
3060   FREEA(besti);
3061   FREEA(bestscore);
3062 
3063   return;
3064 }
3065 
3066 
3067 static void
find_best_scores(int * nthbest,int * bestscores,int ** plus_scores,int plus_nscores,int ** minus_scores,int minus_nscores,int maxexons,int n)3068 find_best_scores (int *nthbest, int *bestscores, int **plus_scores, int plus_nscores,
3069 		  int **minus_scores, int minus_nscores, int maxexons,
3070 		  int n) {
3071   int ninserted = 0, k, j, i;
3072   int *heap, newscore;
3073   int parenti, smallesti, righti;
3074 
3075   memset(bestscores,0,(maxexons+1) * sizeof(int));
3076   memset(nthbest,0,(maxexons+1) * sizeof(int));
3077 
3078   heap = (int *) MALLOCA((2*n+1+1) * sizeof(int));
3079 
3080   for (k = 1; k <= maxexons; k++) {
3081     /* Initialize heap */
3082     heap[1] = 0;
3083     for (i = 2; i < 2*n+1+1; i++) {
3084       heap[i] = 1000000000;
3085     }
3086 
3087     /* plus */
3088     for (j = 0; j < plus_nscores; j++) {
3089       if ((newscore = plus_scores[k][j]) > heap[1]) {
3090 	if (ninserted < n) {
3091 	  /* put new score at bottom and heapify up */
3092 	  i = ++ninserted;
3093 	  while (i > 1 && heap[PARENT(i)] > newscore) {
3094 	    heap[i] = heap[PARENT(i)];
3095 	    i = PARENT(i);
3096 	  }
3097 	  heap[i] = newscore;
3098 
3099 	} else {
3100 	  /* replace nth largest with new score and heapify down */
3101 	  parenti = 1;
3102 	  smallesti = 2 + (heap[3] < heap[2]);
3103 	  while (newscore > heap[smallesti]) {
3104 	    heap[parenti] = heap[smallesti];
3105 	    parenti = smallesti;
3106 	    smallesti = LEFT(parenti);
3107 	    righti = smallesti+1;
3108 	    smallesti += heap[righti] < heap[smallesti];
3109 	  }
3110 	  heap[parenti] = newscore;
3111 	}
3112       }
3113     }
3114 
3115     /* minus */
3116     for (j = 0; j < minus_nscores; j++) {
3117       if ((newscore = minus_scores[k][j]) > heap[1]) {
3118 	if (ninserted < n) {
3119 	  /* put new score at bottom and heapify up */
3120 	  i = ++ninserted;
3121 	  while (i > 1 && heap[PARENT(i)] > newscore) {
3122 	    heap[i] = heap[PARENT(i)];
3123 	    i = PARENT(i);
3124 	  }
3125 	  heap[i] = newscore;
3126 
3127 	} else {
3128 	  /* replace nth largest with new score and heapify down */
3129 	  parenti = 1;
3130 	  smallesti = 2 + (heap[3] < heap[2]);
3131 	  while (newscore > heap[smallesti]) {
3132 	    heap[parenti] = heap[smallesti];
3133 	    parenti = smallesti;
3134 	    smallesti = LEFT(parenti);
3135 	    righti = smallesti+1;
3136 	    smallesti += heap[righti] < heap[smallesti];
3137 	  }
3138 	  heap[parenti] = newscore;
3139 	}
3140       }
3141     }
3142 
3143     nthbest[k] = heap[1];
3144     bestscores[k] = 0;
3145     for (j = 1; j <= ninserted; j++) {
3146       if (heap[j] > bestscores[k]) {
3147 	bestscores[k] = heap[j];
3148       }
3149     }
3150     debug8(printf("For %d exons, best score is %d, %dth best is %d\n",bestscores[k],n,nthbest[k]));
3151   }
3152 
3153   FREEA(heap);
3154   return;
3155 }
3156 
3157 
3158 static void
find_best_scores_nonstranded(int * nthbest,int * bestscores,int ** plus_scores_fwd,int plus_nscores_fwd,int ** minus_scores_fwd,int minus_nscores_fwd,int ** plus_scores_rev,int plus_nscores_rev,int ** minus_scores_rev,int minus_nscores_rev,int maxexons,int n)3159 find_best_scores_nonstranded (int *nthbest, int *bestscores, int **plus_scores_fwd, int plus_nscores_fwd,
3160 			      int **minus_scores_fwd, int minus_nscores_fwd,
3161 			      int **plus_scores_rev, int plus_nscores_rev,
3162 			      int **minus_scores_rev, int minus_nscores_rev,
3163 			      int maxexons, int n) {
3164   int ninserted = 0, k, j, i;
3165   int *heap, newscore;
3166   int parenti, smallesti, righti;
3167 
3168   memset(bestscores,0,(maxexons+1) * sizeof(int));
3169   memset(nthbest,0,(maxexons+1) * sizeof(int));
3170   heap = (int *) MALLOCA((2*n+1+1) * sizeof(int));
3171 
3172   for (k = 1; k <= maxexons; k++) {
3173     /* Initialize heap */
3174     heap[1] = 0;
3175     for (i = 2; i < 2*n+1+1; i++) {
3176       heap[i] = 1000000000;
3177     }
3178 
3179     /* plus_fwd */
3180     for (j = 0; j < plus_nscores_fwd; j++) {
3181       if ((newscore = plus_scores_fwd[k][j]) > heap[1]) {
3182 	if (ninserted < n) {
3183 	  /* put new score at bottom and heapify up */
3184 	  i = ++ninserted;
3185 	  while (i > 1 && heap[PARENT(i)] > newscore) {
3186 	    heap[i] = heap[PARENT(i)];
3187 	    i = PARENT(i);
3188 	  }
3189 	  heap[i] = newscore;
3190 
3191 	} else {
3192 	  /* replace nth largest with new score and heapify down */
3193 	  parenti = 1;
3194 	  smallesti = 2 + (heap[3] < heap[2]);
3195 	  while (newscore > heap[smallesti]) {
3196 	    heap[parenti] = heap[smallesti];
3197 	    parenti = smallesti;
3198 	    smallesti = LEFT(parenti);
3199 	    righti = smallesti+1;
3200 	    smallesti += heap[righti] < heap[smallesti];
3201 	  }
3202 	  heap[parenti] = newscore;
3203 	}
3204       }
3205     }
3206 
3207     /* minus_fwd */
3208     for (j = 0; j < minus_nscores_fwd; j++) {
3209       if ((newscore = minus_scores_fwd[k][j]) > heap[1]) {
3210 	if (ninserted < n) {
3211 	  /* put new score at bottom and heapify up */
3212 	  i = ++ninserted;
3213 	  while (i > 1 && heap[PARENT(i)] > newscore) {
3214 	    heap[i] = heap[PARENT(i)];
3215 	    i = PARENT(i);
3216 	  }
3217 	  heap[i] = newscore;
3218 
3219 	} else {
3220 	  /* replace nth largest with new score and heapify down */
3221 	  parenti = 1;
3222 	  smallesti = 2 + (heap[3] < heap[2]);
3223 	  while (newscore > heap[smallesti]) {
3224 	    heap[parenti] = heap[smallesti];
3225 	    parenti = smallesti;
3226 	    smallesti = LEFT(parenti);
3227 	    righti = smallesti+1;
3228 	    smallesti += heap[righti] < heap[smallesti];
3229 	  }
3230 	  heap[parenti] = newscore;
3231 	}
3232       }
3233     }
3234 
3235     /* plus_rev */
3236     for (j = 0; j < plus_nscores_rev; j++) {
3237       if ((newscore = plus_scores_rev[k][j]) > heap[1]) {
3238 	if (ninserted < n) {
3239 	  /* put new score at bottom and heapify up */
3240 	  i = ++ninserted;
3241 	  while (i > 1 && heap[PARENT(i)] > newscore) {
3242 	    heap[i] = heap[PARENT(i)];
3243 	    i = PARENT(i);
3244 	  }
3245 	  heap[i] = newscore;
3246 
3247 	} else {
3248 	  /* replace nth largest with new score and heapify down */
3249 	  parenti = 1;
3250 	  smallesti = 2 + (heap[3] < heap[2]);
3251 	  while (newscore > heap[smallesti]) {
3252 	    heap[parenti] = heap[smallesti];
3253 	    parenti = smallesti;
3254 	    smallesti = LEFT(parenti);
3255 	    righti = smallesti+1;
3256 	    smallesti += heap[righti] < heap[smallesti];
3257 	  }
3258 	  heap[parenti] = newscore;
3259 	}
3260       }
3261     }
3262 
3263     /* minus_rev */
3264     for (j = 0; j < minus_nscores_rev; j++) {
3265       if ((newscore = minus_scores_rev[k][j]) > heap[1]) {
3266 	if (ninserted < n) {
3267 	  /* put new score at bottom and heapify up */
3268 	  i = ++ninserted;
3269 	  while (i > 1 && heap[PARENT(i)] > newscore) {
3270 	    heap[i] = heap[PARENT(i)];
3271 	    i = PARENT(i);
3272 	  }
3273 	  heap[i] = newscore;
3274 
3275 	} else {
3276 	  /* replace nth largest with new score and heapify down */
3277 	  parenti = 1;
3278 	  smallesti = 2 + (heap[3] < heap[2]);
3279 	  while (newscore > heap[smallesti]) {
3280 	    heap[parenti] = heap[smallesti];
3281 	    parenti = smallesti;
3282 	    smallesti = LEFT(parenti);
3283 	    righti = smallesti+1;
3284 	    smallesti += heap[righti] < heap[smallesti];
3285 	  }
3286 	  heap[parenti] = newscore;
3287 	}
3288       }
3289     }
3290 
3291     nthbest[k] = heap[1];
3292     bestscores[k] = 0;
3293     for (j = 1; j <= ninserted; j++) {
3294       if (heap[j] > bestscores[k]) {
3295 	bestscores[k] = heap[j];
3296       }
3297     }
3298     debug8(printf("For %d exons, best score is %d, %dth best is %d\n",bestscores[k],n,(*nthbest)[k]));
3299   }
3300 
3301   FREEA(heap);
3302   return;
3303 }
3304 
3305 
3306 
3307 static List_T
find_good_paths(List_T gregionlist,int nexons,int * prev,int * scores,struct Segment_T * segments,int nsegments,int threshold_score,Univ_IIT_T chromosome_iit,int querylength,int trimstart,int trimend,bool plusp,int genestrand)3308 find_good_paths (List_T gregionlist, int nexons, int *prev, int *scores,
3309 		 struct Segment_T *segments, int nsegments, int threshold_score,
3310 		 Univ_IIT_T chromosome_iit, int querylength,
3311 		 int trimstart, int trimend, bool plusp, int genestrand) {
3312   int bestj, besti;
3313   int querystart, queryend;
3314   Univcoord_T genomicstart, genomicend;
3315 
3316   debug9(printf("Starting find_good_paths with %d segments and threshold_score %d\n",nsegments,threshold_score));
3317 
3318   for (bestj = nsegments-1; bestj >= 0; bestj--) {
3319     debug9(printf("#%d: diagonal %u\tpathscore %d\tprev %d\n",
3320 		  bestj,segments[bestj].diagonal,scores[bestj],prev[bestj]));
3321     if (scores[bestj] >= threshold_score) {
3322       /* Traceback and wipe out scores along the way, so we don't use them again */
3323       besti = bestj;
3324       debug9(printf("*diagonal %u\tquerypos %d..%d\tscore %d\tnoligomers %d\n",
3325 		    segments[besti].diagonal,segments[besti].querypos5,segments[besti].querypos3,
3326 		    segments[besti].score,segments[besti].noligomers));
3327       while (prev[besti] >= 0) {
3328 	besti = prev[besti];
3329 	/* scores[besti] = 0; -- wipe out */
3330 	debug9(printf("*diagonal %u\tquerypos %d..%d\tscore %d\tnoligomers %d\n",
3331 		      segments[besti].diagonal,segments[besti].querypos5,segments[besti].querypos3,
3332 		      segments[besti].score,segments[besti].noligomers));
3333       }
3334       if (plusp == true) {
3335 	querystart = segments[besti].querypos5;
3336 	queryend = segments[bestj].querypos3;
3337 	if (segments[besti].diagonal + querystart > (Univcoord_T) querylength) {
3338 	  genomicstart = segments[besti].diagonal + querystart - querylength;
3339 	  genomicend = segments[bestj].diagonal + queryend - querylength;
3340 	  debug9(printf("Pushing gregion for %d plus: %d..%d: %u..%u, score=%d\n",
3341 			bestj,querystart,queryend,genomicstart,genomicend,scores[bestj]));
3342 #ifdef PMAP
3343 	  gregionlist = List_push(gregionlist,Gregion_new(nexons,genomicstart,genomicend,/*plusp*/true,genestrand,
3344 							  chromosome_iit,querystart,queryend,querylength,
3345 							  /*matchsize*/index1part_aa,trimstart,trimend,
3346 							  circular_typeint));
3347 #else
3348 	  gregionlist = List_push(gregionlist,Gregion_new(nexons,genomicstart,genomicend,/*plusp*/true,genestrand,
3349 							  chromosome_iit,querystart,queryend,querylength,
3350 							  /*matchsize*/index1part,trimstart,trimend,
3351 							  circular_typeint));
3352 #endif
3353 	}
3354       } else if (plusp == false) {
3355 	querystart = segments[bestj].querypos5;
3356 	queryend = segments[besti].querypos3;
3357 	if (segments[besti].diagonal > (Univcoord_T) queryend) {
3358 	  genomicstart = segments[besti].diagonal - queryend;
3359 	  genomicend = segments[bestj].diagonal - querystart;
3360 	  debug9(printf("Pushing gregion for %d minus: %d..%d: %u..%u, score=%d\n",
3361 			bestj,querystart,queryend,genomicstart,genomicend,scores[bestj]));
3362 #ifdef PMAP
3363 	  gregionlist = List_push(gregionlist,Gregion_new(nexons,genomicstart,genomicend,/*plusp*/false,genestrand,
3364 							  chromosome_iit,querystart,queryend,querylength,
3365 							  /*matchsize*/index1part_aa,trimstart,trimend,
3366 							  circular_typeint));
3367 #else
3368 	  gregionlist = List_push(gregionlist,Gregion_new(nexons,genomicstart,genomicend,/*plusp*/false,genestrand,
3369 							  chromosome_iit,querystart,queryend,querylength,
3370 							  /*matchsize*/index1part,trimstart,trimend,
3371 							  circular_typeint));
3372 #endif
3373 	}
3374       } else {
3375 	fprintf(stderr,"Got strange value %d for plusp\n",plusp);
3376 	abort();
3377       }
3378     }
3379   }
3380 
3381   debug9(printf("Returning %d gregions\n",List_length(gregionlist)));
3382   return gregionlist;
3383 }
3384 
3385 
3386 
3387 /* Probably don't want to iterate on 18-mers or shorter oligomers.  If
3388    we can't find a matching pair of 24-mers, it must be cross-species,
3389    so we should rely on our other method */
3390 
3391 static List_T
scan_ends(bool * shortseqp,bool second_pass_p,List_T oldlist,T this,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int stutterhits,Diagnostic_T diagnostic,bool iteratep)3392 scan_ends (bool *shortseqp, bool second_pass_p, List_T oldlist, T this,
3393 	   Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
3394 	   Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
3395 	   Matchpool_T matchpool, int stutterhits, Diagnostic_T diagnostic, bool iteratep) {
3396   List_T newlist = NULL;
3397   double dangling5_pct, dangling3_pct;
3398   List_T dangling5, dangling3;
3399   bool foundpairp = false, loopp = true;
3400   Width_T matchsize;
3401 
3402   debug(printf("Starting scan_ends\n"));
3403 
3404 #ifdef PMAP
3405   matchsize = this->oligosize + this->oligosize;
3406 #else
3407   if (index1part < 12) {
3408     matchsize = index1part + index1part;
3409   } else {
3410     /* Originally set for 12-mers, but now allow for 13, 14, and 15-mers */
3411     matchsize = index1part + 12;
3412   }
3413 #endif
3414 
3415   /* Handle short query sequences */
3416   if (second_pass_p == true) {
3417     /* Use standard matchsize.  Can get higher specificity with longer matchsizes. */
3418   } else {
3419     /* Use small matchsize.  Can get higher sensitivity with smaller matchsizes. */
3420     *shortseqp = false;
3421     while (matchsize > this->querylength/4) {
3422       *shortseqp = true;
3423       matchsize -= 6;
3424     }
3425   }
3426   if (matchsize < this->oligosize) {
3427     matchsize = this->oligosize;
3428   }
3429 
3430   while (loopp && matchsize >= this->oligosize && foundpairp == false) {
3431     newlist = find_first_pair(&foundpairp,newlist,this,matchsize,
3432 			      indexdb_fwd,indexdb_rev,genestrand,chromosome_iit,
3433 			      chrsubset_start,chrsubset_end,matchpool);
3434     if (matchsize == this->oligosize) {
3435       loopp = false;
3436     } else if (foundpairp == false) {
3437 #ifdef PMAP
3438       matchsize -= this->oligosize;
3439 #else
3440       matchsize -= 6;  /* Originally set for 12-mers to be this->oligosize/2, but now allow for 13..15-mers */
3441       if (matchsize < this->oligosize) {
3442 	/* Allows for 24, 18, 12..15 when we have 12..15-mers */
3443 	matchsize = this->oligosize;
3444       }
3445 #endif
3446       stutterhits *= 2;
3447     }
3448     if (iteratep == false) {
3449       loopp = false;
3450     }
3451   }
3452 
3453   if (foundpairp == false) {
3454     debug(printf("*** No pair found ***\n"));
3455 
3456     diagnostic->firstpair_found_5 = 0;
3457     diagnostic->firstpair_found_3 = 0;
3458     diagnostic->stutter_searched_5 = diagnostic->firstpair_found_5;
3459     diagnostic->stutter_searched_3 = diagnostic->firstpair_found_3;
3460     diagnostic->stutter_nmatchpairs = 0;
3461 
3462     diagnostic->stutter_matches_5 = List_length(this->matches5);
3463     diagnostic->stutter_matches_3 = List_length(this->matches3);
3464     diagnostic->dangling_5 = count_dangling(this->matches5);
3465     diagnostic->dangling_3 = count_dangling(this->matches3);
3466     diagnostic->dangling_nmatchpairs = 0;
3467 
3468   } else {
3469     debug(printf("*** Found pair ***\n"));
3470     diagnostic->firstpair_found_5 = Block_querypos(this->block5);
3471     diagnostic->firstpair_found_3 = Block_querypos(this->block3);
3472 
3473     newlist = stutter(newlist,this,matchsize,indexdb_fwd,indexdb_rev,
3474 		      genestrand,chromosome_iit,chrsubset_start,chrsubset_end,matchpool,stutterhits);
3475 
3476     diagnostic->stutter_searched_5 = Block_querypos(this->block5);
3477     diagnostic->stutter_searched_3 = Block_querypos(this->block3);
3478     diagnostic->stutter_nmatchpairs = List_length(newlist);
3479 
3480     diagnostic->stutter_matches_5 = List_length(this->matches5);
3481     diagnostic->stutter_matches_3 = List_length(this->matches3);
3482     diagnostic->dangling_5 = count_dangling(this->matches5);
3483     diagnostic->dangling_3 = count_dangling(this->matches3);
3484 
3485     dangling5_pct = dangling_pct(this->matches5);
3486     dangling3_pct = dangling_pct(this->matches3);
3487     debug(printf("Dangling on 5' end: %d/%d\n",
3488 		 count_dangling(this->matches5),List_length(this->matches5)));
3489     debug(printf("Dangling on 3' end: %d/%d\n",
3490 		 count_dangling(this->matches3),List_length(this->matches3)));
3491 
3492     if (dangling5_pct > MAX_DANGLING_PCT) {
3493       dangling5 = get_dangling(this->matches5,matchpool);
3494       newlist = fill_in_3(newlist,this,matchsize,dangling5,indexdb_fwd,indexdb_rev,
3495 			  genestrand,chromosome_iit,chrsubset_start,chrsubset_end,matchpool);
3496       /* Not necessary to free */
3497       /* List_free(&dangling5); */
3498     }
3499 
3500     if (dangling3_pct > MAX_DANGLING_PCT) {
3501       dangling3 = get_dangling(this->matches3,matchpool);
3502       newlist = fill_in_5(newlist,this,matchsize,dangling3,indexdb_fwd,indexdb_rev,
3503 			  genestrand,chromosome_iit,chrsubset_start,chrsubset_end,matchpool);
3504       /* Not necessary to free */
3505       /* List_free(&dangling3); */
3506     }
3507 
3508 #if 0
3509     newlist = Gregion_filter_unique(newlist);
3510 #endif
3511     diagnostic->dangling_nmatchpairs = List_length(newlist);
3512   }
3513 
3514   debug(printf("Returning %d elements in newlist\n",List_length(newlist)));
3515   return List_append(newlist,oldlist);
3516 }
3517 
3518 
3519 #if 0
3520 static bool
3521 sufficient_gregion_p (List_T gregionlist) {
3522   List_T p;
3523   Gregion_T gregion;
3524 
3525   debug(printf("Checking for sufficient gregion on %d gregions\n",List_length(gregionlist)));
3526   for (p = gregionlist; p != NULL; p = List_next(p)) {
3527     gregion = (Gregion_T) List_head(p);
3528     if (Gregion_sufficient_support(gregion) == true) {
3529       debug(printf("Sufficient support found\n"));
3530       return true;
3531     }
3532   }
3533 
3534   debug(printf("Sufficient support not found\n"));
3535   return false;
3536 }
3537 #endif
3538 
3539 
3540 List_T
Stage1_compute(bool * lowidentityp,Sequence_T queryuc,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,int genestrand,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int stutterhits,Diagnostic_T diagnostic,Stopwatch_T stopwatch,int nbest)3541 Stage1_compute (bool *lowidentityp, Sequence_T queryuc,
3542 		Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev, int genestrand,
3543 		Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
3544 		Matchpool_T matchpool, int stutterhits, Diagnostic_T diagnostic, Stopwatch_T stopwatch,
3545 		int nbest) {
3546   List_T gregionlist = NULL, p, q;
3547   T this = NULL;
3548   int trimlength, trimstart, trimend, maxentries, i;
3549   /* Width_T matchsize; */
3550   Chrpos_T maxtotallen;
3551   Univcoord_T extension5, extension3;
3552   Gregion_T gregion;
3553   bool shortseqp;
3554 
3555   struct Segment_T *plus_segments = NULL, *minus_segments = NULL;
3556   int plus_nsegments = 0, minus_nsegments = 0, maxexons, k;
3557   int nthbest[MAXEXONS+1], bestscores[MAXEXONS+1];
3558   int *plus_prev[MAXEXONS+1], *minus_prev[MAXEXONS+1], *plus_scores[MAXEXONS+1], *minus_scores[MAXEXONS+1];
3559 
3560 
3561   *lowidentityp = false;
3562 #ifdef DEBUG
3563   global_chromosome_iit = chromosome_iit;
3564 #endif
3565 
3566 #ifdef PMAP
3567   if (Sequence_fulllength_given(queryuc) < index1part_aa) {
3568     return (List_T) NULL;
3569   }
3570 #else
3571   if (Sequence_fulllength(queryuc) < index1part) {
3572     return (List_T) NULL;
3573   }
3574 #endif
3575 
3576 
3577   Stopwatch_start(stopwatch);
3578   diagnostic->sampling_rounds = 0;
3579   diagnostic->sampling_nskip = 0;
3580 
3581   debug(queryuc_ptr = Sequence_fullpointer(queryuc));
3582 
3583   /* Don't multiply trimlength by 3 in PMAP */
3584   trimlength = Sequence_trimlength(queryuc);
3585   trimstart = Sequence_trim_start(queryuc);
3586   trimend = Sequence_trim_end(queryuc);
3587   debug(printf("At start of Stage1_compute, we have trimstart %d, trimend %d\n",trimstart,trimend));
3588 
3589   if (trimlength <= SINGLEEXONLENGTH) {
3590     maxtotallen = 40 + trimlength;
3591   } else {
3592     maxtotallen = trimlength*SLOPE;
3593     if (maxtotallen < 10000) {
3594       maxtotallen = 10000;
3595     } else if (maxtotallen > maxtotallen_bound) {
3596       maxtotallen = maxtotallen_bound;
3597     }
3598   }
3599 
3600   debug(fprintf(stderr,"trimlength = %d, maxtotallen = %d\n",trimlength,maxtotallen));
3601 
3602   /* Scan ends (find first pair and stutter) */
3603 #ifdef PMAP
3604   /* matchsize = index1part_aa + index1part_aa; */
3605 #else
3606   /* matchsize = 24 */ /* Was index1part + index1part.  matchsize now set in scan_ends */;
3607 #endif
3608   maxentries = MAXENTRIES;
3609 
3610   debug(printf("Finding first pair with maxentries = %d\n",maxentries));
3611 
3612   this = Stage1_new(queryuc,maxtotallen,maxentries);
3613   read_oligos(this,queryuc,genestrand);
3614 #ifdef PMAP
3615   identify_repeated_oligos(this,/*oligobase*/index1part_aa,this->querylength);
3616 #else
3617   identify_repeated_oligos(this,/*oligobase*/index1part,this->querylength);
3618 #endif
3619 #ifdef SCAN_ENDS
3620   gregionlist = scan_ends(&shortseqp,/*second_pass_p*/false,gregionlist,this,
3621 			  indexdb_fwd,indexdb_rev,/*genestrand*/0,chromosome_iit,chrsubset_start,chrsubset_end,
3622 			  matchpool,stutterhits,diagnostic,/*iteratep*/false);
3623   debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this->matches5),dangling_pct(this->matches3)));
3624   if (shortseqp == true) {
3625     gregionlist = scan_ends(&shortseqp,/*second_pass_p*/true,gregionlist,this,
3626 			    indexdb_fwd,indexdb_rev,/*genestrand*/0,chromosome_iit,chrsubset_start,chrsubset_end,
3627 			    matchpool,stutterhits,diagnostic,/*iteratep*/false);
3628     debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this->matches5),dangling_pct(this->matches3)));
3629   }
3630 #endif
3631 
3632   if (gregionlist == NULL) {
3633     /* Don't use dangling to determine lowidentityp */
3634     *lowidentityp = true;
3635   }
3636 
3637   debug(printf("\nAfter scan_ends:\n"));
3638   debug(
3639 	for (p = gregionlist; p != NULL; p = List_next(p)) {
3640 	  gregion = (Gregion_T) List_head(p);
3641 	  Gregion_print(gregion);
3642 	}
3643 	);
3644   debug(printf("End of scan_ends.\n\n"));
3645 
3646   /* Perform sampling, if necessary */
3647   if (gregionlist == NULL ||
3648       (Gregion_best_weight(gregionlist) < SUFFICIENT_FIRST_WEIGHT &&
3649        dangling_pct(this->matches5) > MAX_DANGLING_PCT &&
3650        dangling_pct(this->matches3) > MAX_DANGLING_PCT)) {
3651 
3652 #if 0
3653     /* Start over by clearing gregionlist */
3654     for (p = gregionlist; p != NULL; p = List_next(p)) {
3655       gregion = (Gregion_T) List_head(p);
3656       Gregion_free(&gregion);
3657     }
3658     List_free(&gregionlist);
3659     gregionlist = (List_T) NULL;
3660 #endif
3661 
3662     debug(printf("Starting sample_oligos\n"));
3663 #ifdef PMAP
3664     sample_oligos(this,indexdb_fwd,indexdb_rev,this->querylength,/*oligobase*/index1part_aa);
3665     collapse_diagonals(this->plus_positions,this->plus_npositions,
3666 		       /*oligobase*/index1part_aa,this->querylength);
3667     plus_segments = find_segments(&plus_nsegments,this->plus_positions,this->plus_npositions,
3668 				  /*oligobase*/index1part_aa,this->querylength,
3669 				  /*threshold_score*/27);
3670     collapse_diagonals(this->minus_positions,this->minus_npositions,
3671 		       /*oligobase*/index1part_aa,this->querylength);
3672     minus_segments = find_segments(&minus_nsegments,this->minus_positions,this->minus_npositions,
3673 				   /*oligobase*/index1part_aa,this->querylength,
3674 				   /*threshold_score*/27);
3675 #else
3676     sample_oligos_nolimit(this,indexdb_fwd,indexdb_rev,this->querylength,/*oligobase*/index1part);
3677     collapse_diagonals(this->plus_positions,this->plus_npositions,
3678 		       /*oligobase*/index1part,this->querylength);
3679     plus_segments = find_segments(&plus_nsegments,this->plus_positions,this->plus_npositions,
3680 				  /*oligobase*/index1part,this->querylength,
3681 				  /*threshold_score*/18);
3682 
3683     collapse_diagonals(this->minus_positions,this->minus_npositions,
3684 		       /*oligobase*/index1part,this->querylength);
3685     minus_segments = find_segments(&minus_nsegments,this->minus_positions,this->minus_npositions,
3686 				   /*oligobase*/index1part,this->querylength,
3687 				   /*threshold_score*/18);
3688 #endif
3689 
3690     maxexons = 3;
3691     compute_paths(plus_prev,plus_scores,plus_segments,plus_nsegments,/*maxexons*/MAXEXONS,/*plusp*/true);
3692     compute_paths(minus_prev,minus_scores,minus_segments,minus_nsegments,/*maxexons*/MAXEXONS,/*plusp*/false);
3693     find_best_scores(nthbest,bestscores,plus_scores,/*plus_nscores*/plus_nsegments,
3694 		     minus_scores,/*minus_nscores*/minus_nsegments,maxexons,/*n*/nbest);
3695 
3696     for (k = 1; k <= maxexons; k++) {
3697       if (nthbest[k] < bestscores[k] - SUBOPTIMAL) {
3698 	nthbest[k] = bestscores[k] - SUBOPTIMAL;
3699       }
3700       if (plus_nsegments > 0) {
3701 	gregionlist = find_good_paths(gregionlist,/*nexons*/k,plus_prev[k],plus_scores[k],
3702 				      plus_segments,plus_nsegments,/*threshold_score*/nthbest[k],
3703 				      chromosome_iit,this->querylength,
3704 				      trimstart,trimend,/*plusp*/true,/*genestrand*/0);
3705       }
3706       if (minus_nsegments > 0) {
3707 	gregionlist = find_good_paths(gregionlist,/*nexons*/k,minus_prev[k],minus_scores[k],
3708 				      minus_segments,minus_nsegments,/*threshold_score*/nthbest[k],
3709 				      chromosome_iit,this->querylength,
3710 				      trimstart,trimend,/*plusp*/false,/*genestrand*/0);
3711       }
3712     }
3713 
3714     if (minus_nsegments > 0) {
3715       for (k = 1; k <= maxexons; k++) {
3716 	FREE(minus_scores[k]);
3717 	FREE(minus_prev[k]);
3718       }
3719     }
3720     if (plus_nsegments > 0) {
3721       for (k = 1; k <= maxexons; k++) {
3722 	FREE(plus_scores[k]);
3723 	FREE(plus_prev[k]);
3724       }
3725     }
3726     FREE(minus_segments);
3727     FREE(plus_segments);
3728   }
3729 
3730   /* Clean up gregionlist */
3731   debug(printf("Starting extensions on %d gregions\n",List_length(gregionlist)));
3732   for (p = gregionlist; p != NULL; p = List_next(p)) {
3733     gregion = (Gregion_T) List_head(p);
3734     if (Gregion_extendedp(gregion) == false) {
3735       /* Need to extend, otherwise we won't align NM_003360 */
3736       find_extensions(&extension5,&extension3,this,gregion,this->maxtotallen,/*continuousp*/false);
3737       Gregion_extend(gregion,extension5,extension3,this->querylength);
3738     }
3739   }
3740   debug(printf("Finished with extensions\n"));
3741 
3742   debug(printf("Before filtering, %d regions\n",List_length(gregionlist)));
3743   debug(
3744 	for (p = gregionlist; p != NULL; p = List_next(p)) {
3745 	  gregion = (Gregion_T) List_head(p);
3746 	  Gregion_print(gregion);
3747 	}
3748 	);
3749 
3750 #if 0
3751   /* Don't filter for support anymore.  Causes loss of good regions. */
3752   gregionlist = Gregion_filter_support(gregionlist,BOUNDARY_SUPPORT,PCT_MAX_SUPPORT,DIFF_MAX_SUPPORT);
3753   debug(printf("After filtering for support, %d regions\n",List_length(gregionlist)));
3754 #endif
3755 
3756 #if 0
3757   /* Don't filter for max_regions anymore */
3758   if (List_length(gregionlist) > MAX_GREGIONS_PRE_UNIQUE) {
3759     debug("Too many gregions, so erasing them\n");
3760     for (p = gregionlist; p != NULL; p = List_next(p)) {
3761       gregion = (Gregion_T) List_head(p);
3762       Gregion_free(&gregion);
3763     }
3764     List_free(&gregionlist);
3765     gregionlist = NULL;
3766 
3767   } else {
3768 #endif
3769 
3770 
3771 #ifdef USE_CLEAN
3772     Gregion_filter_clean(gregionlist,nchrs);
3773 #endif
3774 
3775 #if 0
3776     for (p = gregionlist; p != NULL; p = p->rest) {
3777       gregion = (Gregion_T) List_head(p);
3778       Gregion_print(gregion);
3779     }
3780 #endif
3781 
3782     debug0(printf("Before filtering for unique, %d regions\n",List_length(gregionlist)));
3783     debug0(
3784 	  for (p = gregionlist; p != NULL; p = List_next(p)) {
3785 	    gregion = (Gregion_T) List_head(p);
3786 	    Gregion_print(gregion);
3787 	  }
3788 	  );
3789 
3790     gregionlist = Gregion_filter_unique(gregionlist);
3791     debug0(printf("After filtering for unique, %d regions\n",List_length(gregionlist)));
3792     debug0(
3793 	  for (p = gregionlist; p != NULL; p = List_next(p)) {
3794 	    gregion = (Gregion_T) List_head(p);
3795 	    Gregion_print(gregion);
3796 	  }
3797 	  );
3798 
3799     debug0(
3800 	  if (List_length(gregionlist) > MAX_GREGIONS_POST_UNIQUE) {
3801 	    printf("Too many gregions %d, so taking the top %d\n",List_length(gregionlist),MAX_GREGIONS_POST_UNIQUE);
3802 	  });
3803 
3804     for (p = gregionlist, i = 1; p != NULL && i < MAX_GREGIONS_POST_UNIQUE; p = List_next(p)) {
3805       /* Advance */
3806       i++;
3807     }
3808     if (p != NULL) {
3809       q = List_next(p);
3810       p->rest = (List_T) NULL;
3811       for (p = q; p != NULL; p = List_next(p)) {
3812 	gregion = (Gregion_T) List_head(p);
3813 	Gregion_free(&gregion);
3814       }
3815       List_free(&q);
3816     }
3817 #if 0
3818   }
3819 #endif
3820 
3821   diagnostic->stage1_runtime = Stopwatch_stop(stopwatch);
3822   diagnostic->ngregions = List_length(gregionlist);
3823 
3824   Stage1_free(&this);
3825 
3826   debug0(printf("Stage 1 returning %d new regions\n",List_length(gregionlist)));
3827   debug0(
3828 	 for (p = gregionlist; p != NULL; p = List_next(p)) {
3829 	   gregion = (Gregion_T) List_head(p);
3830 	   Gregion_print(gregion);
3831 	 }
3832 	 );
3833 
3834   return gregionlist;
3835 }
3836 
3837 
3838 
3839 List_T
Stage1_compute_nonstranded(bool * lowidentityp,Sequence_T queryuc,Indexdb_T indexdb_fwd,Indexdb_T indexdb_rev,Univ_IIT_T chromosome_iit,Univcoord_T chrsubset_start,Univcoord_T chrsubset_end,Matchpool_T matchpool,int stutterhits,Diagnostic_T diagnostic,Stopwatch_T stopwatch,int nbest)3840 Stage1_compute_nonstranded (bool *lowidentityp, Sequence_T queryuc,
3841 			    Indexdb_T indexdb_fwd, Indexdb_T indexdb_rev,
3842 			    Univ_IIT_T chromosome_iit, Univcoord_T chrsubset_start, Univcoord_T chrsubset_end,
3843 			    Matchpool_T matchpool, int stutterhits, Diagnostic_T diagnostic, Stopwatch_T stopwatch,
3844 			    int nbest) {
3845   List_T gregionlist = NULL, p, q;
3846   T this_fwd = NULL, this_rev = NULL;
3847   Sequence_T queryrc;
3848   int trimlength, trimstart, trimend, maxentries, i;
3849   Chrpos_T maxtotallen;
3850   Univcoord_T extension5, extension3;
3851   Gregion_T gregion;
3852   bool shortseqp;
3853 
3854   struct Segment_T *plus_segments_fwd = NULL, *minus_segments_fwd = NULL, *plus_segments_rev = NULL, *minus_segments_rev = NULL;
3855   int plus_nsegments_fwd = 0, minus_nsegments_fwd = 0, plus_nsegments_rev = 0, minus_nsegments_rev = 0, maxexons, k;
3856   int nthbest[MAXEXONS+1], bestscores[MAXEXONS+1];
3857 
3858   int *plus_prev_fwd[MAXEXONS+1], *plus_scores_fwd[MAXEXONS+1], *minus_prev_fwd[MAXEXONS+1], *minus_scores_fwd[MAXEXONS+1];
3859   int *plus_prev_rev[MAXEXONS+1], *plus_scores_rev[MAXEXONS+1], *minus_prev_rev[MAXEXONS+1], *minus_scores_rev[MAXEXONS+1];
3860 
3861   *lowidentityp = false;
3862 #ifdef DEBUG
3863   global_chromosome_iit = chromosome_iit;
3864 #endif
3865 
3866 #ifdef PMAP
3867   if (Sequence_fulllength_given(queryuc) < index1part_aa) {
3868     return (List_T) NULL;
3869   }
3870 #else
3871   if (Sequence_fulllength(queryuc) < index1part) {
3872     return (List_T) NULL;
3873   }
3874 #endif
3875 
3876   queryrc = Sequence_revcomp(queryuc);
3877 
3878   Stopwatch_start(stopwatch);
3879   diagnostic->sampling_rounds = 0;
3880   diagnostic->sampling_nskip = 0;
3881 
3882   debug(queryuc_ptr = Sequence_fullpointer(queryuc));
3883 
3884   /* FWD */
3885 
3886 #ifdef PMAP
3887   /* matchsize = index1part_aa + index1part_aa; */
3888 #else
3889   /* matchsize = 24 */ /* Was index1part + index1part.  matchsize now set in scan_ends */;
3890 #endif
3891   maxentries = MAXENTRIES;
3892 
3893 
3894   /* Don't multiply trimlength by 3 in PMAP */
3895   trimlength = Sequence_trimlength(queryuc);
3896   trimstart = Sequence_trim_start(queryuc);
3897   trimend = Sequence_trim_end(queryuc);
3898   debug(printf("At start of Stage1_compute, we have trimstart %d, trimend %d\n",trimstart,trimend));
3899 
3900   if (trimlength <= SINGLEEXONLENGTH) {
3901     maxtotallen = 40 + trimlength;
3902   } else {
3903     maxtotallen = trimlength*SLOPE;
3904     if (maxtotallen < 10000) {
3905       maxtotallen = 10000;
3906     } else if (maxtotallen > maxtotallen_bound) {
3907       maxtotallen = maxtotallen_bound;
3908     }
3909   }
3910 
3911   debug(fprintf(stderr,"trimlength = %d, maxtotallen = %d\n",trimlength,maxtotallen));
3912 
3913   /* Scan ends (find first pair and stutter) */
3914   debug(printf("Finding first pair, fwd, with maxentries = %d\n",maxentries));
3915 
3916   this_fwd = Stage1_new(queryuc,maxtotallen,maxentries);
3917   read_oligos(this_fwd,queryuc,/*genestrand*/+1);
3918 #ifdef PMAP
3919   identify_repeated_oligos(this_fwd,/*oligobase*/index1part_aa,this_fwd->querylength);
3920 #else
3921   identify_repeated_oligos(this_fwd,/*oligobase*/index1part,this_fwd->querylength);
3922 #endif
3923 
3924 #ifdef SCAN_ENDS
3925   gregionlist = scan_ends(&shortseqp,/*second_pass_p*/false,gregionlist,this_fwd,
3926 			  indexdb_fwd,indexdb_fwd,/*genestrand*/+1,chromosome_iit,chrsubset_start,chrsubset_end,
3927 			  matchpool,stutterhits,diagnostic,/*iteratep*/false);
3928   debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this_fwd->matches5),dangling_pct(this_fwd->matches3)));
3929   if (shortseqp == true) {
3930     gregionlist = scan_ends(&shortseqp,/*second_pass_p*/true,gregionlist,this_fwd,
3931 			    indexdb_fwd,indexdb_fwd,/*genestrand*/+1,chromosome_iit,chrsubset_start,chrsubset_end,
3932 			    matchpool,stutterhits,diagnostic,/*iteratep*/false);
3933     debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this_fwd->matches5),dangling_pct(this_fwd->matches3)));
3934   }
3935 #endif
3936 
3937   debug(printf("\nAfter scan_ends, fwd:\n"));
3938   debug(
3939 	for (p = gregionlist; p != NULL; p = List_next(p)) {
3940 	  gregion = (Gregion_T) List_head(p);
3941 	  Gregion_print(gregion);
3942 	}
3943 	);
3944   debug(printf("End of scan_ends, fwd.\n\n"));
3945 
3946 
3947   /* REV */
3948 
3949   /* Don't multiply trimlength by 3 in PMAP */
3950   trimlength = Sequence_trimlength(queryrc);
3951   trimstart = Sequence_trim_start(queryrc);
3952   trimend = Sequence_trim_end(queryrc);
3953   debug(printf("At start of Stage1_compute, we have trimstart %d, trimend %d\n",trimstart,trimend));
3954 
3955   if (trimlength <= SINGLEEXONLENGTH) {
3956     maxtotallen = 40 + trimlength;
3957   } else {
3958     maxtotallen = trimlength*SLOPE;
3959     if (maxtotallen < 10000) {
3960       maxtotallen = 10000;
3961     } else if (maxtotallen > maxtotallen_bound) {
3962       maxtotallen = maxtotallen_bound;
3963     }
3964   }
3965 
3966   debug(fprintf(stderr,"trimlength = %d, maxtotallen = %d\n",trimlength,maxtotallen));
3967 
3968   /* Scan ends (find first pair and stutter) */
3969   debug(printf("Finding first pair, rev, with maxentries = %d\n",maxentries));
3970 
3971   this_rev = Stage1_new(queryrc,maxtotallen,maxentries);
3972   read_oligos(this_rev,queryrc,/*genestrand*/+2);
3973 #ifdef PMAP
3974   identify_repeated_oligos(this_rev,/*oligobase*/index1part_aa,this_rev->querylength);
3975 #else
3976   identify_repeated_oligos(this_rev,/*oligobase*/index1part,this_rev->querylength);
3977 #endif
3978 
3979 #ifdef SCAN_ENDS
3980   gregionlist = scan_ends(&shortseqp,/*second_pass_p*/false,gregionlist,this_rev,
3981 			  indexdb_rev,indexdb_rev,/*genestrand*/+2,chromosome_iit,chrsubset_start,chrsubset_end,
3982 			  matchpool,stutterhits,diagnostic,/*iteratep*/false);
3983   debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this_rev->matches5),dangling_pct(this_rev->matches3)));
3984   if (shortseqp == true) {
3985     gregionlist = scan_ends(&shortseqp,/*second_pass_p*/true,gregionlist,this_rev,
3986 			    indexdb_rev,indexdb_rev,/*genestrand*/+2,chromosome_iit,chrsubset_start,chrsubset_end,
3987 			    matchpool,stutterhits,diagnostic,/*iteratep*/false);
3988     debug(printf("\nDangling5 = %f, Dangling3 = %f\n",dangling_pct(this_rev->matches5),dangling_pct(this_rev->matches3)));
3989   }
3990 #endif
3991 
3992   debug(printf("\nAfter scan_ends, rev:\n"));
3993   debug(
3994 	for (p = gregionlist; p != NULL; p = List_next(p)) {
3995 	  gregion = (Gregion_T) List_head(p);
3996 	  Gregion_print(gregion);
3997 	}
3998 	);
3999   debug(printf("End of scan_ends, rev.\n\n"));
4000 
4001 
4002   if (gregionlist == NULL) {
4003     /* Don't use dangling to determine lowidentityp */
4004     *lowidentityp = true;
4005   }
4006 
4007 #if 0
4008   /* Perform sampling, if necessary */
4009   if (gregionlist == NULL || Gregion_best_weight(gregionlist) < SUFFICIENT_FIRST_WEIGHT) {
4010     if (dangling_pct(this_fwd->matches5) > MAX_DANGLING_PCT &&
4011 	dangling_pct(this_fwd->matches3) > MAX_DANGLING_PCT) {
4012 #endif
4013 
4014 #ifdef PMAP
4015       debug(printf("Starting sample_oligos, fwd\n"));
4016       sample_oligos_nolimit(this_fwd,indexdb_fwd,indexdb_fwd,this_fwd->querylength,/*oligobase*/index1part_aa);
4017       collapse_diagonals(this_fwd->plus_positions,this_fwd->plus_npositions,
4018 			 /*oligobase*/index1part_aa,this_fwd->querylength);
4019       plus_segments_fwd = find_segments(&plus_nsegments_fwd,this_fwd->plus_positions,this_fwd->plus_npositions,
4020 					/*oligobase*/index1part_aa,this_fwd->querylength,
4021 					/*threshold_score*/18);
4022 
4023       collapse_diagonals(this_fwd->minus_positions,this_fwd->minus_npositions,
4024 			 /*oligobase*/index1part_aa,this_fwd->querylength);
4025       minus_segments_fwd = find_segments(&minus_nsegments_fwd,this_fwd->minus_positions,this_fwd->minus_npositions,
4026 					 /*oligobase*/index1part_aa,this_fwd->querylength,
4027 					 /*threshold_score*/18);
4028 #else
4029       debug(printf("Starting sample_oligos, fwd\n"));
4030       sample_oligos_nolimit(this_fwd,indexdb_fwd,indexdb_fwd,this_fwd->querylength,/*oligobase*/index1part);
4031       collapse_diagonals(this_fwd->plus_positions,this_fwd->plus_npositions,
4032 			 /*oligobase*/index1part,this_fwd->querylength);
4033       plus_segments_fwd = find_segments(&plus_nsegments_fwd,this_fwd->plus_positions,this_fwd->plus_npositions,
4034 					/*oligobase*/index1part,this_fwd->querylength,
4035 					/*threshold_score*/18);
4036 
4037       collapse_diagonals(this_fwd->minus_positions,this_fwd->minus_npositions,
4038 			 /*oligobase*/index1part,this_fwd->querylength);
4039       minus_segments_fwd = find_segments(&minus_nsegments_fwd,this_fwd->minus_positions,this_fwd->minus_npositions,
4040 					 /*oligobase*/index1part,this_fwd->querylength,
4041 					 /*threshold_score*/18);
4042 #endif
4043 
4044       maxexons = 3;
4045       compute_paths(plus_prev_fwd,plus_scores_fwd,plus_segments_fwd,plus_nsegments_fwd,/*maxexons*/MAXEXONS,/*plusp*/true);
4046       compute_paths(minus_prev_fwd,minus_scores_fwd,minus_segments_fwd,minus_nsegments_fwd,/*maxexons*/MAXEXONS,/*plusp*/false);
4047 
4048 #if 0
4049     }
4050 #endif
4051 
4052 #if 0
4053     if (dangling_pct(this_rev->matches5) > MAX_DANGLING_PCT &&
4054 	dangling_pct(this_rev->matches3) > MAX_DANGLING_PCT) {
4055 #endif
4056 
4057 #ifdef PMAP
4058       debug(printf("Starting sample_oligos, rev\n"));
4059       sample_oligos_nolimit(this_rev,indexdb_rev,indexdb_rev,this_rev->querylength,/*oligobase*/index1part_aa);
4060       collapse_diagonals(this_rev->plus_positions,this_rev->plus_npositions,
4061 			 /*oligobase*/index1part_aa,this_rev->querylength);
4062       plus_segments_rev = find_segments(&plus_nsegments_rev,this_rev->plus_positions,this_rev->plus_npositions,
4063 					/*oligobase*/index1part_aa,this_rev->querylength,
4064 					/*threshold_score*/18);
4065 
4066       collapse_diagonals(this_rev->minus_positions,this_rev->minus_npositions,
4067 			 /*oligobase*/index1part_aa,this_rev->querylength);
4068       minus_segments_rev = find_segments(&minus_nsegments_rev,this_rev->minus_positions,this_rev->minus_npositions,
4069 					 /*oligobase*/index1part_aa,this_rev->querylength,
4070 					 /*threshold_score*/18);
4071 #else
4072       debug(printf("Starting sample_oligos, rev\n"));
4073       sample_oligos_nolimit(this_rev,indexdb_rev,indexdb_rev,this_rev->querylength,/*oligobase*/index1part);
4074       collapse_diagonals(this_rev->plus_positions,this_rev->plus_npositions,
4075 			 /*oligobase*/index1part,this_rev->querylength);
4076       plus_segments_rev = find_segments(&plus_nsegments_rev,this_rev->plus_positions,this_rev->plus_npositions,
4077 					/*oligobase*/index1part,this_rev->querylength,
4078 					/*threshold_score*/18);
4079 
4080       collapse_diagonals(this_rev->minus_positions,this_rev->minus_npositions,
4081 			 /*oligobase*/index1part,this_rev->querylength);
4082       minus_segments_rev = find_segments(&minus_nsegments_rev,this_rev->minus_positions,this_rev->minus_npositions,
4083 					 /*oligobase*/index1part,this_rev->querylength,
4084 					 /*threshold_score*/18);
4085 #endif
4086 
4087       maxexons = 3;
4088       compute_paths(plus_prev_rev,plus_scores_rev,plus_segments_rev,plus_nsegments_rev,/*maxexons*/MAXEXONS,/*plusp*/true);
4089       compute_paths(minus_prev_rev,minus_scores_rev,minus_segments_rev,minus_nsegments_rev,/*maxexons*/MAXEXONS,/*plusp*/false);
4090 
4091 #if 0
4092     }
4093 #endif
4094 
4095     find_best_scores_nonstranded(nthbest,bestscores,plus_scores_fwd,/*plus_nscores*/plus_nsegments_fwd,
4096 				 minus_scores_fwd,/*minus_nscores*/minus_nsegments_fwd,
4097 				 plus_scores_rev,/*plus_nscores*/plus_nsegments_rev,
4098 				 minus_scores_rev,/*minus_nscores*/minus_nsegments_rev,
4099 				 /*maxexons*/3,/*n*/nbest);
4100 
4101     for (k = 1; k <= maxexons; k++) {
4102       if (nthbest[k] < bestscores[k] - SUBOPTIMAL) {
4103 	nthbest[k] = bestscores[k] - SUBOPTIMAL;
4104       }
4105       if (plus_nsegments_fwd > 0) {
4106 	gregionlist = find_good_paths(gregionlist,/*nexons*/k,plus_prev_fwd[k],plus_scores_fwd[k],
4107 				      plus_segments_fwd,plus_nsegments_fwd,/*threshold_score*/nthbest[k],
4108 				      chromosome_iit,this_fwd->querylength,
4109 				      trimstart,trimend,/*plusp*/true,/*genestrand*/+1);
4110       }
4111       if (minus_nsegments_fwd > 0) {
4112 	gregionlist = find_good_paths(gregionlist,/*nexons*/k,minus_prev_fwd[k],minus_scores_fwd[k],
4113 				      minus_segments_fwd,minus_nsegments_fwd,/*threshold_score*/nthbest[k],
4114 				      chromosome_iit,this_fwd->querylength,
4115 				      trimstart,trimend,/*plusp*/false,/*genestrand*/+1);
4116       }
4117       if (plus_nsegments_rev > 0) {
4118 	gregionlist = find_good_paths(gregionlist,/*nexons*/k,plus_prev_rev[k],plus_scores_rev[k],
4119 				      plus_segments_rev,plus_nsegments_rev,/*threshold_score*/nthbest[k],
4120 				      chromosome_iit,this_rev->querylength,
4121 				      trimstart,trimend,/*plusp*/true,/*genestrand*/+2);
4122       }
4123       if (minus_nsegments_rev > 0) {
4124 	gregionlist = find_good_paths(gregionlist,/*nexons*/k,minus_prev_rev[k],minus_scores_rev[k],
4125 				      minus_segments_rev,minus_nsegments_rev,/*threshold_score*/nthbest[k],
4126 				      chromosome_iit,this_rev->querylength,
4127 				      trimstart,trimend,/*plusp*/false,/*genestrand*/+2);
4128       }
4129     }
4130 
4131     if (minus_nsegments_rev > 0) {
4132       for (k = 1; k <= maxexons; k++) {
4133 	FREE(minus_scores_rev[k]);
4134 	FREE(minus_prev_rev[k]);
4135       }
4136     }
4137     if (plus_nsegments_rev > 0) {
4138       for (k = 1; k <= maxexons; k++) {
4139 	FREE(plus_scores_rev[k]);
4140 	FREE(plus_prev_rev[k]);
4141       }
4142     }
4143     if (minus_nsegments_fwd > 0) {
4144       for (k = 1; k <= maxexons; k++) {
4145 	FREE(minus_scores_fwd[k]);
4146 	FREE(minus_prev_fwd[k]);
4147       }
4148     }
4149     if (plus_nsegments_fwd > 0) {
4150       for (k = 1; k <= maxexons; k++) {
4151 	FREE(plus_scores_fwd[k]);
4152 	FREE(plus_prev_fwd[k]);
4153       }
4154     }
4155     FREE(minus_segments_rev);
4156     FREE(plus_segments_rev);
4157     FREE(minus_segments_fwd);
4158     FREE(plus_segments_fwd);
4159 
4160 #if 0
4161   }
4162 #endif
4163 
4164   /* Clean up gregionlist */
4165   debug(printf("Starting extensions\n"));
4166   for (p = gregionlist; p != NULL; p = List_next(p)) {
4167     gregion = (Gregion_T) List_head(p);
4168     if (Gregion_extendedp(gregion) == false) {
4169       /* Need to extend, otherwise we won't align NM_003360 */
4170       if (Gregion_genestrand(gregion) > 0) {
4171 	find_extensions(&extension5,&extension3,this_fwd,gregion,this_fwd->maxtotallen,/*continuousp*/false);
4172 	Gregion_extend(gregion,extension5,extension3,this_fwd->querylength);
4173       } else {
4174 	find_extensions(&extension5,&extension3,this_rev,gregion,this_rev->maxtotallen,/*continuousp*/false);
4175 	Gregion_extend(gregion,extension5,extension3,this_rev->querylength);
4176       }
4177     }
4178   }
4179   debug(printf("Finished with extensions\n"));
4180 
4181   debug(printf("Before filtering, %d regions\n",List_length(gregionlist)));
4182   debug(
4183 	for (p = gregionlist; p != NULL; p = List_next(p)) {
4184 	  gregion = (Gregion_T) List_head(p);
4185 	  Gregion_print(gregion);
4186 	}
4187 	);
4188 
4189 #if 0
4190   /* For nonstranded, want to filter for support */
4191   gregionlist = Gregion_filter_support(gregionlist,BOUNDARY_SUPPORT,PCT_MAX_SUPPORT,DIFF_MAX_SUPPORT);
4192   debug(printf("After filtering for support, %d regions\n",List_length(gregionlist)));
4193 #endif
4194 
4195 #if 0
4196   /* Don't filter for max_regions anymore */
4197   if (List_length(gregionlist) > MAX_GREGIONS_PRE_UNIQUE) {
4198     debug("Too many gregions, so erasing them\n");
4199     for (p = gregionlist; p != NULL; p = List_next(p)) {
4200       gregion = (Gregion_T) List_head(p);
4201       Gregion_free(&gregion);
4202     }
4203     List_free(&gregionlist);
4204     gregionlist = NULL;
4205 
4206   } else {
4207 #endif
4208 
4209 
4210 #ifdef USE_CLEAN
4211     Gregion_filter_clean(gregionlist,nchrs);
4212 #endif
4213 
4214 #if 0
4215     for (p = gregionlist; p != NULL; p = p->rest) {
4216       gregion = (Gregion_T) List_head(p);
4217       Gregion_print(gregion);
4218     }
4219 #endif
4220 
4221     gregionlist = Gregion_filter_unique(gregionlist);
4222     debug0(printf("After filtering for unique, %d regions\n",List_length(gregionlist)));
4223     debug0(
4224 	  for (p = gregionlist; p != NULL; p = List_next(p)) {
4225 	    gregion = (Gregion_T) List_head(p);
4226 	    Gregion_print(gregion);
4227 	  }
4228 	  );
4229 
4230     debug0(
4231 	  if (List_length(gregionlist) > MAX_GREGIONS_POST_UNIQUE) {
4232 	    printf("Too many gregions %d, so taking the top %d\n",List_length(gregionlist),MAX_GREGIONS_POST_UNIQUE);
4233 	  });
4234 
4235     for (p = gregionlist, i = 1; p != NULL && i < MAX_GREGIONS_POST_UNIQUE; p = List_next(p)) {
4236       /* Advance */
4237       i++;
4238     }
4239     if (p != NULL) {
4240       q = List_next(p);
4241       p->rest = (List_T) NULL;
4242       for (p = q; p != NULL; p = List_next(p)) {
4243 	gregion = (Gregion_T) List_head(p);
4244 	Gregion_free(&gregion);
4245       }
4246       List_free(&q);
4247     }
4248 #if 0
4249   }
4250 #endif
4251 
4252   diagnostic->stage1_runtime = Stopwatch_stop(stopwatch);
4253   diagnostic->ngregions = List_length(gregionlist);
4254 
4255   Stage1_free(&this_rev);
4256   Stage1_free(&this_fwd);
4257 
4258   Sequence_free(&queryrc);
4259 
4260   debug0(printf("Stage 1 returning %d new regions\n",List_length(gregionlist)));
4261   debug0(
4262 	 for (p = gregionlist; p != NULL; p = List_next(p)) {
4263 	   gregion = (Gregion_T) List_head(p);
4264 	   Gregion_print(gregion);
4265 	 }
4266 	 );
4267 
4268   return gregionlist;
4269 }
4270 
4271