1 static char rcsid[] = "$Id: pairpool.c 218147 2019-01-16 21:28:41Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "pairpool.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>		/* For memcpy */
10 #include "assert.h"
11 #include "mem.h"
12 #include "comp.h"
13 #include "pairdef.h"
14 #include "intron.h"
15 #include "complement.h"
16 
17 
18 #define CHUNKSIZE 20000
19 #define MICROINTRON_LENGTH 9	/* For Pairpool_add_genomeskip */
20 
21 #ifdef DEBUG
22 #define debug(x) x
23 #else
24 #define debug(x)
25 #endif
26 
27 /* For mechanics of memory allocation and deallocation */
28 #ifdef DEBUG1
29 #define debug1(x) x
30 #else
31 #define debug1(x)
32 #endif
33 
34 /* For popping */
35 #ifdef DEBUG2
36 #define debug2(x) x
37 #else
38 #define debug2(x)
39 #endif
40 
41 /* Getting genomic nt */
42 #ifdef DEBUG8
43 #define debug8(x) x
44 #else
45 #define debug8(x)
46 #endif
47 
48 /* joining ends */
49 #ifdef DEBUG15
50 #define debug15(x) x
51 #else
52 #define debug15(x)
53 #endif
54 
55 /* clean_join */
56 #ifdef DEBUG16
57 #define debug16(x) x
58 #else
59 #define debug16(x)
60 #endif
61 
62 
63 #define T Pairpool_T
64 struct T {
65   int npairs;
66   int pairctr;
67   struct Pair_T *pairptr;
68   List_T pairchunks;
69 
70   int nlistcells;
71   int listcellctr;
72   struct List_T *listcellptr;
73   List_T listcellchunks;
74 };
75 
76 void
Pairpool_free_memory(T this)77 Pairpool_free_memory (T this) {
78   List_T p;
79   struct Pair_T *pairptr;
80   struct List_T *listcellptr;
81 
82   for (p = this->pairchunks; p != NULL; p = List_next(p)) {
83     pairptr = (struct Pair_T *) List_head(p);
84     FREE_KEEP(pairptr);
85   }
86   List_free_keep(&this->pairchunks);
87   for (p = this->listcellchunks; p != NULL; p = List_next(p)) {
88     listcellptr = (struct List_T *) List_head(p);
89     FREE_KEEP(listcellptr);
90   }
91   List_free_keep(&this->listcellchunks);
92 
93   this->npairs = 0;
94   this->pairctr = 0;
95   this->pairchunks = NULL;
96   /* this->pairptr = add_new_pairchunk(this); */
97 
98   this->nlistcells = 0;
99   this->listcellctr = 0;
100   this->listcellchunks = NULL;
101   /* this->listcellptr = add_new_listcellchunk(this); */
102 
103   return;
104 }
105 
106 
107 void
Pairpool_free(T * old)108 Pairpool_free (T *old) {
109   Pairpool_free_memory(*old);
110   FREE_KEEP(*old);
111   return;
112 }
113 
114 
115 void
Pairpool_report_memory(T this)116 Pairpool_report_memory (T this) {
117   printf("Pairpool has %d pairchunks and %d listcellchunks\n",
118 	 List_length(this->pairchunks),List_length(this->listcellchunks));
119   return;
120 }
121 
122 
123 static struct Pair_T *
add_new_pairchunk(T this)124 add_new_pairchunk (T this) {
125   struct Pair_T *chunk;
126 
127   chunk = (struct Pair_T *) MALLOC_KEEP(CHUNKSIZE*sizeof(struct Pair_T));
128   this->pairchunks = List_push_keep(this->pairchunks,(void *) chunk);
129   debug1(printf("Adding a new chunk of pairs.  Ptr for pair %d is %p\n",
130 		this->npairs,chunk));
131 
132   this->npairs += CHUNKSIZE;
133 
134   return chunk;
135 }
136 
137 static struct List_T *
add_new_listcellchunk(T this)138 add_new_listcellchunk (T this) {
139   struct List_T *chunk;
140 
141   chunk = (struct List_T *) MALLOC_KEEP(CHUNKSIZE*sizeof(struct List_T));
142   this->listcellchunks = List_push_keep(this->listcellchunks,(void *) chunk);
143   debug1(printf("Adding a new chunk of listcells.  Ptr for listcell %d is %p\n",
144 	       this->nlistcells,chunk));
145 
146   this->nlistcells += CHUNKSIZE;
147 
148   return chunk;
149 }
150 
151 T
Pairpool_new(void)152 Pairpool_new (void) {
153   T new = (T) MALLOC_KEEP(sizeof(*new));
154 
155   new->npairs = 0;
156   new->pairctr = 0;
157   new->pairchunks = NULL;
158   /* new->pairptr = add_new_pairchunk(new); */
159 
160   new->nlistcells = 0;
161   new->listcellctr = 0;
162   new->listcellchunks = NULL;
163   /* new->listcellptr = add_new_listcellchunk(new); */
164 
165   return new;
166 }
167 
168 void
Pairpool_reset(T this)169 Pairpool_reset (T this) {
170   this->pairctr = 0;
171   this->listcellctr = 0;
172   return;
173 }
174 
175 /* gapp should be false for the following comps: MATCH_COMP,
176    DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, MISMATCH_COMP, INDEL_COMP,
177    SHORTGAP_COMP */
178 
179 List_T
Pairpool_push(List_T list,T this,int querypos,int genomepos,char cdna,char comp,char genome,char genomealt,int dynprogindex)180 Pairpool_push (List_T list, T this, int querypos, int genomepos, char cdna, char comp,
181 	       char genome, char genomealt, int dynprogindex) {
182   List_T listcell;
183   Pair_T pair;
184   List_T p;
185   int n;
186 
187   /* assert(querypos >= 0); */
188   /* assert(genomepos >= 0); */
189 
190   if (querypos < 0 || genomepos < 0) {
191     return list;
192   }
193 
194   if (this->pairctr >= this->npairs) {
195     this->pairptr = add_new_pairchunk(this);
196   } else if ((this->pairctr % CHUNKSIZE) == 0) {
197     for (n = this->npairs - CHUNKSIZE, p = this->pairchunks;
198 	 n > this->pairctr; p = p->rest, n -= CHUNKSIZE) ;
199     this->pairptr = (struct Pair_T *) p->first;
200     debug1(printf("Located pair %d at %p\n",this->pairctr,this->pairptr));
201   }
202   pair = this->pairptr++;
203   this->pairctr++;
204 
205   pair->querypos = querypos;
206   pair->genomepos = genomepos;
207   pair->aapos = 0;
208   pair->aaphase_g = -1;
209   pair->aaphase_e = -1;
210   pair->cdna = cdna;
211   pair->comp = comp;
212   pair->genome = genome;
213   pair->genomealt = genomealt;
214   pair->dynprogindex = dynprogindex;
215 
216   pair->aa_g = ' ';
217   pair->aa_e = ' ';
218   pair->shortexonp = false;
219   pair->gapp = false;
220   pair->knowngapp = false;
221   pair->introntype = NONINTRON;
222   if (comp == EXTRAEXON_COMP) {
223     pair->extraexonp = true;
224   } else {
225     pair->extraexonp = false;
226   }
227 
228   pair->queryjump = 0;
229   pair->genomejump = 0;
230 
231   pair->state = GOOD;
232   pair->protectedp = false;
233   pair->disallowedp = false;
234   pair->donor_prob = 0.0;
235   pair->acceptor_prob = 0.0;
236   pair->end_intron_p = false;
237 
238   debug(
239 	printf("Creating %p: %d %d %c %c %c\n",
240 	       pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
241 	);
242 
243   if (this->listcellctr >= this->nlistcells) {
244     this->listcellptr = add_new_listcellchunk(this);
245   } else if ((this->listcellctr % CHUNKSIZE) == 0) {
246     for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
247 	 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
248     this->listcellptr = (struct List_T *) p->first;
249     debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
250   }
251   listcell = this->listcellptr++;
252   this->listcellctr++;
253 
254   listcell->first = (void *) pair;
255   listcell->rest = list;
256 
257   return listcell;
258 }
259 
260 
261 List_T
Pairpool_push_copy(List_T list,T this,Pair_T orig)262 Pairpool_push_copy (List_T list, T this, Pair_T orig) {
263   List_T listcell;
264   Pair_T pair;
265   List_T p;
266   int n;
267 
268   if (this->pairctr >= this->npairs) {
269     this->pairptr = add_new_pairchunk(this);
270   } else if ((this->pairctr % CHUNKSIZE) == 0) {
271     for (n = this->npairs - CHUNKSIZE, p = this->pairchunks;
272 	 n > this->pairctr; p = p->rest, n -= CHUNKSIZE) ;
273     this->pairptr = (struct Pair_T *) p->first;
274     debug1(printf("Located pair %d at %p\n",this->pairctr,this->pairptr));
275   }
276   pair = this->pairptr++;
277   this->pairctr++;
278 
279   memcpy(pair,orig,sizeof(struct Pair_T));
280 
281   debug(
282 	printf("Copying %p <= %p: %d %d %c %c %c\n",
283 	       pair,orig,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
284 	);
285 
286   if (this->listcellctr >= this->nlistcells) {
287     this->listcellptr = add_new_listcellchunk(this);
288   } else if ((this->listcellctr % CHUNKSIZE) == 0) {
289     for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
290 	 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
291     this->listcellptr = (struct List_T *) p->first;
292     debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
293   }
294   listcell = this->listcellptr++;
295   this->listcellctr++;
296 
297   listcell->first = (void *) pair;
298   listcell->rest = list;
299 
300   return listcell;
301 }
302 
303 
304 List_T
Pairpool_push_gapalign(List_T list,T this,int querypos,int genomepos,char cdna,char comp,int introntype,char genome,char genomealt,bool extraexonp)305 Pairpool_push_gapalign (List_T list, T this, int querypos, int genomepos, char cdna, char comp,
306 			int introntype, char genome, char genomealt, bool extraexonp) {
307   List_T listcell;
308   Pair_T pair;
309   List_T p;
310   int n;
311 
312   if (this->pairctr >= this->npairs) {
313     this->pairptr = add_new_pairchunk(this);
314   } else if ((this->pairctr % CHUNKSIZE) == 0) {
315     for (n = this->npairs - CHUNKSIZE, p = this->pairchunks;
316 	 n > this->pairctr; p = p->rest, n -= CHUNKSIZE) ;
317     this->pairptr = (struct Pair_T *) p->first;
318     debug1(printf("Located pair %d at %p\n",this->pairctr,this->pairptr));
319   }
320   pair = this->pairptr++;
321   this->pairctr++;
322 
323   pair->querypos = querypos;
324   pair->genomepos = genomepos;
325   pair->aapos = 0;
326   pair->aaphase_g = -1;
327   pair->aaphase_e = -1;
328   pair->cdna = cdna;
329   pair->comp = comp;
330   pair->genome = genome;
331   pair->genomealt = genomealt;
332   pair->dynprogindex = 0;
333 
334   pair->aa_g = ' ';
335   pair->aa_e = ' ';
336   pair->shortexonp = false;
337   pair->gapp = true;
338   pair->knowngapp = false;
339   pair->introntype = introntype;
340   pair->extraexonp = extraexonp;
341 
342   pair->queryjump = 0;
343   pair->genomejump = 0;
344 
345   pair->state = GOOD;
346   pair->protectedp = false;
347   pair->disallowedp = false;
348   pair->donor_prob = 0.0;
349   pair->acceptor_prob = 0.0;
350   pair->end_intron_p = false;
351 
352   debug(
353 	printf("Creating %p: %d %d %c %c %c introntype %d\n",
354 	       pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome,pair->introntype);
355 	);
356 
357 	if (this->listcellctr >= this->nlistcells) {
358     this->listcellptr = add_new_listcellchunk(this);
359   } else if ((this->listcellctr % CHUNKSIZE) == 0) {
360     for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
361 	 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
362     this->listcellptr = (struct List_T *) p->first;
363     debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
364   }
365   listcell = this->listcellptr++;
366   this->listcellctr++;
367 
368   listcell->first = (void *) pair;
369   listcell->rest = list;
370 
371   return listcell;
372 }
373 
374 List_T
Pairpool_push_gapholder(List_T list,T this,int queryjump,int genomejump,Pair_T leftpair,Pair_T rightpair,bool knownp)375 Pairpool_push_gapholder (List_T list, T this, int queryjump, int genomejump,
376 			 Pair_T leftpair, Pair_T rightpair, bool knownp) {
377   List_T listcell;
378   Pair_T pair;
379   List_T p;
380   int n;
381 
382   if (this->pairctr >= this->npairs) {
383     this->pairptr = add_new_pairchunk(this);
384   } else if ((this->pairctr % CHUNKSIZE) == 0) {
385     for (n = this->npairs - CHUNKSIZE, p = this->pairchunks;
386 	 n > this->pairctr; p = p->rest, n -= CHUNKSIZE) ;
387     this->pairptr = (struct Pair_T *) p->first;
388     debug1(printf("Located pair %d at %p\n",this->pairctr,this->pairptr));
389   }
390   pair = this->pairptr++;
391   this->pairctr++;
392 
393   pair->querypos = -1;
394   pair->genomepos = -1;
395 
396   pair->aapos = 0;
397   pair->aaphase_g = -1;
398   pair->aaphase_e = -1;
399   pair->cdna = ' ';
400   pair->comp = ' ';
401   pair->genome = ' ';
402   pair->genomealt = ' ';
403   pair->dynprogindex = 0;
404 
405   pair->aa_g = ' ';
406   pair->aa_e = ' ';
407   pair->shortexonp = false;
408   pair->gapp = true;
409   if (knownp == true) {
410     pair->knowngapp = true;
411     pair->donor_prob = 2.0;
412     pair->acceptor_prob = 2.0;
413   } else {
414     pair->knowngapp = false;
415     pair->donor_prob = 0.0;
416     pair->acceptor_prob = 0.0;
417   }
418   pair->introntype = NONINTRON;
419   pair->extraexonp = false;
420 
421   if (leftpair && rightpair) {
422     queryjump = rightpair->querypos - leftpair->querypos - 1;
423     genomejump = rightpair->genomepos - leftpair->genomepos - 1;
424     if (leftpair->cdna == ' ') queryjump++;
425     if (leftpair->genome == ' ') genomejump++;
426   }
427 
428   pair->queryjump = queryjump;
429   pair->genomejump = genomejump;
430 
431   pair->state = GOOD;
432   pair->protectedp = false;
433   pair->disallowedp = false;
434   pair->end_intron_p = false;
435 
436   debug(printf("Creating gap %p, queryjump=%d, genomejump=%d\n",pair,queryjump,genomejump));
437 
438   if (this->listcellctr >= this->nlistcells) {
439     this->listcellptr = add_new_listcellchunk(this);
440   } else if ((this->listcellctr % CHUNKSIZE) == 0) {
441     for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
442 	 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
443     this->listcellptr = (struct List_T *) p->first;
444     debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
445   }
446   listcell = this->listcellptr++;
447   this->listcellctr++;
448 
449   listcell->first = (void *) pair;
450   listcell->rest = list;
451 
452   return listcell;
453 }
454 
455 List_T
Pairpool_push_existing(List_T list,T this,Pair_T pair)456 Pairpool_push_existing (List_T list, T this, Pair_T pair) {
457   List_T listcell;
458   List_T p;
459   int n;
460 
461   debug(
462 	Pair_T head;
463 	if (pair->gapp == true) {
464 	  printf("Pushing gap %p: queryjump=%d, genomejump=%d onto ",
465 		 pair,pair->queryjump,pair->genomejump);
466 	} else {
467 	  printf("Pushing %p: %d %d %c %c %c onto ",
468 		 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
469 	}
470 	if (list == NULL) {
471 	  printf("NULL\n");
472 	} else {
473 	  head = list->first;
474 	  if (head->gapp == true) {
475 	    printf("gap %p: queryjump=%d, genomejump=%d\n",
476 		   head,head->queryjump,head->genomejump);
477 	  } else {
478 	    printf("%p: %d %d %c %c %c\n",
479 		   head,head->querypos,head->genomepos,head->cdna,head->comp,head->genome);
480 	  }
481 	}
482 	);
483 
484   if (this->listcellctr >= this->nlistcells) {
485     this->listcellptr = add_new_listcellchunk(this);
486   } else if ((this->listcellctr % CHUNKSIZE) == 0) {
487     for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
488 	 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
489     this->listcellptr = (struct List_T *) p->first;
490     debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
491   }
492   listcell = this->listcellptr++;
493   this->listcellctr++;
494 
495   listcell->first = (void *) pair;
496   listcell->rest = list;
497 
498   return listcell;
499 }
500 
501 
502 /* Note: this does not free the list cell */
503 List_T
Pairpool_pop(List_T list,Pair_T * x)504 Pairpool_pop (List_T list, Pair_T *x) {
505   List_T head;
506 
507   if (list != NULL) {
508     head = list->rest;
509     *x = (Pair_T) list->first;
510     debug2(
511 	   if ((*x)->gapp == true) {
512 	     printf("Popping gap: queryjump=%d, genomejump=%d\n",
513 		    (*x)->queryjump,(*x)->genomejump);
514 	   } else {
515 	     printf("Popping: %d %d %c %c %c\n",
516 		    (*x)->querypos,(*x)->genomepos,(*x)->cdna,(*x)->comp,(*x)->genome);
517 	   }
518 	   );
519     return head;
520   } else {
521     return list;
522   }
523 }
524 
525 
526 List_T
Pairpool_transfer(List_T dest,List_T source)527 Pairpool_transfer (List_T dest, List_T source) {
528   List_T p, next;
529 #ifdef DEBUG
530   Pair_T pair;
531 #endif
532 
533   for (p = source; p != NULL; p = next) {
534     debug(
535 	  pair = List_head(p);
536 	  if (pair->cdna == '\0' || pair->genome == '\0') {
537 	    abort();
538 	  }
539 	  if (pair->gapp) {
540 	    printf("Transferring gap %p: queryjump=%d, genomejump=%d\n",
541 		   pair,pair->queryjump,pair->genomejump);
542 	  } else {
543 	    printf("Transferring %p: %d %d %c %c %c\n",
544 		   pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
545 	  }
546 	  );
547     next = p->rest;
548     p->rest = dest;
549     dest = p;
550   }
551   return dest;
552 }
553 
554 List_T
Pairpool_transfer_n(List_T dest,List_T source,int n)555 Pairpool_transfer_n (List_T dest, List_T source, int n) {
556   List_T p, next;
557 #ifdef DEBUG
558   Pair_T pair;
559 #endif
560 
561   for (p = source; p != NULL && --n >= 0; p = next) {
562     debug(
563 	  pair = List_head(p);
564 	  if (pair->cdna == '\0' || pair->genome == '\0') {
565 	    abort();
566 	  }
567 	  if (pair->gapp) {
568 	    printf("Transferring gap %p: queryjump=%d, genomejump=%d\n",
569 		   pair,pair->queryjump,pair->genomejump);
570 	  } else {
571 	    printf("Transferring %p: %d %d %c %c %c\n",
572 		   pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
573 	  }
574 	  );
575     next = p->rest;
576     p->rest = dest;
577     dest = p;
578   }
579   return dest;
580 }
581 
582 int
Pairpool_count_bounded(int * nstart,List_T source,int minpos,int maxpos)583 Pairpool_count_bounded (int *nstart, List_T source, int minpos, int maxpos) {
584   int npairs = 0;
585   List_T p, next;
586   Pair_T pair;
587 
588   *nstart = 0;
589   for (p = source; p != NULL; p = next) {
590     pair = List_head(p);
591     next = p->rest;
592     if (pair->querypos < minpos) {
593       *nstart += 1;
594     } else if (pair->querypos < maxpos) {
595       npairs++;
596     } else {
597       p = NULL;			/* Terminate transfer */
598     }
599   }
600   return npairs;
601 }
602 
603 
604 #if 0
605 List_T
606 Pairpool_transfer_bounded (List_T dest, List_T source, int minpos, int maxpos) {
607   List_T p, next;
608   Pair_T pair;
609 
610   for (p = source; p != NULL; p = next) {
611     debug(
612 	  pair = (Pair_T) List_head(p);
613 	  if (pair->cdna == '\0' || pair->genome == '\0') {
614 	    abort();
615 	  }
616 	  printf("Transferring %p: %d %d %c %c %c\n",
617 		 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
618 	  );
619     pair = (Pair_T) List_head(p);
620     next = p->rest;
621     if (pair->querypos == minpos) {
622       if (dest != NULL) {
623 	/* Pop last querypos off the stack, because we want only one of them */
624 	dest = dest->rest;
625       }
626       p->rest = dest;
627       dest = p;
628     } else if (pair->querypos == maxpos) {
629       p->rest = dest;
630       dest = p;
631       p = NULL;			/* Terminate transfer */
632     } else if (pair->querypos > minpos && pair->querypos < maxpos) {
633       p->rest = dest;
634       dest = p;
635     }
636   }
637 
638   return dest;
639 }
640 #endif
641 
642 
643 /* Originally prohibited copying of gaps */
644 List_T
Pairpool_copy(List_T source,T this)645 Pairpool_copy (List_T source, T this) {
646   List_T dest = NULL;
647 
648   while (source != NULL) {
649     dest = Pairpool_push_copy(dest,this,/*orig*/source->first);
650     source = source->rest;
651   }
652   return List_reverse(dest);
653 }
654 
655 
656 #if 0
657 /* Problem: Removes gaps.  Better to use clean_path_end3_chromosomal_bounds instead */
658 /* Note: Do not provide actual chrlength, but rather chrhigh - chroffset, so circular chromosomes work */
659 List_T
660 Pairpool_copy_bounded (List_T source, T this, Chrpos_T chrlength) {
661   List_T dest = NULL;
662   Pair_T pair;
663 
664   while (source != NULL) {
665     pair = (Pair_T) source->first;
666     if (pair->genomepos >= chrlength) {
667       /* Skip */
668     } else {
669       dest = Pairpool_push_copy(dest,this,pair);
670     }
671     source = source->rest;
672   }
673   return List_reverse(dest);
674 }
675 #endif
676 
677 
678 struct Pair_T *
Pairpool_copy_array(struct Pair_T * source,int npairs)679 Pairpool_copy_array (struct Pair_T *source, int npairs) {
680   struct Pair_T *dest;
681 
682   dest = (struct Pair_T *) MALLOC_OUT(npairs * sizeof(struct Pair_T));
683   memcpy(dest,source,npairs*sizeof(struct Pair_T));
684   return dest;
685 }
686 
687 
688 void
Pairpool_clean_join(List_T * left_path,List_T * right_pairs)689 Pairpool_clean_join (List_T *left_path, List_T *right_pairs) {
690   Pair_T leftpair, rightpair;
691   int queryjump, genomejump;
692 
693 
694   debug16(printf("Entered clean_join\n"));
695   debug16(printf("left path:\n"));
696   debug16(Pair_dump_list(*left_path,true));
697   debug16(printf("right pairs:\n"));
698   debug16(Pair_dump_list(*right_pairs,true));
699 
700 
701   while (*left_path != NULL && ((Pair_T) (*left_path)->first)->gapp == true) {
702     debug16(printf("Clearing gap on left\n"));
703     *left_path = Pairpool_pop(*left_path,&leftpair);
704   }
705   while (*right_pairs != NULL && ((Pair_T) (*right_pairs)->first)->gapp == true) {
706     debug16(printf("Clearing gap on right\n"));
707     *right_pairs = Pairpool_pop(*right_pairs,&rightpair);
708   }
709 
710   if (*left_path != NULL && *right_pairs != NULL) {
711     leftpair = (Pair_T) (*left_path)->first;
712     rightpair = (Pair_T) (*right_pairs)->first;
713     queryjump = rightpair->querypos - leftpair->querypos - 1;
714     genomejump = rightpair->genomepos - leftpair->genomepos - 1;
715     debug16(printf("queryjump %d, genomejump %d\n",queryjump,genomejump));
716 
717     /* Fix overlap */
718     while (*left_path != NULL && *right_pairs != NULL && (queryjump < 0 || genomejump < 0)) {
719       while (*left_path != NULL && ((Pair_T) (*left_path)->first)->gapp == true) {
720 	debug16(printf("Clearing gap on left\n"));
721 	*left_path = Pairpool_pop(*left_path,&leftpair);
722       }
723       while (*right_pairs != NULL && ((Pair_T) (*right_pairs)->first)->gapp == true) {
724 	debug16(printf("Clearing gap on right\n"));
725 	*right_pairs = Pairpool_pop(*right_pairs,&rightpair);
726       }
727       *left_path = Pairpool_pop(*left_path,&leftpair);
728       *right_pairs = Pairpool_pop(*right_pairs,&rightpair);
729       queryjump = rightpair->querypos - leftpair->querypos - 1;
730       genomejump = rightpair->genomepos - leftpair->genomepos - 1;
731       debug16(printf("Revising queryjump to be %d = %d - %d - 1\n",queryjump,rightpair->querypos,leftpair->querypos));
732     }
733 
734     while (*left_path != NULL && ((Pair_T) (*left_path)->first)->gapp == true) {
735       debug16(printf("Clearing gap on left\n"));
736       *left_path = Pairpool_pop(*left_path,&leftpair);
737     }
738     while (*right_pairs != NULL && ((Pair_T) (*right_pairs)->first)->gapp == true) {
739       debug16(printf("Clearing gap on right\n"));
740       *right_pairs = Pairpool_pop(*right_pairs,&rightpair);
741     }
742   }
743 
744   return;
745 }
746 
747 
748 List_T
Pairpool_remove_gapholders(List_T pairs)749 Pairpool_remove_gapholders (List_T pairs) {
750   List_T path = NULL;
751   Pair_T pair;
752 
753   while (pairs != NULL) {
754     /* pairptr = pairs; */
755     /* pairs = Pairpool_pop(pairs,&pair); */
756     pair = (Pair_T) pairs->first;
757     if (pair->gapp == true) {
758       debug(printf("Removing a gap with queryjump = %d, genomejump = %d\n",
759 		   pair->queryjump,pair->genomejump));
760       pairs = Pairpool_pop(pairs,&pair);
761     } else {
762 #ifdef WASTE
763       path = Pairpool_push_existing(path,pairpool,pair);
764 #else
765       path = List_transfer_one(path,&pairs);
766 #endif
767     }
768   }
769 
770   return List_reverse(path);
771 }
772 
773 
774 #ifdef CHECK_ASSERTIONS
775 static List_T
check_for_gapholders(List_T pairs)776 check_for_gapholders (List_T pairs) {
777   List_T path = NULL;
778   Pair_T pair;
779 
780   while (pairs != NULL) {
781     /* pairptr = pairs; */
782     /* pairs = Pairpool_pop(pairs,&pair); */
783     pair = (Pair_T) pairs->first;
784     if (pair->gapp == true) {
785       abort();
786     } else {
787 #ifdef WASTE
788       path = Pairpool_push_existing(path,pairpool,pair);
789 #else
790       path = List_transfer_one(path,&pairs);
791 #endif
792     }
793   }
794 
795   return List_reverse(path);
796 }
797 #endif
798 
799 
800 List_T
Pairpool_join_end3(List_T path_orig,List_T end3_pairs_orig,Pairpool_T pairpool,bool copy_end_p)801 Pairpool_join_end3 (List_T path_orig, List_T end3_pairs_orig, Pairpool_T pairpool,
802 		    bool copy_end_p) {
803   List_T path, end3_pairs;
804   Pair_T pair, leftpair;
805   int queryjump = -1, genomejump = -1;
806 
807 #ifdef CHECK_ASSERTIONS
808   path_orig = check_for_gapholders(path_orig);
809   end3_pairs_orig = check_for_gapholders(end3_pairs_orig);
810 #endif
811 
812   path = Pairpool_copy(path_orig,pairpool);
813   if (copy_end_p == true) {
814     end3_pairs = Pairpool_copy(end3_pairs_orig,pairpool);
815   } else {
816     end3_pairs = end3_pairs_orig;
817   }
818 
819   if (path == NULL && end3_pairs == NULL) {
820     return (List_T) NULL;
821   } else if (path == NULL) {
822     return List_reverse(end3_pairs);
823   } else if (end3_pairs == NULL) {
824     return path;
825   }
826 
827   debug15(printf("Entered join_end3\n"));
828   debug15(printf("path:\n"));
829   debug15(Pair_dump_list(path,true));
830   debug15(printf("end3_pairs:\n"));
831   debug15(Pair_dump_list(end3_pairs,true));
832 
833 
834   leftpair = (Pair_T) path->first;
835   pair = (Pair_T) end3_pairs->first;
836   queryjump = pair->querypos - leftpair->querypos - 1;
837   genomejump = pair->genomepos - leftpair->genomepos - 1;
838   debug15(printf("queryjump %d, genomejump %d\n",queryjump,genomejump));
839 
840   if (queryjump == 0 && genomejump == 0) {
841     /* Do nothing, although this is unexpected */
842   } else if (queryjump >= 0 && genomejump >= 0) {
843     /* Insert a gapholder */
844     path = Pairpool_push_gapholder(path,pairpool,queryjump,genomejump,
845 				   /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
846   } else {
847     /* Fix overlap */
848     while (path != NULL && end3_pairs != NULL && (queryjump < 0 || genomejump < 0)) {
849       pair = (Pair_T) end3_pairs->first;
850 
851       if (path != NULL) {
852 	path = Pairpool_pop(path,&leftpair);
853       }
854       if (end3_pairs != NULL) {
855 	end3_pairs = Pairpool_pop(end3_pairs,&pair);
856       }
857       queryjump = pair->querypos - leftpair->querypos - 1;
858       genomejump = pair->genomepos - leftpair->genomepos - 1;
859       debug15(printf("Revising queryjump to be %d = %d - %d - 1\n",queryjump,pair->querypos,leftpair->querypos));
860     }
861 
862     path = Pairpool_push_existing(path,pairpool,leftpair);
863     if (queryjump < 0 || genomejump < 0) {
864       /* Discard pair and all remaining pairs from end */
865       debug15(printf("Not possible to make end work, since queryjump = %d and genomejump = %d\n",
866 		     queryjump,genomejump));
867       end3_pairs = (List_T) NULL;
868     } else if (queryjump == 0 && genomejump == 0) {
869       /* No gapholder needed */
870       path = Pairpool_push_existing(path,pairpool,pair);
871     } else {
872       path = Pairpool_push_gapholder(path,pairpool,queryjump,genomejump,
873 				     /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
874       path = Pairpool_push_existing(path,pairpool,pair);
875     }
876   }
877 
878   while (end3_pairs != NULL) {
879     path = List_transfer_one(path,&end3_pairs);
880   }
881 
882   debug15(printf("joined path:\n"));
883   debug15(Pair_dump_list(path,true));
884   debug15(printf("\n"));
885 
886   return path;
887 }
888 
889 
890 List_T
Pairpool_join_end5(List_T pairs_orig,List_T end5_path_orig,Pairpool_T pairpool,bool copy_end_p)891 Pairpool_join_end5 (List_T pairs_orig, List_T end5_path_orig, Pairpool_T pairpool,
892 		    bool copy_end_p) {
893   List_T pairs, end5_path;
894   Pair_T pair, rightpair;
895   int queryjump = -1, genomejump = -1;
896 
897 #ifdef CHECK_ASSERTIONS
898   pairs_orig = check_for_gapholders(pairs_orig);
899   end5_path_orig = check_for_gapholders(end5_path_orig);
900 #endif
901 
902   pairs = Pairpool_copy(pairs_orig,pairpool);
903   if (copy_end_p == true) {
904     end5_path = Pairpool_copy(end5_path_orig,pairpool);
905   } else {
906     end5_path = end5_path_orig;
907   }
908 
909   if (pairs == NULL && end5_path == NULL) {
910     return (List_T) NULL;
911   } else if (pairs == NULL) {
912     return List_reverse(end5_path);
913   } else if (end5_path == NULL) {
914     return pairs;
915   }
916 
917   debug15(printf("Entered join_end5\n"));
918   debug15(printf("pairs:\n"));
919   debug15(Pair_dump_list(pairs,true));
920   debug15(printf("end5_path:\n"));
921   debug15(Pair_dump_list(end5_path,true));
922 
923 
924   rightpair = (Pair_T) pairs->first;
925   pair = (Pair_T) end5_path->first;
926   queryjump = rightpair->querypos - pair->querypos - 1;
927   genomejump = rightpair->genomepos - pair->genomepos - 1;
928   debug15(printf("queryjump %d, genomejump %d\n",queryjump,genomejump));
929 
930   if (queryjump == 0 && genomejump == 0) {
931     /* Do nothing, although this is unexpected */
932   } else if (queryjump >= 0 && genomejump >= 0) {
933     /* Insert a gapholder */
934     pairs = Pairpool_push_gapholder(pairs,pairpool,queryjump,genomejump,
935 				    /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
936   } else {
937     /* Fix overlap */
938     while (pairs != NULL && end5_path != NULL && (queryjump < 0 || genomejump < 0)) {
939       pair = (Pair_T) end5_path->first;
940 
941       if (pairs != NULL) {
942 	pairs = Pairpool_pop(pairs,&rightpair);
943       }
944       if (end5_path != NULL) {
945 	end5_path = Pairpool_pop(end5_path,&pair);
946       }
947       queryjump = rightpair->querypos - pair->querypos - 1;
948       genomejump = rightpair->genomepos - pair->genomepos - 1;
949       debug15(printf("Revising queryjump to be %d = %d - %d - 1\n",queryjump,pair->querypos,rightpair->querypos));
950     }
951 
952     pairs = Pairpool_push_existing(pairs,pairpool,rightpair);
953     if (queryjump < 0 || genomejump < 0) {
954       /* Discard pair and all remaining pairs from end */
955       debug15(printf("Not possible to make end work, since queryjump = %d and genomejump = %d\n",
956 		     queryjump,genomejump));
957       end5_path = (List_T) NULL;
958     } else if (queryjump == 0 && genomejump == 0) {
959       /* No gapholder needed */
960       pairs = Pairpool_push_existing(pairs,pairpool,pair);
961     } else {
962       pairs = Pairpool_push_gapholder(pairs,pairpool,queryjump,genomejump,
963 				      /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
964       pairs = Pairpool_push_existing(pairs,pairpool,pair);
965     }
966   }
967 
968   while (end5_path != NULL) {
969     pairs = List_transfer_one(pairs,&end5_path);
970   }
971 
972   debug15(printf("joined pairs:\n"));
973   debug15(Pair_dump_list(pairs,true));
974   debug15(printf("\n"));
975 
976   return pairs;
977 }
978 
979 
980 List_T
Pairpool_add_queryskip(List_T pairs,int r,int c,int dist,char * querysequence,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,int dynprogindex)981 Pairpool_add_queryskip (List_T pairs, int r, int c, int dist, char *querysequence,
982 			int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp, int dynprogindex) {
983   int j;
984   char c1;
985   int querycoord, genomecoord, step;
986 
987   querycoord = r-1;
988   genomecoord = c-1;
989 
990   if (revp == true) {
991     querycoord = -querycoord;
992     genomecoord = -genomecoord;
993     step = +1;
994   } else {
995     /* Advance to next genomepos */
996     /* genomecoord++;  -- No, this leads to bugs */
997     step = -1;
998   }
999   debug(printf("Entered Pairpool_add_genomeskip with r %d, c %d, revp %d => querycoord %d\n",r,c,revp,querycoord));
1000 
1001   for (j = 0; j < dist; j++) {
1002     c1 = querysequence[querycoord];
1003     debug(printf("Pushing %d,%d [%d,%d] (%c,-), ",r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1));
1004     pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
1005 			  c1,INDEL_COMP,/*genome*/' ',/*genomealt*/' ',dynprogindex);
1006     debug(r--);
1007     querycoord += step;
1008   }
1009 
1010   return pairs;
1011 }
1012 
1013 
1014 static char complCode[128] = COMPLEMENT_LC;
1015 
1016 static char
get_genomic_nt(char * g_alt,int genomicpos,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)1017 get_genomic_nt (char *g_alt, int genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
1018 		bool watsonp) {
1019   char c2, c2_alt;
1020   Univcoord_T pos;
1021 
1022 #if 0
1023   /* If the read has a deletion, then we will extend beyond 0 or genomiclength, so do not restrict. */
1024   if (genomicpos < 0) {
1025     return '*';
1026 
1027   } else if (genomicpos >= genomiclength) {
1028     return '*';
1029 
1030   }
1031 #endif
1032 
1033   if (watsonp) {
1034     if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
1035       *g_alt = '*';
1036       return '*';
1037 
1038     } else if (pos >= chrhigh) {
1039       *g_alt = '*';
1040       return '*';
1041 
1042 #if 0
1043     } else if (genome) {
1044       /* Not necessary, because Genome_get_char_blocks should work */
1045       debug8(printf("At %u, genomicnt is %c\n",
1046 		    genomicpos,Genome_get_char(genome,pos)));
1047       return Genome_get_char(genome,pos);
1048 #endif
1049 
1050     } else {
1051       /* GMAP with user-supplied genomic segment */
1052       debug8(printf("At %u, genomicnt is %c\n",
1053 		    genomicpos,Genome_get_char_blocks(&(*g_alt),pos)));
1054       return Genome_get_char_blocks(&(*g_alt),pos);
1055     }
1056 
1057   } else {
1058     if ((pos = chrhigh - genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
1059       *g_alt = '*';
1060       return '*';
1061 
1062     } else if (pos >= chrhigh) {
1063       *g_alt = '*';
1064       return '*';
1065 
1066 #if 0
1067     } else if (genome) {
1068       /* Not necessary, because Genome_get_char_blocks should work */
1069       c2 = Genome_get_char(genome,pos);
1070 #endif
1071 
1072     } else {
1073       /* GMAP with user-supplied genomic segment */
1074       c2 = Genome_get_char_blocks(&c2_alt,pos);
1075     }
1076     debug8(printf("At %u, genomicnt is %c\n",genomicpos,complCode[(int) c2]));
1077     *g_alt = complCode[(int) c2_alt];
1078     return complCode[(int) c2];
1079   }
1080 }
1081 
1082 
1083 List_T
Pairpool_add_genomeskip(bool * add_dashes_p,List_T pairs,int r,int c,int dist,char * genomesequence,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int dynprogindex)1084 Pairpool_add_genomeskip (bool *add_dashes_p, List_T pairs, int r, int c, int dist,
1085 			 char *genomesequence, int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
1086 			 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int dynprogindex) {
1087   int j;
1088 #ifdef DEBUG
1089   char left1, left2, right2, right1;
1090   char left1_alt, left2_alt, right2_alt, right1_alt;
1091   /* int introntype; */
1092 #endif
1093   char c2, c2_alt;
1094   int querycoord, leftgenomecoord, rightgenomecoord, genomecoord, temp, step;
1095 
1096   querycoord = r-1;
1097   leftgenomecoord = c-dist;
1098   rightgenomecoord = c-1;
1099 
1100   if (revp == true) {
1101     querycoord = -querycoord;
1102     temp = leftgenomecoord;
1103     leftgenomecoord = -rightgenomecoord;
1104     rightgenomecoord = -temp;
1105     step = +1;
1106   } else {
1107     /* Advance to next querypos */
1108     /* querycoord++; -- No, this leads to bugs */
1109     step = -1;
1110   }
1111   debug(printf("Entered Pairpool_add_genomeskip with r %d, c %d, revp %d => querycoord %d\n",r,c,revp,querycoord));
1112 
1113   if (dist < MICROINTRON_LENGTH) {
1114     *add_dashes_p = true;
1115   } else {
1116 #ifdef DEBUG
1117     /* Check for intron */
1118     if (genomesequence != NULL) {
1119       left1 = left1_alt = genomesequence[leftgenomecoord];
1120       left2 = left2_alt = genomesequence[leftgenomecoord+1];
1121       right2 = right2_alt = genomesequence[rightgenomecoord-1];
1122       right1 = right1_alt = genomesequence[rightgenomecoord];
1123     } else {
1124       left1 = get_genomic_nt(&left1_alt,genomeoffset+leftgenomecoord,chroffset,chrhigh,watsonp);
1125       left2 = get_genomic_nt(&left2_alt,genomeoffset+leftgenomecoord+1,chroffset,chrhigh,watsonp);
1126       right2 = get_genomic_nt(&right2_alt,genomeoffset+rightgenomecoord-1,chroffset,chrhigh,watsonp);
1127       right1 = get_genomic_nt(&right1_alt,genomeoffset+rightgenomecoord,chroffset,chrhigh,watsonp);
1128     }
1129 #endif
1130 
1131 #if 0
1132 #ifdef PMAP
1133     introntype = Intron_type(left1,left2,right2,right1,
1134 			     left1_alt,left2_alt,right2_alt,right1_alt,
1135 			     /*cdna_direction*/+1);
1136 #else
1137     introntype = Intron_type(left1,left2,right2,right1,
1138 			     left1_alt,left2_alt,right2_alt,right1_alt,
1139 			     cdna_direction);
1140 #endif
1141 #endif
1142 
1143 #if 0
1144     if (introntype == NONINTRON) {
1145       *add_dashes_p = true;
1146     } else {
1147       *add_dashes_p = false;
1148     }
1149 #endif
1150     *add_dashes_p = false;
1151   }
1152 
1153   if (*add_dashes_p == true) {
1154     if (revp == true) {
1155       genomecoord = leftgenomecoord;
1156     } else {
1157       genomecoord = rightgenomecoord;
1158     }
1159 
1160     if (genomesequence != NULL) {
1161       for (j = 0; j < dist; j++) {
1162 	c2 = c2_alt = genomesequence[genomecoord];
1163 
1164 	debug(printf("Pushing %d,%d [%d,%d] (-,%c),",r,c,queryoffset+querycoord,genomeoffset+genomecoord,c2));
1165 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
1166 			      ' ',INDEL_COMP,c2,c2_alt,dynprogindex);
1167 	debug(c--);
1168 	genomecoord += step;
1169       }
1170 
1171     } else {
1172       for (j = 0; j < dist; j++) {
1173 	c2 = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
1174 #ifdef EXTRACT_GENOMICSEG
1175 	assert(c2 == genomesequence[genomecoord]);
1176 #endif
1177 
1178 	debug(printf("Pushing %d,%d [%d,%d] (-,%c),",r,c,queryoffset+querycoord,genomeoffset+genomecoord,c2));
1179 	pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
1180 			      ' ',INDEL_COMP,c2,c2_alt,dynprogindex);
1181 	debug(c--);
1182 	genomecoord += step;
1183       }
1184     }
1185 
1186   } else {
1187     debug(printf("Large gap %c%c..%c%c\n",left1,left2,right2,right1));
1188 #ifndef NOGAPHOLDER
1189     pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/0,/*genomejump*/dist,
1190 				    /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
1191 #endif
1192   }
1193 
1194   return pairs;
1195 }
1196 
1197 
1198 /* Used for compacting pairs from working pairpool to final pairpool
1199    (dest) at end of stage2 and stage3, so the working pairpool can be
1200    re-used for the next gregion */
1201 List_T
Pairpool_compact_copy(List_T list,T dest)1202 Pairpool_compact_copy (List_T list, T dest) {
1203   List_T newlist = NULL, listcell;
1204   Pair_T pair, orig;
1205   List_T p, q;
1206   int n;
1207 
1208   for (q = list; q != NULL; q = q->rest) {
1209     orig = (Pair_T) q->first;
1210 
1211     /* Following code taken from Pairpool_push_copy */
1212     if (dest->pairctr >= dest->npairs) {
1213       dest->pairptr = add_new_pairchunk(dest);
1214     } else if ((dest->pairctr % CHUNKSIZE) == 0) {
1215       for (n = dest->npairs - CHUNKSIZE, p = dest->pairchunks;
1216 	   n > dest->pairctr; p = p->rest, n -= CHUNKSIZE) ;
1217       dest->pairptr = (struct Pair_T *) p->first;
1218       debug1(printf("Located pair %d at %p\n",dest->pairctr,dest->pairptr));
1219     }
1220     pair = dest->pairptr++;
1221     dest->pairctr++;
1222 
1223     memcpy(pair,orig,sizeof(struct Pair_T));
1224 
1225     debug(
1226 	  printf("Copying %p: %d %d %c %c %c\n",
1227 		 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
1228 	  );
1229 
1230     if (dest->listcellctr >= dest->nlistcells) {
1231       dest->listcellptr = add_new_listcellchunk(dest);
1232     } else if ((dest->listcellctr % CHUNKSIZE) == 0) {
1233       for (n = dest->nlistcells - CHUNKSIZE, p = dest->listcellchunks;
1234 	   n > dest->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
1235       dest->listcellptr = (struct List_T *) p->first;
1236       debug1(printf("Located listcell %d at %p\n",dest->listcellctr,dest->listcellptr));
1237     }
1238     listcell = dest->listcellptr++;
1239     dest->listcellctr++;
1240 
1241     listcell->first = (void *) pair;
1242     listcell->rest = newlist;
1243     newlist = listcell;
1244   }
1245 
1246   return List_reverse(newlist);
1247 }
1248 
1249