1 static char rcsid[] = "$Id: pairpool.c 218147 2019-01-16 21:28:41Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "pairpool.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h> /* For memcpy */
10 #include "assert.h"
11 #include "mem.h"
12 #include "comp.h"
13 #include "pairdef.h"
14 #include "intron.h"
15 #include "complement.h"
16
17
18 #define CHUNKSIZE 20000
19 #define MICROINTRON_LENGTH 9 /* For Pairpool_add_genomeskip */
20
21 #ifdef DEBUG
22 #define debug(x) x
23 #else
24 #define debug(x)
25 #endif
26
27 /* For mechanics of memory allocation and deallocation */
28 #ifdef DEBUG1
29 #define debug1(x) x
30 #else
31 #define debug1(x)
32 #endif
33
34 /* For popping */
35 #ifdef DEBUG2
36 #define debug2(x) x
37 #else
38 #define debug2(x)
39 #endif
40
41 /* Getting genomic nt */
42 #ifdef DEBUG8
43 #define debug8(x) x
44 #else
45 #define debug8(x)
46 #endif
47
48 /* joining ends */
49 #ifdef DEBUG15
50 #define debug15(x) x
51 #else
52 #define debug15(x)
53 #endif
54
55 /* clean_join */
56 #ifdef DEBUG16
57 #define debug16(x) x
58 #else
59 #define debug16(x)
60 #endif
61
62
63 #define T Pairpool_T
64 struct T {
65 int npairs;
66 int pairctr;
67 struct Pair_T *pairptr;
68 List_T pairchunks;
69
70 int nlistcells;
71 int listcellctr;
72 struct List_T *listcellptr;
73 List_T listcellchunks;
74 };
75
76 void
Pairpool_free_memory(T this)77 Pairpool_free_memory (T this) {
78 List_T p;
79 struct Pair_T *pairptr;
80 struct List_T *listcellptr;
81
82 for (p = this->pairchunks; p != NULL; p = List_next(p)) {
83 pairptr = (struct Pair_T *) List_head(p);
84 FREE_KEEP(pairptr);
85 }
86 List_free_keep(&this->pairchunks);
87 for (p = this->listcellchunks; p != NULL; p = List_next(p)) {
88 listcellptr = (struct List_T *) List_head(p);
89 FREE_KEEP(listcellptr);
90 }
91 List_free_keep(&this->listcellchunks);
92
93 this->npairs = 0;
94 this->pairctr = 0;
95 this->pairchunks = NULL;
96 /* this->pairptr = add_new_pairchunk(this); */
97
98 this->nlistcells = 0;
99 this->listcellctr = 0;
100 this->listcellchunks = NULL;
101 /* this->listcellptr = add_new_listcellchunk(this); */
102
103 return;
104 }
105
106
107 void
Pairpool_free(T * old)108 Pairpool_free (T *old) {
109 Pairpool_free_memory(*old);
110 FREE_KEEP(*old);
111 return;
112 }
113
114
115 void
Pairpool_report_memory(T this)116 Pairpool_report_memory (T this) {
117 printf("Pairpool has %d pairchunks and %d listcellchunks\n",
118 List_length(this->pairchunks),List_length(this->listcellchunks));
119 return;
120 }
121
122
123 static struct Pair_T *
add_new_pairchunk(T this)124 add_new_pairchunk (T this) {
125 struct Pair_T *chunk;
126
127 chunk = (struct Pair_T *) MALLOC_KEEP(CHUNKSIZE*sizeof(struct Pair_T));
128 this->pairchunks = List_push_keep(this->pairchunks,(void *) chunk);
129 debug1(printf("Adding a new chunk of pairs. Ptr for pair %d is %p\n",
130 this->npairs,chunk));
131
132 this->npairs += CHUNKSIZE;
133
134 return chunk;
135 }
136
137 static struct List_T *
add_new_listcellchunk(T this)138 add_new_listcellchunk (T this) {
139 struct List_T *chunk;
140
141 chunk = (struct List_T *) MALLOC_KEEP(CHUNKSIZE*sizeof(struct List_T));
142 this->listcellchunks = List_push_keep(this->listcellchunks,(void *) chunk);
143 debug1(printf("Adding a new chunk of listcells. Ptr for listcell %d is %p\n",
144 this->nlistcells,chunk));
145
146 this->nlistcells += CHUNKSIZE;
147
148 return chunk;
149 }
150
151 T
Pairpool_new(void)152 Pairpool_new (void) {
153 T new = (T) MALLOC_KEEP(sizeof(*new));
154
155 new->npairs = 0;
156 new->pairctr = 0;
157 new->pairchunks = NULL;
158 /* new->pairptr = add_new_pairchunk(new); */
159
160 new->nlistcells = 0;
161 new->listcellctr = 0;
162 new->listcellchunks = NULL;
163 /* new->listcellptr = add_new_listcellchunk(new); */
164
165 return new;
166 }
167
168 void
Pairpool_reset(T this)169 Pairpool_reset (T this) {
170 this->pairctr = 0;
171 this->listcellctr = 0;
172 return;
173 }
174
175 /* gapp should be false for the following comps: MATCH_COMP,
176 DYNPROG_MATCH_COMP, AMBIGUOUS_COMP, MISMATCH_COMP, INDEL_COMP,
177 SHORTGAP_COMP */
178
179 List_T
Pairpool_push(List_T list,T this,int querypos,int genomepos,char cdna,char comp,char genome,char genomealt,int dynprogindex)180 Pairpool_push (List_T list, T this, int querypos, int genomepos, char cdna, char comp,
181 char genome, char genomealt, int dynprogindex) {
182 List_T listcell;
183 Pair_T pair;
184 List_T p;
185 int n;
186
187 /* assert(querypos >= 0); */
188 /* assert(genomepos >= 0); */
189
190 if (querypos < 0 || genomepos < 0) {
191 return list;
192 }
193
194 if (this->pairctr >= this->npairs) {
195 this->pairptr = add_new_pairchunk(this);
196 } else if ((this->pairctr % CHUNKSIZE) == 0) {
197 for (n = this->npairs - CHUNKSIZE, p = this->pairchunks;
198 n > this->pairctr; p = p->rest, n -= CHUNKSIZE) ;
199 this->pairptr = (struct Pair_T *) p->first;
200 debug1(printf("Located pair %d at %p\n",this->pairctr,this->pairptr));
201 }
202 pair = this->pairptr++;
203 this->pairctr++;
204
205 pair->querypos = querypos;
206 pair->genomepos = genomepos;
207 pair->aapos = 0;
208 pair->aaphase_g = -1;
209 pair->aaphase_e = -1;
210 pair->cdna = cdna;
211 pair->comp = comp;
212 pair->genome = genome;
213 pair->genomealt = genomealt;
214 pair->dynprogindex = dynprogindex;
215
216 pair->aa_g = ' ';
217 pair->aa_e = ' ';
218 pair->shortexonp = false;
219 pair->gapp = false;
220 pair->knowngapp = false;
221 pair->introntype = NONINTRON;
222 if (comp == EXTRAEXON_COMP) {
223 pair->extraexonp = true;
224 } else {
225 pair->extraexonp = false;
226 }
227
228 pair->queryjump = 0;
229 pair->genomejump = 0;
230
231 pair->state = GOOD;
232 pair->protectedp = false;
233 pair->disallowedp = false;
234 pair->donor_prob = 0.0;
235 pair->acceptor_prob = 0.0;
236 pair->end_intron_p = false;
237
238 debug(
239 printf("Creating %p: %d %d %c %c %c\n",
240 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
241 );
242
243 if (this->listcellctr >= this->nlistcells) {
244 this->listcellptr = add_new_listcellchunk(this);
245 } else if ((this->listcellctr % CHUNKSIZE) == 0) {
246 for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
247 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
248 this->listcellptr = (struct List_T *) p->first;
249 debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
250 }
251 listcell = this->listcellptr++;
252 this->listcellctr++;
253
254 listcell->first = (void *) pair;
255 listcell->rest = list;
256
257 return listcell;
258 }
259
260
261 List_T
Pairpool_push_copy(List_T list,T this,Pair_T orig)262 Pairpool_push_copy (List_T list, T this, Pair_T orig) {
263 List_T listcell;
264 Pair_T pair;
265 List_T p;
266 int n;
267
268 if (this->pairctr >= this->npairs) {
269 this->pairptr = add_new_pairchunk(this);
270 } else if ((this->pairctr % CHUNKSIZE) == 0) {
271 for (n = this->npairs - CHUNKSIZE, p = this->pairchunks;
272 n > this->pairctr; p = p->rest, n -= CHUNKSIZE) ;
273 this->pairptr = (struct Pair_T *) p->first;
274 debug1(printf("Located pair %d at %p\n",this->pairctr,this->pairptr));
275 }
276 pair = this->pairptr++;
277 this->pairctr++;
278
279 memcpy(pair,orig,sizeof(struct Pair_T));
280
281 debug(
282 printf("Copying %p <= %p: %d %d %c %c %c\n",
283 pair,orig,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
284 );
285
286 if (this->listcellctr >= this->nlistcells) {
287 this->listcellptr = add_new_listcellchunk(this);
288 } else if ((this->listcellctr % CHUNKSIZE) == 0) {
289 for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
290 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
291 this->listcellptr = (struct List_T *) p->first;
292 debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
293 }
294 listcell = this->listcellptr++;
295 this->listcellctr++;
296
297 listcell->first = (void *) pair;
298 listcell->rest = list;
299
300 return listcell;
301 }
302
303
304 List_T
Pairpool_push_gapalign(List_T list,T this,int querypos,int genomepos,char cdna,char comp,int introntype,char genome,char genomealt,bool extraexonp)305 Pairpool_push_gapalign (List_T list, T this, int querypos, int genomepos, char cdna, char comp,
306 int introntype, char genome, char genomealt, bool extraexonp) {
307 List_T listcell;
308 Pair_T pair;
309 List_T p;
310 int n;
311
312 if (this->pairctr >= this->npairs) {
313 this->pairptr = add_new_pairchunk(this);
314 } else if ((this->pairctr % CHUNKSIZE) == 0) {
315 for (n = this->npairs - CHUNKSIZE, p = this->pairchunks;
316 n > this->pairctr; p = p->rest, n -= CHUNKSIZE) ;
317 this->pairptr = (struct Pair_T *) p->first;
318 debug1(printf("Located pair %d at %p\n",this->pairctr,this->pairptr));
319 }
320 pair = this->pairptr++;
321 this->pairctr++;
322
323 pair->querypos = querypos;
324 pair->genomepos = genomepos;
325 pair->aapos = 0;
326 pair->aaphase_g = -1;
327 pair->aaphase_e = -1;
328 pair->cdna = cdna;
329 pair->comp = comp;
330 pair->genome = genome;
331 pair->genomealt = genomealt;
332 pair->dynprogindex = 0;
333
334 pair->aa_g = ' ';
335 pair->aa_e = ' ';
336 pair->shortexonp = false;
337 pair->gapp = true;
338 pair->knowngapp = false;
339 pair->introntype = introntype;
340 pair->extraexonp = extraexonp;
341
342 pair->queryjump = 0;
343 pair->genomejump = 0;
344
345 pair->state = GOOD;
346 pair->protectedp = false;
347 pair->disallowedp = false;
348 pair->donor_prob = 0.0;
349 pair->acceptor_prob = 0.0;
350 pair->end_intron_p = false;
351
352 debug(
353 printf("Creating %p: %d %d %c %c %c introntype %d\n",
354 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome,pair->introntype);
355 );
356
357 if (this->listcellctr >= this->nlistcells) {
358 this->listcellptr = add_new_listcellchunk(this);
359 } else if ((this->listcellctr % CHUNKSIZE) == 0) {
360 for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
361 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
362 this->listcellptr = (struct List_T *) p->first;
363 debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
364 }
365 listcell = this->listcellptr++;
366 this->listcellctr++;
367
368 listcell->first = (void *) pair;
369 listcell->rest = list;
370
371 return listcell;
372 }
373
374 List_T
Pairpool_push_gapholder(List_T list,T this,int queryjump,int genomejump,Pair_T leftpair,Pair_T rightpair,bool knownp)375 Pairpool_push_gapholder (List_T list, T this, int queryjump, int genomejump,
376 Pair_T leftpair, Pair_T rightpair, bool knownp) {
377 List_T listcell;
378 Pair_T pair;
379 List_T p;
380 int n;
381
382 if (this->pairctr >= this->npairs) {
383 this->pairptr = add_new_pairchunk(this);
384 } else if ((this->pairctr % CHUNKSIZE) == 0) {
385 for (n = this->npairs - CHUNKSIZE, p = this->pairchunks;
386 n > this->pairctr; p = p->rest, n -= CHUNKSIZE) ;
387 this->pairptr = (struct Pair_T *) p->first;
388 debug1(printf("Located pair %d at %p\n",this->pairctr,this->pairptr));
389 }
390 pair = this->pairptr++;
391 this->pairctr++;
392
393 pair->querypos = -1;
394 pair->genomepos = -1;
395
396 pair->aapos = 0;
397 pair->aaphase_g = -1;
398 pair->aaphase_e = -1;
399 pair->cdna = ' ';
400 pair->comp = ' ';
401 pair->genome = ' ';
402 pair->genomealt = ' ';
403 pair->dynprogindex = 0;
404
405 pair->aa_g = ' ';
406 pair->aa_e = ' ';
407 pair->shortexonp = false;
408 pair->gapp = true;
409 if (knownp == true) {
410 pair->knowngapp = true;
411 pair->donor_prob = 2.0;
412 pair->acceptor_prob = 2.0;
413 } else {
414 pair->knowngapp = false;
415 pair->donor_prob = 0.0;
416 pair->acceptor_prob = 0.0;
417 }
418 pair->introntype = NONINTRON;
419 pair->extraexonp = false;
420
421 if (leftpair && rightpair) {
422 queryjump = rightpair->querypos - leftpair->querypos - 1;
423 genomejump = rightpair->genomepos - leftpair->genomepos - 1;
424 if (leftpair->cdna == ' ') queryjump++;
425 if (leftpair->genome == ' ') genomejump++;
426 }
427
428 pair->queryjump = queryjump;
429 pair->genomejump = genomejump;
430
431 pair->state = GOOD;
432 pair->protectedp = false;
433 pair->disallowedp = false;
434 pair->end_intron_p = false;
435
436 debug(printf("Creating gap %p, queryjump=%d, genomejump=%d\n",pair,queryjump,genomejump));
437
438 if (this->listcellctr >= this->nlistcells) {
439 this->listcellptr = add_new_listcellchunk(this);
440 } else if ((this->listcellctr % CHUNKSIZE) == 0) {
441 for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
442 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
443 this->listcellptr = (struct List_T *) p->first;
444 debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
445 }
446 listcell = this->listcellptr++;
447 this->listcellctr++;
448
449 listcell->first = (void *) pair;
450 listcell->rest = list;
451
452 return listcell;
453 }
454
455 List_T
Pairpool_push_existing(List_T list,T this,Pair_T pair)456 Pairpool_push_existing (List_T list, T this, Pair_T pair) {
457 List_T listcell;
458 List_T p;
459 int n;
460
461 debug(
462 Pair_T head;
463 if (pair->gapp == true) {
464 printf("Pushing gap %p: queryjump=%d, genomejump=%d onto ",
465 pair,pair->queryjump,pair->genomejump);
466 } else {
467 printf("Pushing %p: %d %d %c %c %c onto ",
468 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
469 }
470 if (list == NULL) {
471 printf("NULL\n");
472 } else {
473 head = list->first;
474 if (head->gapp == true) {
475 printf("gap %p: queryjump=%d, genomejump=%d\n",
476 head,head->queryjump,head->genomejump);
477 } else {
478 printf("%p: %d %d %c %c %c\n",
479 head,head->querypos,head->genomepos,head->cdna,head->comp,head->genome);
480 }
481 }
482 );
483
484 if (this->listcellctr >= this->nlistcells) {
485 this->listcellptr = add_new_listcellchunk(this);
486 } else if ((this->listcellctr % CHUNKSIZE) == 0) {
487 for (n = this->nlistcells - CHUNKSIZE, p = this->listcellchunks;
488 n > this->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
489 this->listcellptr = (struct List_T *) p->first;
490 debug1(printf("Located listcell %d at %p\n",this->listcellctr,this->listcellptr));
491 }
492 listcell = this->listcellptr++;
493 this->listcellctr++;
494
495 listcell->first = (void *) pair;
496 listcell->rest = list;
497
498 return listcell;
499 }
500
501
502 /* Note: this does not free the list cell */
503 List_T
Pairpool_pop(List_T list,Pair_T * x)504 Pairpool_pop (List_T list, Pair_T *x) {
505 List_T head;
506
507 if (list != NULL) {
508 head = list->rest;
509 *x = (Pair_T) list->first;
510 debug2(
511 if ((*x)->gapp == true) {
512 printf("Popping gap: queryjump=%d, genomejump=%d\n",
513 (*x)->queryjump,(*x)->genomejump);
514 } else {
515 printf("Popping: %d %d %c %c %c\n",
516 (*x)->querypos,(*x)->genomepos,(*x)->cdna,(*x)->comp,(*x)->genome);
517 }
518 );
519 return head;
520 } else {
521 return list;
522 }
523 }
524
525
526 List_T
Pairpool_transfer(List_T dest,List_T source)527 Pairpool_transfer (List_T dest, List_T source) {
528 List_T p, next;
529 #ifdef DEBUG
530 Pair_T pair;
531 #endif
532
533 for (p = source; p != NULL; p = next) {
534 debug(
535 pair = List_head(p);
536 if (pair->cdna == '\0' || pair->genome == '\0') {
537 abort();
538 }
539 if (pair->gapp) {
540 printf("Transferring gap %p: queryjump=%d, genomejump=%d\n",
541 pair,pair->queryjump,pair->genomejump);
542 } else {
543 printf("Transferring %p: %d %d %c %c %c\n",
544 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
545 }
546 );
547 next = p->rest;
548 p->rest = dest;
549 dest = p;
550 }
551 return dest;
552 }
553
554 List_T
Pairpool_transfer_n(List_T dest,List_T source,int n)555 Pairpool_transfer_n (List_T dest, List_T source, int n) {
556 List_T p, next;
557 #ifdef DEBUG
558 Pair_T pair;
559 #endif
560
561 for (p = source; p != NULL && --n >= 0; p = next) {
562 debug(
563 pair = List_head(p);
564 if (pair->cdna == '\0' || pair->genome == '\0') {
565 abort();
566 }
567 if (pair->gapp) {
568 printf("Transferring gap %p: queryjump=%d, genomejump=%d\n",
569 pair,pair->queryjump,pair->genomejump);
570 } else {
571 printf("Transferring %p: %d %d %c %c %c\n",
572 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
573 }
574 );
575 next = p->rest;
576 p->rest = dest;
577 dest = p;
578 }
579 return dest;
580 }
581
582 int
Pairpool_count_bounded(int * nstart,List_T source,int minpos,int maxpos)583 Pairpool_count_bounded (int *nstart, List_T source, int minpos, int maxpos) {
584 int npairs = 0;
585 List_T p, next;
586 Pair_T pair;
587
588 *nstart = 0;
589 for (p = source; p != NULL; p = next) {
590 pair = List_head(p);
591 next = p->rest;
592 if (pair->querypos < minpos) {
593 *nstart += 1;
594 } else if (pair->querypos < maxpos) {
595 npairs++;
596 } else {
597 p = NULL; /* Terminate transfer */
598 }
599 }
600 return npairs;
601 }
602
603
604 #if 0
605 List_T
606 Pairpool_transfer_bounded (List_T dest, List_T source, int minpos, int maxpos) {
607 List_T p, next;
608 Pair_T pair;
609
610 for (p = source; p != NULL; p = next) {
611 debug(
612 pair = (Pair_T) List_head(p);
613 if (pair->cdna == '\0' || pair->genome == '\0') {
614 abort();
615 }
616 printf("Transferring %p: %d %d %c %c %c\n",
617 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
618 );
619 pair = (Pair_T) List_head(p);
620 next = p->rest;
621 if (pair->querypos == minpos) {
622 if (dest != NULL) {
623 /* Pop last querypos off the stack, because we want only one of them */
624 dest = dest->rest;
625 }
626 p->rest = dest;
627 dest = p;
628 } else if (pair->querypos == maxpos) {
629 p->rest = dest;
630 dest = p;
631 p = NULL; /* Terminate transfer */
632 } else if (pair->querypos > minpos && pair->querypos < maxpos) {
633 p->rest = dest;
634 dest = p;
635 }
636 }
637
638 return dest;
639 }
640 #endif
641
642
643 /* Originally prohibited copying of gaps */
644 List_T
Pairpool_copy(List_T source,T this)645 Pairpool_copy (List_T source, T this) {
646 List_T dest = NULL;
647
648 while (source != NULL) {
649 dest = Pairpool_push_copy(dest,this,/*orig*/source->first);
650 source = source->rest;
651 }
652 return List_reverse(dest);
653 }
654
655
656 #if 0
657 /* Problem: Removes gaps. Better to use clean_path_end3_chromosomal_bounds instead */
658 /* Note: Do not provide actual chrlength, but rather chrhigh - chroffset, so circular chromosomes work */
659 List_T
660 Pairpool_copy_bounded (List_T source, T this, Chrpos_T chrlength) {
661 List_T dest = NULL;
662 Pair_T pair;
663
664 while (source != NULL) {
665 pair = (Pair_T) source->first;
666 if (pair->genomepos >= chrlength) {
667 /* Skip */
668 } else {
669 dest = Pairpool_push_copy(dest,this,pair);
670 }
671 source = source->rest;
672 }
673 return List_reverse(dest);
674 }
675 #endif
676
677
678 struct Pair_T *
Pairpool_copy_array(struct Pair_T * source,int npairs)679 Pairpool_copy_array (struct Pair_T *source, int npairs) {
680 struct Pair_T *dest;
681
682 dest = (struct Pair_T *) MALLOC_OUT(npairs * sizeof(struct Pair_T));
683 memcpy(dest,source,npairs*sizeof(struct Pair_T));
684 return dest;
685 }
686
687
688 void
Pairpool_clean_join(List_T * left_path,List_T * right_pairs)689 Pairpool_clean_join (List_T *left_path, List_T *right_pairs) {
690 Pair_T leftpair, rightpair;
691 int queryjump, genomejump;
692
693
694 debug16(printf("Entered clean_join\n"));
695 debug16(printf("left path:\n"));
696 debug16(Pair_dump_list(*left_path,true));
697 debug16(printf("right pairs:\n"));
698 debug16(Pair_dump_list(*right_pairs,true));
699
700
701 while (*left_path != NULL && ((Pair_T) (*left_path)->first)->gapp == true) {
702 debug16(printf("Clearing gap on left\n"));
703 *left_path = Pairpool_pop(*left_path,&leftpair);
704 }
705 while (*right_pairs != NULL && ((Pair_T) (*right_pairs)->first)->gapp == true) {
706 debug16(printf("Clearing gap on right\n"));
707 *right_pairs = Pairpool_pop(*right_pairs,&rightpair);
708 }
709
710 if (*left_path != NULL && *right_pairs != NULL) {
711 leftpair = (Pair_T) (*left_path)->first;
712 rightpair = (Pair_T) (*right_pairs)->first;
713 queryjump = rightpair->querypos - leftpair->querypos - 1;
714 genomejump = rightpair->genomepos - leftpair->genomepos - 1;
715 debug16(printf("queryjump %d, genomejump %d\n",queryjump,genomejump));
716
717 /* Fix overlap */
718 while (*left_path != NULL && *right_pairs != NULL && (queryjump < 0 || genomejump < 0)) {
719 while (*left_path != NULL && ((Pair_T) (*left_path)->first)->gapp == true) {
720 debug16(printf("Clearing gap on left\n"));
721 *left_path = Pairpool_pop(*left_path,&leftpair);
722 }
723 while (*right_pairs != NULL && ((Pair_T) (*right_pairs)->first)->gapp == true) {
724 debug16(printf("Clearing gap on right\n"));
725 *right_pairs = Pairpool_pop(*right_pairs,&rightpair);
726 }
727 *left_path = Pairpool_pop(*left_path,&leftpair);
728 *right_pairs = Pairpool_pop(*right_pairs,&rightpair);
729 queryjump = rightpair->querypos - leftpair->querypos - 1;
730 genomejump = rightpair->genomepos - leftpair->genomepos - 1;
731 debug16(printf("Revising queryjump to be %d = %d - %d - 1\n",queryjump,rightpair->querypos,leftpair->querypos));
732 }
733
734 while (*left_path != NULL && ((Pair_T) (*left_path)->first)->gapp == true) {
735 debug16(printf("Clearing gap on left\n"));
736 *left_path = Pairpool_pop(*left_path,&leftpair);
737 }
738 while (*right_pairs != NULL && ((Pair_T) (*right_pairs)->first)->gapp == true) {
739 debug16(printf("Clearing gap on right\n"));
740 *right_pairs = Pairpool_pop(*right_pairs,&rightpair);
741 }
742 }
743
744 return;
745 }
746
747
748 List_T
Pairpool_remove_gapholders(List_T pairs)749 Pairpool_remove_gapholders (List_T pairs) {
750 List_T path = NULL;
751 Pair_T pair;
752
753 while (pairs != NULL) {
754 /* pairptr = pairs; */
755 /* pairs = Pairpool_pop(pairs,&pair); */
756 pair = (Pair_T) pairs->first;
757 if (pair->gapp == true) {
758 debug(printf("Removing a gap with queryjump = %d, genomejump = %d\n",
759 pair->queryjump,pair->genomejump));
760 pairs = Pairpool_pop(pairs,&pair);
761 } else {
762 #ifdef WASTE
763 path = Pairpool_push_existing(path,pairpool,pair);
764 #else
765 path = List_transfer_one(path,&pairs);
766 #endif
767 }
768 }
769
770 return List_reverse(path);
771 }
772
773
774 #ifdef CHECK_ASSERTIONS
775 static List_T
check_for_gapholders(List_T pairs)776 check_for_gapholders (List_T pairs) {
777 List_T path = NULL;
778 Pair_T pair;
779
780 while (pairs != NULL) {
781 /* pairptr = pairs; */
782 /* pairs = Pairpool_pop(pairs,&pair); */
783 pair = (Pair_T) pairs->first;
784 if (pair->gapp == true) {
785 abort();
786 } else {
787 #ifdef WASTE
788 path = Pairpool_push_existing(path,pairpool,pair);
789 #else
790 path = List_transfer_one(path,&pairs);
791 #endif
792 }
793 }
794
795 return List_reverse(path);
796 }
797 #endif
798
799
800 List_T
Pairpool_join_end3(List_T path_orig,List_T end3_pairs_orig,Pairpool_T pairpool,bool copy_end_p)801 Pairpool_join_end3 (List_T path_orig, List_T end3_pairs_orig, Pairpool_T pairpool,
802 bool copy_end_p) {
803 List_T path, end3_pairs;
804 Pair_T pair, leftpair;
805 int queryjump = -1, genomejump = -1;
806
807 #ifdef CHECK_ASSERTIONS
808 path_orig = check_for_gapholders(path_orig);
809 end3_pairs_orig = check_for_gapholders(end3_pairs_orig);
810 #endif
811
812 path = Pairpool_copy(path_orig,pairpool);
813 if (copy_end_p == true) {
814 end3_pairs = Pairpool_copy(end3_pairs_orig,pairpool);
815 } else {
816 end3_pairs = end3_pairs_orig;
817 }
818
819 if (path == NULL && end3_pairs == NULL) {
820 return (List_T) NULL;
821 } else if (path == NULL) {
822 return List_reverse(end3_pairs);
823 } else if (end3_pairs == NULL) {
824 return path;
825 }
826
827 debug15(printf("Entered join_end3\n"));
828 debug15(printf("path:\n"));
829 debug15(Pair_dump_list(path,true));
830 debug15(printf("end3_pairs:\n"));
831 debug15(Pair_dump_list(end3_pairs,true));
832
833
834 leftpair = (Pair_T) path->first;
835 pair = (Pair_T) end3_pairs->first;
836 queryjump = pair->querypos - leftpair->querypos - 1;
837 genomejump = pair->genomepos - leftpair->genomepos - 1;
838 debug15(printf("queryjump %d, genomejump %d\n",queryjump,genomejump));
839
840 if (queryjump == 0 && genomejump == 0) {
841 /* Do nothing, although this is unexpected */
842 } else if (queryjump >= 0 && genomejump >= 0) {
843 /* Insert a gapholder */
844 path = Pairpool_push_gapholder(path,pairpool,queryjump,genomejump,
845 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
846 } else {
847 /* Fix overlap */
848 while (path != NULL && end3_pairs != NULL && (queryjump < 0 || genomejump < 0)) {
849 pair = (Pair_T) end3_pairs->first;
850
851 if (path != NULL) {
852 path = Pairpool_pop(path,&leftpair);
853 }
854 if (end3_pairs != NULL) {
855 end3_pairs = Pairpool_pop(end3_pairs,&pair);
856 }
857 queryjump = pair->querypos - leftpair->querypos - 1;
858 genomejump = pair->genomepos - leftpair->genomepos - 1;
859 debug15(printf("Revising queryjump to be %d = %d - %d - 1\n",queryjump,pair->querypos,leftpair->querypos));
860 }
861
862 path = Pairpool_push_existing(path,pairpool,leftpair);
863 if (queryjump < 0 || genomejump < 0) {
864 /* Discard pair and all remaining pairs from end */
865 debug15(printf("Not possible to make end work, since queryjump = %d and genomejump = %d\n",
866 queryjump,genomejump));
867 end3_pairs = (List_T) NULL;
868 } else if (queryjump == 0 && genomejump == 0) {
869 /* No gapholder needed */
870 path = Pairpool_push_existing(path,pairpool,pair);
871 } else {
872 path = Pairpool_push_gapholder(path,pairpool,queryjump,genomejump,
873 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
874 path = Pairpool_push_existing(path,pairpool,pair);
875 }
876 }
877
878 while (end3_pairs != NULL) {
879 path = List_transfer_one(path,&end3_pairs);
880 }
881
882 debug15(printf("joined path:\n"));
883 debug15(Pair_dump_list(path,true));
884 debug15(printf("\n"));
885
886 return path;
887 }
888
889
890 List_T
Pairpool_join_end5(List_T pairs_orig,List_T end5_path_orig,Pairpool_T pairpool,bool copy_end_p)891 Pairpool_join_end5 (List_T pairs_orig, List_T end5_path_orig, Pairpool_T pairpool,
892 bool copy_end_p) {
893 List_T pairs, end5_path;
894 Pair_T pair, rightpair;
895 int queryjump = -1, genomejump = -1;
896
897 #ifdef CHECK_ASSERTIONS
898 pairs_orig = check_for_gapholders(pairs_orig);
899 end5_path_orig = check_for_gapholders(end5_path_orig);
900 #endif
901
902 pairs = Pairpool_copy(pairs_orig,pairpool);
903 if (copy_end_p == true) {
904 end5_path = Pairpool_copy(end5_path_orig,pairpool);
905 } else {
906 end5_path = end5_path_orig;
907 }
908
909 if (pairs == NULL && end5_path == NULL) {
910 return (List_T) NULL;
911 } else if (pairs == NULL) {
912 return List_reverse(end5_path);
913 } else if (end5_path == NULL) {
914 return pairs;
915 }
916
917 debug15(printf("Entered join_end5\n"));
918 debug15(printf("pairs:\n"));
919 debug15(Pair_dump_list(pairs,true));
920 debug15(printf("end5_path:\n"));
921 debug15(Pair_dump_list(end5_path,true));
922
923
924 rightpair = (Pair_T) pairs->first;
925 pair = (Pair_T) end5_path->first;
926 queryjump = rightpair->querypos - pair->querypos - 1;
927 genomejump = rightpair->genomepos - pair->genomepos - 1;
928 debug15(printf("queryjump %d, genomejump %d\n",queryjump,genomejump));
929
930 if (queryjump == 0 && genomejump == 0) {
931 /* Do nothing, although this is unexpected */
932 } else if (queryjump >= 0 && genomejump >= 0) {
933 /* Insert a gapholder */
934 pairs = Pairpool_push_gapholder(pairs,pairpool,queryjump,genomejump,
935 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
936 } else {
937 /* Fix overlap */
938 while (pairs != NULL && end5_path != NULL && (queryjump < 0 || genomejump < 0)) {
939 pair = (Pair_T) end5_path->first;
940
941 if (pairs != NULL) {
942 pairs = Pairpool_pop(pairs,&rightpair);
943 }
944 if (end5_path != NULL) {
945 end5_path = Pairpool_pop(end5_path,&pair);
946 }
947 queryjump = rightpair->querypos - pair->querypos - 1;
948 genomejump = rightpair->genomepos - pair->genomepos - 1;
949 debug15(printf("Revising queryjump to be %d = %d - %d - 1\n",queryjump,pair->querypos,rightpair->querypos));
950 }
951
952 pairs = Pairpool_push_existing(pairs,pairpool,rightpair);
953 if (queryjump < 0 || genomejump < 0) {
954 /* Discard pair and all remaining pairs from end */
955 debug15(printf("Not possible to make end work, since queryjump = %d and genomejump = %d\n",
956 queryjump,genomejump));
957 end5_path = (List_T) NULL;
958 } else if (queryjump == 0 && genomejump == 0) {
959 /* No gapholder needed */
960 pairs = Pairpool_push_existing(pairs,pairpool,pair);
961 } else {
962 pairs = Pairpool_push_gapholder(pairs,pairpool,queryjump,genomejump,
963 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
964 pairs = Pairpool_push_existing(pairs,pairpool,pair);
965 }
966 }
967
968 while (end5_path != NULL) {
969 pairs = List_transfer_one(pairs,&end5_path);
970 }
971
972 debug15(printf("joined pairs:\n"));
973 debug15(Pair_dump_list(pairs,true));
974 debug15(printf("\n"));
975
976 return pairs;
977 }
978
979
980 List_T
Pairpool_add_queryskip(List_T pairs,int r,int c,int dist,char * querysequence,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,int dynprogindex)981 Pairpool_add_queryskip (List_T pairs, int r, int c, int dist, char *querysequence,
982 int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp, int dynprogindex) {
983 int j;
984 char c1;
985 int querycoord, genomecoord, step;
986
987 querycoord = r-1;
988 genomecoord = c-1;
989
990 if (revp == true) {
991 querycoord = -querycoord;
992 genomecoord = -genomecoord;
993 step = +1;
994 } else {
995 /* Advance to next genomepos */
996 /* genomecoord++; -- No, this leads to bugs */
997 step = -1;
998 }
999 debug(printf("Entered Pairpool_add_genomeskip with r %d, c %d, revp %d => querycoord %d\n",r,c,revp,querycoord));
1000
1001 for (j = 0; j < dist; j++) {
1002 c1 = querysequence[querycoord];
1003 debug(printf("Pushing %d,%d [%d,%d] (%c,-), ",r,c,queryoffset+querycoord,genomeoffset+genomecoord,c1));
1004 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
1005 c1,INDEL_COMP,/*genome*/' ',/*genomealt*/' ',dynprogindex);
1006 debug(r--);
1007 querycoord += step;
1008 }
1009
1010 return pairs;
1011 }
1012
1013
1014 static char complCode[128] = COMPLEMENT_LC;
1015
1016 static char
get_genomic_nt(char * g_alt,int genomicpos,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp)1017 get_genomic_nt (char *g_alt, int genomicpos, Univcoord_T chroffset, Univcoord_T chrhigh,
1018 bool watsonp) {
1019 char c2, c2_alt;
1020 Univcoord_T pos;
1021
1022 #if 0
1023 /* If the read has a deletion, then we will extend beyond 0 or genomiclength, so do not restrict. */
1024 if (genomicpos < 0) {
1025 return '*';
1026
1027 } else if (genomicpos >= genomiclength) {
1028 return '*';
1029
1030 }
1031 #endif
1032
1033 if (watsonp) {
1034 if ((pos = chroffset + genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
1035 *g_alt = '*';
1036 return '*';
1037
1038 } else if (pos >= chrhigh) {
1039 *g_alt = '*';
1040 return '*';
1041
1042 #if 0
1043 } else if (genome) {
1044 /* Not necessary, because Genome_get_char_blocks should work */
1045 debug8(printf("At %u, genomicnt is %c\n",
1046 genomicpos,Genome_get_char(genome,pos)));
1047 return Genome_get_char(genome,pos);
1048 #endif
1049
1050 } else {
1051 /* GMAP with user-supplied genomic segment */
1052 debug8(printf("At %u, genomicnt is %c\n",
1053 genomicpos,Genome_get_char_blocks(&(*g_alt),pos)));
1054 return Genome_get_char_blocks(&(*g_alt),pos);
1055 }
1056
1057 } else {
1058 if ((pos = chrhigh - genomicpos) < chroffset) { /* Must be <, and not <=, or dynamic programming will fail */
1059 *g_alt = '*';
1060 return '*';
1061
1062 } else if (pos >= chrhigh) {
1063 *g_alt = '*';
1064 return '*';
1065
1066 #if 0
1067 } else if (genome) {
1068 /* Not necessary, because Genome_get_char_blocks should work */
1069 c2 = Genome_get_char(genome,pos);
1070 #endif
1071
1072 } else {
1073 /* GMAP with user-supplied genomic segment */
1074 c2 = Genome_get_char_blocks(&c2_alt,pos);
1075 }
1076 debug8(printf("At %u, genomicnt is %c\n",genomicpos,complCode[(int) c2]));
1077 *g_alt = complCode[(int) c2_alt];
1078 return complCode[(int) c2];
1079 }
1080 }
1081
1082
1083 List_T
Pairpool_add_genomeskip(bool * add_dashes_p,List_T pairs,int r,int c,int dist,char * genomesequence,int queryoffset,int genomeoffset,Pairpool_T pairpool,bool revp,Univcoord_T chroffset,Univcoord_T chrhigh,bool watsonp,int dynprogindex)1084 Pairpool_add_genomeskip (bool *add_dashes_p, List_T pairs, int r, int c, int dist,
1085 char *genomesequence, int queryoffset, int genomeoffset, Pairpool_T pairpool, bool revp,
1086 Univcoord_T chroffset, Univcoord_T chrhigh, bool watsonp, int dynprogindex) {
1087 int j;
1088 #ifdef DEBUG
1089 char left1, left2, right2, right1;
1090 char left1_alt, left2_alt, right2_alt, right1_alt;
1091 /* int introntype; */
1092 #endif
1093 char c2, c2_alt;
1094 int querycoord, leftgenomecoord, rightgenomecoord, genomecoord, temp, step;
1095
1096 querycoord = r-1;
1097 leftgenomecoord = c-dist;
1098 rightgenomecoord = c-1;
1099
1100 if (revp == true) {
1101 querycoord = -querycoord;
1102 temp = leftgenomecoord;
1103 leftgenomecoord = -rightgenomecoord;
1104 rightgenomecoord = -temp;
1105 step = +1;
1106 } else {
1107 /* Advance to next querypos */
1108 /* querycoord++; -- No, this leads to bugs */
1109 step = -1;
1110 }
1111 debug(printf("Entered Pairpool_add_genomeskip with r %d, c %d, revp %d => querycoord %d\n",r,c,revp,querycoord));
1112
1113 if (dist < MICROINTRON_LENGTH) {
1114 *add_dashes_p = true;
1115 } else {
1116 #ifdef DEBUG
1117 /* Check for intron */
1118 if (genomesequence != NULL) {
1119 left1 = left1_alt = genomesequence[leftgenomecoord];
1120 left2 = left2_alt = genomesequence[leftgenomecoord+1];
1121 right2 = right2_alt = genomesequence[rightgenomecoord-1];
1122 right1 = right1_alt = genomesequence[rightgenomecoord];
1123 } else {
1124 left1 = get_genomic_nt(&left1_alt,genomeoffset+leftgenomecoord,chroffset,chrhigh,watsonp);
1125 left2 = get_genomic_nt(&left2_alt,genomeoffset+leftgenomecoord+1,chroffset,chrhigh,watsonp);
1126 right2 = get_genomic_nt(&right2_alt,genomeoffset+rightgenomecoord-1,chroffset,chrhigh,watsonp);
1127 right1 = get_genomic_nt(&right1_alt,genomeoffset+rightgenomecoord,chroffset,chrhigh,watsonp);
1128 }
1129 #endif
1130
1131 #if 0
1132 #ifdef PMAP
1133 introntype = Intron_type(left1,left2,right2,right1,
1134 left1_alt,left2_alt,right2_alt,right1_alt,
1135 /*cdna_direction*/+1);
1136 #else
1137 introntype = Intron_type(left1,left2,right2,right1,
1138 left1_alt,left2_alt,right2_alt,right1_alt,
1139 cdna_direction);
1140 #endif
1141 #endif
1142
1143 #if 0
1144 if (introntype == NONINTRON) {
1145 *add_dashes_p = true;
1146 } else {
1147 *add_dashes_p = false;
1148 }
1149 #endif
1150 *add_dashes_p = false;
1151 }
1152
1153 if (*add_dashes_p == true) {
1154 if (revp == true) {
1155 genomecoord = leftgenomecoord;
1156 } else {
1157 genomecoord = rightgenomecoord;
1158 }
1159
1160 if (genomesequence != NULL) {
1161 for (j = 0; j < dist; j++) {
1162 c2 = c2_alt = genomesequence[genomecoord];
1163
1164 debug(printf("Pushing %d,%d [%d,%d] (-,%c),",r,c,queryoffset+querycoord,genomeoffset+genomecoord,c2));
1165 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
1166 ' ',INDEL_COMP,c2,c2_alt,dynprogindex);
1167 debug(c--);
1168 genomecoord += step;
1169 }
1170
1171 } else {
1172 for (j = 0; j < dist; j++) {
1173 c2 = get_genomic_nt(&c2_alt,genomeoffset+genomecoord,chroffset,chrhigh,watsonp);
1174 #ifdef EXTRACT_GENOMICSEG
1175 assert(c2 == genomesequence[genomecoord]);
1176 #endif
1177
1178 debug(printf("Pushing %d,%d [%d,%d] (-,%c),",r,c,queryoffset+querycoord,genomeoffset+genomecoord,c2));
1179 pairs = Pairpool_push(pairs,pairpool,queryoffset+querycoord,genomeoffset+genomecoord,
1180 ' ',INDEL_COMP,c2,c2_alt,dynprogindex);
1181 debug(c--);
1182 genomecoord += step;
1183 }
1184 }
1185
1186 } else {
1187 debug(printf("Large gap %c%c..%c%c\n",left1,left2,right2,right1));
1188 #ifndef NOGAPHOLDER
1189 pairs = Pairpool_push_gapholder(pairs,pairpool,/*queryjump*/0,/*genomejump*/dist,
1190 /*leftpair*/NULL,/*rightpair*/NULL,/*knownp*/false);
1191 #endif
1192 }
1193
1194 return pairs;
1195 }
1196
1197
1198 /* Used for compacting pairs from working pairpool to final pairpool
1199 (dest) at end of stage2 and stage3, so the working pairpool can be
1200 re-used for the next gregion */
1201 List_T
Pairpool_compact_copy(List_T list,T dest)1202 Pairpool_compact_copy (List_T list, T dest) {
1203 List_T newlist = NULL, listcell;
1204 Pair_T pair, orig;
1205 List_T p, q;
1206 int n;
1207
1208 for (q = list; q != NULL; q = q->rest) {
1209 orig = (Pair_T) q->first;
1210
1211 /* Following code taken from Pairpool_push_copy */
1212 if (dest->pairctr >= dest->npairs) {
1213 dest->pairptr = add_new_pairchunk(dest);
1214 } else if ((dest->pairctr % CHUNKSIZE) == 0) {
1215 for (n = dest->npairs - CHUNKSIZE, p = dest->pairchunks;
1216 n > dest->pairctr; p = p->rest, n -= CHUNKSIZE) ;
1217 dest->pairptr = (struct Pair_T *) p->first;
1218 debug1(printf("Located pair %d at %p\n",dest->pairctr,dest->pairptr));
1219 }
1220 pair = dest->pairptr++;
1221 dest->pairctr++;
1222
1223 memcpy(pair,orig,sizeof(struct Pair_T));
1224
1225 debug(
1226 printf("Copying %p: %d %d %c %c %c\n",
1227 pair,pair->querypos,pair->genomepos,pair->cdna,pair->comp,pair->genome);
1228 );
1229
1230 if (dest->listcellctr >= dest->nlistcells) {
1231 dest->listcellptr = add_new_listcellchunk(dest);
1232 } else if ((dest->listcellctr % CHUNKSIZE) == 0) {
1233 for (n = dest->nlistcells - CHUNKSIZE, p = dest->listcellchunks;
1234 n > dest->listcellctr; p = p->rest, n -= CHUNKSIZE) ;
1235 dest->listcellptr = (struct List_T *) p->first;
1236 debug1(printf("Located listcell %d at %p\n",dest->listcellctr,dest->listcellptr));
1237 }
1238 listcell = dest->listcellptr++;
1239 dest->listcellctr++;
1240
1241 listcell->first = (void *) pair;
1242 listcell->rest = newlist;
1243 newlist = listcell;
1244 }
1245
1246 return List_reverse(newlist);
1247 }
1248
1249