1 static char rcsid[] = "$Id: splicetrie_build.c 184476 2016-02-18 00:13:26Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "splicetrie_build.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <math.h> /* For qsort */
11
12 #include "assert.h"
13 #include "mem.h"
14 #include "iitdef.h"
15 #include "interval.h"
16
17
18 #define MININTRONLEN 9
19 #define SPLICEDIST_EXTRA 10 /* Reported intron length does not include dinucleotides */
20
21 #define EMPTY_POINTER 0U
22 #define INTRON_HIGH_TO_LOW 1U /* Because splicesites based on Interval_low (x..x+1 and y..y+1), but introns go from low to high (x..y+1) */
23
24
25 /* Generating trie */
26 #ifdef DEBUG
27 #define debug(x) x
28 #else
29 #define debug(x)
30 #endif
31
32 /* Generating trie, details */
33 #ifdef DEBUG0
34 #define debug0(x) x
35 #else
36 #define debug0(x)
37 #endif
38
39 /* Full dump */
40 #ifdef DEBUG2
41 #define debug2(x) x
42 #else
43 #define debug2(x)
44 #endif
45
46
47 char *
Splicetype_string(Splicetype_T splicetype)48 Splicetype_string (Splicetype_T splicetype) {
49 switch (splicetype) {
50 case DONOR: return "donor";
51 case ACCEPTOR: return "acceptor";
52 case ANTIDONOR: return "antidonor";
53 case ANTIACCEPTOR: return "antiacceptor";
54 default: abort();
55 }
56 }
57
58
59 /* 87654321 */
60 #define LOW_TWO_BITS 0x00000003
61
62 static void
splicefrag_nt_leftward(char * nt,Genomecomp_T splicefrag)63 splicefrag_nt_leftward (char *nt, Genomecomp_T splicefrag) {
64 int i, j;
65 Genomecomp_T highbits, lowbits;
66
67 highbits = splicefrag >> 16;
68 lowbits = splicefrag /* & 0x0000FFFF */;
69
70 j = 15;
71 for (i = 0; i < 16; i++) {
72 switch (((highbits & 0x01) << 1) | (lowbits & 0x01)) {
73 case RIGHT_A: nt[j] = 'A'; break;
74 case RIGHT_C: nt[j] = 'C'; break;
75 case RIGHT_G: nt[j] = 'G'; break;
76 case RIGHT_T: nt[j] = 'T'; break;
77 }
78 highbits >>= 1;
79 lowbits >>= 1;
80 j--;
81 }
82
83 return;
84 }
85
86 static void
splicefrag_nt_rightward(char * nt,Genomecomp_T splicefrag)87 splicefrag_nt_rightward (char *nt, Genomecomp_T splicefrag) {
88 int i, j;
89 Genomecomp_T highbits, lowbits;
90
91 highbits = splicefrag >> 16;
92 lowbits = splicefrag /* & 0x0000FFFF */;
93
94 j = 0;
95 for (i = 0; i < 16; i++) {
96 switch (((highbits & 0x01) << 1) | (lowbits & 0x01)) {
97 case RIGHT_A: nt[j] = 'A'; break;
98 case RIGHT_C: nt[j] = 'C'; break;
99 case RIGHT_G: nt[j] = 'G'; break;
100 case RIGHT_T: nt[j] = 'T'; break;
101 }
102 highbits >>= 1;
103 lowbits >>= 1;
104 j++;
105 }
106
107 return;
108 }
109
110
111
112 /* Splicetype is for the anchor splice */
113 void
Splicetrie_dump(Trieoffset_T * triestart,Univcoord_T * splicesites,Splicetype_T splicetype,Genomecomp_T * splicefrags_ref)114 Splicetrie_dump (Trieoffset_T *triestart, Univcoord_T *splicesites,
115 Splicetype_T splicetype, Genomecomp_T *splicefrags_ref) {
116 Triecontent_T leaf;
117 int nleaves, i;
118 Univcoord_T position;
119 Triecontent_T offseta, offsetc, offsetg, offsett;
120 char gbuffer[17];
121
122 gbuffer[16] = '\0';
123
124 if (single_leaf_p(leaf = triestart[0])) {
125 position = splicesites[leaf];
126 printf("%d %llu",(int) leaf,(unsigned long long) position);
127 if (splicetype == DONOR || splicetype == ANTIACCEPTOR) {
128 splicefrag_nt_rightward(gbuffer,splicefrags_ref[leaf]);
129 printf(" %s (rightward)\n",gbuffer);
130 } else {
131 splicefrag_nt_leftward(gbuffer,splicefrags_ref[leaf]);
132 printf(" %s (leftward)\n",gbuffer);
133 }
134
135 } else if (multiple_leaf_p(leaf)) {
136 nleaves = (int) (-leaf);
137 for (i = 1; i <= nleaves; i++) {
138 leaf = triestart[i];
139 position = splicesites[leaf];
140 printf("%d %llu",(int) leaf,(unsigned long long) position);
141 if (splicetype == DONOR || splicetype == ANTIACCEPTOR) {
142 splicefrag_nt_rightward(gbuffer,splicefrags_ref[leaf]);
143 printf(" %s (rightward)\n",gbuffer);
144 } else {
145 splicefrag_nt_leftward(gbuffer,splicefrags_ref[leaf]);
146 printf(" %s (leftward)\n",gbuffer);
147 }
148 }
149
150 } else {
151 #ifdef USE_2BYTE_RELOFFSETS
152 get_offsets(&offseta,&offsetc,&offsetg,&offsett,triestart[1],triestart[2]);
153 #else
154 offseta = (int) triestart[1];
155 offsetc = (int) triestart[2];
156 offsetg = (int) triestart[3];
157 offsett = (int) triestart[4];
158 #endif
159
160 if (offseta > 0) {
161 Splicetrie_dump(&(triestart[-offseta]),splicesites,splicetype,splicefrags_ref);
162 }
163 if (offsetc > 0) {
164 Splicetrie_dump(&(triestart[-offsetc]),splicesites,splicetype,splicefrags_ref);
165 }
166 if (offsetg > 0) {
167 Splicetrie_dump(&(triestart[-offsetg]),splicesites,splicetype,splicefrags_ref);
168 }
169 if (offsett > 0) {
170 Splicetrie_dump(&(triestart[-offsett]),splicesites,splicetype,splicefrags_ref);
171 }
172 }
173
174 return;
175 }
176
177
178 /* 87654321 */
179 #define UINT4_LEFT_A 0x00000000
180 #define UINT4_LEFT_C 0x40000000
181 #define UINT4_LEFT_G 0x80000000
182 #define UINT4_LEFT_T 0xC0000000
183 #define UINT4_HIGH2 0xC0000000
184
185
186 /* Puts leftmost character into lowest bits */
187 /* For right splicestrings, we want the leftmost character in the highest bits */
188
189 static Genomecomp_T
compress16(bool * saw_n_p,char * buffer)190 compress16 (bool *saw_n_p, char *buffer) {
191 Genomecomp_T low = 0U;
192 int c;
193 int i;
194
195 /* *saw_n_p = false; -- Want to check both ref and alt, so rely on caller to set */
196 for (i = 0; i < 16; i++) {
197 c = buffer[i];
198 low >>= 2;
199 switch (c) {
200 case 'A': break;
201 case 'C': low |= UINT4_LEFT_C; break;
202 case 'G': low |= UINT4_LEFT_G; break;
203 case 'T': low |= UINT4_LEFT_T; break;
204 default: *saw_n_p = true; break;
205 }
206 }
207
208 return low;
209 }
210
211
212 static Genomecomp_T
uint4_reverse(Genomecomp_T forward)213 uint4_reverse (Genomecomp_T forward) {
214 Genomecomp_T reverse = 0U;
215 int c;
216 int i;
217
218 for (i = 0; i < 16; i++) {
219 c = forward & 0x03;
220 reverse <<= 2;
221 reverse |= c;
222 forward >>= 2;
223 }
224
225 return reverse;
226 }
227
228
229
230 /* 87654321 */
231 #define LEFT_SET 0x80000000
232 #define LEFT_CLEAR 0x00000000
233
234 /* Puts the odd bits in the top 16 bits, and the even bits in the
235 bottom 16 bits. Needed to match genomebits format, although that
236 has 32-bit blocks instead of 16 bits. */
237 static Genomecomp_T
unshuffle16(bool * saw_n_p,char * buffer)238 unshuffle16 (bool *saw_n_p, char *buffer) {
239 Genomecomp_T bits = 0U;
240 int c;
241 int i;
242
243 /* *saw_n_p = false; -- Want to check both ref and alt, so rely on caller to set */
244 /* Low bits */
245 for (i = 0; i < 16; i++) {
246 c = buffer[i];
247 bits >>= 1;
248 switch (c) {
249 case 'A': /* bits |= LEFT_CLEAR; */ break;
250 case 'C': bits |= LEFT_SET; break;
251 case 'G': /* bits |= LEFT_CLEAR; */ break;
252 case 'T': bits |= LEFT_SET; break;
253 default: *saw_n_p = true; break;
254 }
255 }
256
257 /* High bits */
258 for (i = 0; i < 16; i++) {
259 c = buffer[i];
260 bits >>= 1;
261 switch (c) {
262 case 'A': /* bits |= LEFT_CLEAR; */ break;
263 case 'C': /* bits |= LEFT_CLEAR; */ break;
264 case 'G': bits |= LEFT_SET; break;
265 case 'T': bits |= LEFT_SET; break;
266 default: *saw_n_p = true; break;
267 }
268 }
269
270 return bits;
271 }
272
273
274 #if 0
275 /* Replaced by procedures in splicestringpool.c */
276 static void
277 Splicestring_free (Splicestring_T *old) {
278 FREE(*old);
279 return;
280 }
281
282 void
283 Splicestring_gc (List_T *splicestrings, int nsplicesites) {
284 int i;
285 List_T list, p;
286 Splicestring_T splicestring;
287
288 for (i = 0; i < nsplicesites; i++) {
289 list = splicestrings[i];
290 for (p = list; p != NULL; p = List_next(p)) {
291 splicestring = (Splicestring_T) List_head(p);
292 Splicestring_free(&splicestring);
293 }
294 List_free(&list);
295 }
296 FREE(splicestrings);
297 return;
298 }
299
300 static Splicestring_T
301 Splicestring_new (Genomecomp_T string, Genomecomp_T splicesite, int splicesite_i) {
302 Splicestring_T new = (Splicestring_T) MALLOC(sizeof(*new));
303
304 new->string = string;
305 new->splicesite = splicesite;
306 new->splicesite_i = (Genomecomp_T) splicesite_i;
307 return new;
308 }
309 #endif
310
311 static int
Splicestring_cmp(const void * a,const void * b)312 Splicestring_cmp (const void *a, const void *b) {
313 Splicestring_T x = * (Splicestring_T *) a;
314 Splicestring_T y = * (Splicestring_T *) b;
315
316 #ifdef USE_STRINGS
317 return strcmp(x->string,y->string);
318 #else
319 if (x->string < y->string) {
320 return -1;
321 } else if (x->string > y->string) {
322 return +1;
323 } else {
324 return 0;
325 }
326 #endif
327 }
328
329
330 static List_T
allelic_combinations(Genomecomp_T refstring,Genomecomp_T altstring,Genomecomp_T splicesite,int splicesite_i,Splicestringpool_T splicestringpool)331 allelic_combinations (Genomecomp_T refstring, Genomecomp_T altstring, Genomecomp_T splicesite, int splicesite_i,
332 Splicestringpool_T splicestringpool) {
333 List_T splicestrings = NULL;
334 Uintlist_T combinations, newcombinations, temp, p;
335 int refc, altc;
336 int i;
337
338 combinations = Uintlist_push(NULL,0U);
339 for (i = 0; i < 16; i++) {
340 newcombinations = NULL;
341
342 refc = (refstring & UINT4_HIGH2) >> 30;
343 altc = (altstring & UINT4_HIGH2) >> 30;
344 refstring <<= 2;
345 altstring <<= 2;
346 if (refc != altc) {
347 for (p = combinations; p != NULL; p = Uintlist_next(p)) {
348 newcombinations = Uintlist_push(newcombinations,(Uintlist_head(p) << 2) | refc);
349 newcombinations = Uintlist_push(newcombinations,(Uintlist_head(p) << 2) | altc);
350 }
351 } else {
352 for (p = combinations; p != NULL; p = Uintlist_next(p)) {
353 newcombinations = Uintlist_push(newcombinations,(Uintlist_head(p) << 2) | refc);
354 }
355 }
356
357 temp = combinations;
358 combinations = Uintlist_reverse(newcombinations);
359 Uintlist_free(&temp);
360 }
361
362 for (p = combinations; p != NULL; p = Uintlist_next(p)) {
363 splicestrings = Splicestringpool_push(splicestrings,splicestringpool,
364 Uintlist_head(p),splicesite,splicesite_i);
365 }
366
367 Uintlist_free(&combinations);
368 return splicestrings;
369 }
370
371
372
373
374
375
376 /* If distances are provided, splicedists are the observed distances */
377 Univcoord_T *
Splicetrie_retrieve_via_splicesites(bool * distances_observed_p,Genomecomp_T ** splicecomp,Splicetype_T ** splicetypes,Chrpos_T ** splicedists,List_T ** splicestrings,Genomecomp_T ** splicefrags_ref,Genomecomp_T ** splicefrags_alt,int * nsplicesites,IIT_T splicing_iit,int * splicing_divint_crosstable,int donor_typeint,int acceptor_typeint,Univ_IIT_T chromosome_iit,Genome_T genome,Genome_T genomealt,Chrpos_T shortsplicedist,Splicestringpool_T splicestringpool)378 Splicetrie_retrieve_via_splicesites (bool *distances_observed_p,
379 #ifdef GSNAP
380 Genomecomp_T **splicecomp,
381 #endif
382 Splicetype_T **splicetypes, Chrpos_T **splicedists,
383 List_T **splicestrings, Genomecomp_T **splicefrags_ref, Genomecomp_T **splicefrags_alt,
384 int *nsplicesites, IIT_T splicing_iit, int *splicing_divint_crosstable,
385 int donor_typeint, int acceptor_typeint, Univ_IIT_T chromosome_iit,
386 Genome_T genome, Genome_T genomealt, Chrpos_T shortsplicedist,
387 Splicestringpool_T splicestringpool) {
388 Univcoord_T *splicesites, chroffset, chrhigh, position;
389 Chrpos_T chrlength, chrpos;
390 Univcoord_T last_donor, last_antidonor, last_acceptor, last_antiacceptor;
391 int last_donor_k, last_antidonor_k, last_acceptor_k, last_antiacceptor_k;
392 int distance;
393 Genomecomp_T refstring, altstring;
394 int *splicesites1;
395 int divno, nsplicesites1, i, k;
396 Chrnum_T chrnum;
397 Interval_T *intervals, interval;
398 struct Interval_T *interval_structs;
399 char gbuffer_ref[17], gbuffer_alt[17], *chr;
400 char *restofheader;
401 /* char *annot; */
402 bool firstp = true, saw_n_p, allocp, alloc_header_p;
403 int ntoolong = 0;
404
405 #ifdef DEBUG2
406 List_T p;
407 Splicestring_T splicestring;
408 #endif
409
410 #ifdef GSNAP
411 int nblocks;
412 #endif
413
414 k = 0;
415 for (chrnum = 1; chrnum <= Univ_IIT_total_nintervals(chromosome_iit); chrnum++) {
416 if ((divno = splicing_divint_crosstable[chrnum]) > 0) {
417 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
418 /* chrlength = Univ_IIT_length(chromosome_iit,chrnum); */
419 splicesites1 = IIT_get_with_divno(&nsplicesites1,splicing_iit,divno,
420 0U,chrlength-1U,/*sortp*/false);
421 if (nsplicesites1 > 0) {
422 if (firstp == true) {
423 /* annot = */ IIT_annotation(&restofheader,splicing_iit,splicesites1[0],&alloc_header_p);
424 if (restofheader[0] == '\0') {
425 fprintf(stderr,"splice distances absent...");
426 *distances_observed_p = false;
427 } else if (sscanf(restofheader,"%d",&distance) < 1) {
428 fprintf(stderr,"splice distances absent...");
429 *distances_observed_p = false;
430 } else {
431 fprintf(stderr,"splice distances present...");
432 *distances_observed_p = true;
433 }
434 if (alloc_header_p == true) {
435 FREE(restofheader);
436 }
437 firstp = false;
438 }
439
440 intervals = (Interval_T *) CALLOC(nsplicesites1,sizeof(Interval_T));
441 for (i = 0; i < nsplicesites1; i++) {
442 intervals[i] = &(splicing_iit->intervals[divno][i]);
443 }
444 qsort(intervals,nsplicesites1,sizeof(Interval_T),Interval_cmp_low);
445
446 last_donor = last_antidonor = last_acceptor = last_antiacceptor = 0U;
447 for (i = 0; i < nsplicesites1; i++) {
448 interval = intervals[i];
449 chrpos = Interval_low(interval);
450 position = chrpos + chroffset;
451
452 if (position >= chrhigh) {
453 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
454 fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u. Discarding...",
455 chr,chrpos,chrlength);
456 if (allocp) FREE(chr);
457
458 } else if (Interval_type(interval) == donor_typeint) {
459 if (Interval_sign(interval) > 0) {
460 if (position != last_donor) {
461 last_donor = position;
462 k++;
463 }
464 } else {
465 if (position != last_antidonor) {
466 last_antidonor = position;
467 k++;
468 }
469 }
470 } else if (Interval_type(interval) == acceptor_typeint) {
471 if (Interval_sign(interval) > 0) {
472 if (position != last_acceptor) {
473 last_acceptor = position;
474 k++;
475 }
476 } else {
477 if (position != last_antiacceptor) {
478 last_antiacceptor = position;
479 k++;
480 }
481 }
482 }
483 }
484 FREE(intervals);
485 FREE(splicesites1);
486 }
487 }
488 }
489
490 *nsplicesites = k;
491 debug(printf("total unique splicesites: %d\n",*nsplicesites));
492 fprintf(stderr,"%d unique splicesites...",*nsplicesites);
493
494 /* The above procedure determines number of unique splicesites */
495
496 if (*nsplicesites == 0) {
497 #ifdef GSNAP
498 *splicecomp = (Genomecomp_T *) NULL;
499 #endif
500 splicesites = (Univcoord_T *) NULL;
501 *splicetypes = (Splicetype_T *) NULL;
502 *splicedists = (Chrpos_T *) NULL;
503 *splicestrings = (List_T *) NULL;
504 *splicefrags_ref = (Genomecomp_T *) NULL;
505 *splicefrags_alt = (Genomecomp_T *) NULL;
506 return splicesites;
507 }
508
509 #ifdef GSNAP
510 nblocks = (Genome_totallength(genome)+31)/32U;
511 *splicecomp = (Genomecomp_T *) CALLOC(nblocks,sizeof(Genomecomp_T));
512 #endif
513 splicesites = (Univcoord_T *) CALLOC((*nsplicesites) + 1,sizeof(Univcoord_T));
514 *splicetypes = (Splicetype_T *) CALLOC(*nsplicesites,sizeof(Splicetype_T));
515 *splicedists = (Chrpos_T *) CALLOC(*nsplicesites,sizeof(Chrpos_T));
516 *splicestrings = (List_T *) CALLOC(*nsplicesites,sizeof(List_T));
517 *splicefrags_ref = (Genomecomp_T *) CALLOC(*nsplicesites,sizeof(Genomecomp_T));
518 if (genomealt == NULL) {
519 *splicefrags_alt = *splicefrags_ref;
520 } else {
521 *splicefrags_alt = (Genomecomp_T *) CALLOC(*nsplicesites,sizeof(Genomecomp_T));
522 }
523
524 /* Use interval_structs instead of intervals, because we want to
525 copy information and avoid creating many Interval_T objects */
526
527 k = 0;
528 for (chrnum = 1; chrnum <= Univ_IIT_total_nintervals(chromosome_iit); chrnum++) {
529 if ((divno = splicing_divint_crosstable[chrnum]) > 0) {
530 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
531 /* chrlength = Univ_IIT_length(chromosome_iit,chrnum); */
532 splicesites1 = IIT_get_with_divno(&nsplicesites1,splicing_iit,divno,
533 0U,chrlength-1U,/*sortp*/false);
534 if (nsplicesites1 > 0) {
535 interval_structs = (struct Interval_T *) CALLOC(nsplicesites1,sizeof(struct Interval_T));
536 for (i = 0; i < nsplicesites1; i++) {
537 /* intervals[i] = &(splicing_iit->intervals[divno][i]); */
538 /* Copy so we can store distance information in Interval_high */
539 Interval_copy_existing(&(interval_structs[i]),&(splicing_iit->intervals[divno][i]));
540 if (*distances_observed_p == false) {
541 /* No, want to have essentially zero distance */
542 /* Interval_store_length(intervals[i],shortsplicedist); */
543 } else {
544 /* annot = */ IIT_annotation(&restofheader,splicing_iit,splicesites1[i],&alloc_header_p);
545 if (sscanf(restofheader,"%d",&distance) != 1) {
546 fprintf(stderr,"splicesites file missing distance in entry %s...exiting\n",
547 IIT_label(splicing_iit,splicesites1[i],&allocp));
548 exit(9);
549 } else if (distance < 0) {
550 fprintf(stderr,"splicesites file has a negative distance %d in entry %s...exiting\n",
551 distance,IIT_label(splicing_iit,splicesites1[i],&allocp));
552 exit(9);
553 } else if (distance > (int) shortsplicedist) {
554 ntoolong++;
555 Interval_store_length(&(interval_structs[i]),distance + SPLICEDIST_EXTRA); /* Previously stored shortsplicedist */
556 } else {
557 Interval_store_length(&(interval_structs[i]),distance + SPLICEDIST_EXTRA);
558 }
559 if (alloc_header_p == true) {
560 FREE(restofheader);
561 }
562 }
563 }
564
565 qsort(interval_structs,nsplicesites1,sizeof(struct Interval_T),Interval_cmp_low_struct);
566
567 last_donor = last_antidonor = last_acceptor = last_antiacceptor = 0U;
568 for (i = 0; i < nsplicesites1; i++) {
569 interval = &(interval_structs[i]);
570 chrpos = Interval_low(interval);
571 position = chrpos + chroffset;
572
573 if (position >= chrhigh) {
574 #if 0
575 /* Warning given previously */
576 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
577 fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u. Discarding...",
578 chr,chrpos,chrlength);
579 if (allocp) FREE(chr);
580 #endif
581
582 } else if (Interval_type(interval) == donor_typeint) {
583 if (Interval_sign(interval) > 0) {
584 if (position == last_donor) {
585 if (Interval_length(interval) > (*splicedists)[last_donor_k]) {
586 (*splicedists)[last_donor_k] = Interval_length(interval);
587 }
588
589 } else {
590 last_donor_k = k;
591 last_donor = splicesites[k] = position;
592 #ifdef GSNAP
593 assert(position/32U < (unsigned int) nblocks);
594 (*splicecomp)[position/32U] |= (1 << (position % 32));
595 #endif
596 (*splicetypes)[k] = DONOR;
597 (*splicedists)[k] = Interval_length(interval);
598
599 saw_n_p = false;
600 Genome_fill_buffer_simple(genome,position-16,16,gbuffer_ref);
601 refstring = compress16(&saw_n_p,gbuffer_ref);
602 (*splicefrags_ref)[k] = unshuffle16(&saw_n_p,gbuffer_ref);
603 if (genomealt) {
604 Genome_fill_buffer_simple_alt(genome,genomealt,position-16,16,gbuffer_alt);
605 altstring = compress16(&saw_n_p,gbuffer_alt);
606 (*splicefrags_alt)[k] = unshuffle16(&saw_n_p,gbuffer_alt);
607 }
608
609 if (saw_n_p == true) {
610 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
611 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
612 chr,chrpos);
613 if (allocp) FREE(chr);
614
615 } else if (genomealt) {
616 (*splicestrings)[k] = allelic_combinations(refstring,altstring,position,k,splicestringpool);
617 k++;
618 } else {
619 (*splicestrings)[k] = Splicestringpool_push(NULL,splicestringpool,refstring,position,k);
620 k++;
621 }
622 }
623
624 } else {
625 if (position == last_antidonor) {
626 if (Interval_length(interval) > (*splicedists)[last_antidonor_k]) {
627 (*splicedists)[last_antidonor_k] = Interval_length(interval);
628 }
629
630 } else {
631 last_antidonor_k = k;
632 last_antidonor = splicesites[k] = position;
633 #ifdef GSNAP
634 assert(position/32U < (unsigned int) nblocks);
635 (*splicecomp)[position/32U] |= (1 << (position % 32));
636 #endif
637 (*splicetypes)[k] = ANTIDONOR;
638 (*splicedists)[k] = Interval_length(interval);
639
640 saw_n_p = false;
641 Genome_fill_buffer_simple(genome,position,16,gbuffer_ref);
642 refstring = uint4_reverse(compress16(&saw_n_p,gbuffer_ref));
643 (*splicefrags_ref)[k] = unshuffle16(&saw_n_p,gbuffer_ref);
644 if (genomealt) {
645 Genome_fill_buffer_simple_alt(genome,genomealt,position,16,gbuffer_alt);
646 altstring = uint4_reverse(compress16(&saw_n_p,gbuffer_alt));
647 (*splicefrags_alt)[k] = unshuffle16(&saw_n_p,gbuffer_alt);
648 }
649
650 if (saw_n_p == true) {
651 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
652 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
653 chr,chrpos);
654 if (allocp) FREE(chr);
655 } else if (genomealt) {
656 (*splicestrings)[k] = allelic_combinations(refstring,altstring,position,k,splicestringpool);
657 k++;
658 } else {
659 (*splicestrings)[k] = Splicestringpool_push(NULL,splicestringpool,refstring,position,k);
660 k++;
661 }
662 }
663 }
664
665 } else if (Interval_type(interval) == acceptor_typeint) {
666 if (Interval_sign(interval) > 0) {
667 if (position == last_acceptor) {
668 if (Interval_length(interval) > (*splicedists)[last_acceptor_k]) {
669 (*splicedists)[last_acceptor_k] = Interval_length(interval);
670 }
671
672 } else {
673 last_acceptor_k = k;
674 last_acceptor = splicesites[k] = position;
675 #ifdef GSNAP
676 assert(position/32U < (unsigned int) nblocks);
677 (*splicecomp)[position/32U] |= (1 << (position % 32));
678 #endif
679 (*splicetypes)[k] = ACCEPTOR;
680 (*splicedists)[k] = Interval_length(interval);
681
682 saw_n_p = false;
683 Genome_fill_buffer_simple(genome,position,16,gbuffer_ref);
684 refstring = uint4_reverse(compress16(&saw_n_p,gbuffer_ref));
685 (*splicefrags_ref)[k] = unshuffle16(&saw_n_p,gbuffer_ref);
686 if (genomealt) {
687 Genome_fill_buffer_simple_alt(genome,genomealt,position,16,gbuffer_alt);
688 altstring = uint4_reverse(compress16(&saw_n_p,gbuffer_alt));
689 (*splicefrags_alt)[k] = unshuffle16(&saw_n_p,gbuffer_alt);
690 }
691
692 if (saw_n_p == true) {
693 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
694 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
695 chr,chrpos);
696 if (allocp) FREE(chr);
697 } else if (genomealt) {
698 (*splicestrings)[k] = allelic_combinations(refstring,altstring,position,k,splicestringpool);
699 k++;
700 } else {
701 (*splicestrings)[k] = Splicestringpool_push(NULL,splicestringpool,refstring,position,k);
702 k++;
703 }
704 }
705
706 } else {
707 if (position == last_antiacceptor) {
708 if (Interval_length(interval) > (*splicedists)[last_antiacceptor_k]) {
709 (*splicedists)[last_antiacceptor_k] = Interval_length(interval);
710 }
711
712 } else {
713 last_antiacceptor_k = k;
714 last_antiacceptor = splicesites[k] = position;
715 #ifdef GSNAP
716 assert(position/32U < (unsigned int) nblocks);
717 (*splicecomp)[position/32U] |= (1 << (position % 32));
718 #endif
719 (*splicetypes)[k] = ANTIACCEPTOR;
720 (*splicedists)[k] = Interval_length(interval);
721
722 saw_n_p = false;
723 Genome_fill_buffer_simple(genome,position-16,16,gbuffer_ref);
724 refstring = compress16(&saw_n_p,gbuffer_ref);
725 (*splicefrags_ref)[k] = unshuffle16(&saw_n_p,gbuffer_ref);
726 if (genomealt) {
727 Genome_fill_buffer_simple_alt(genome,genomealt,position-16,16,gbuffer_alt);
728 altstring = compress16(&saw_n_p,gbuffer_alt);
729 (*splicefrags_alt)[k] = unshuffle16(&saw_n_p,gbuffer_alt);
730 }
731
732 if (saw_n_p == true) {
733 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
734 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
735 chr,chrpos);
736 if (allocp) FREE(chr);
737 } else if (genomealt) {
738 (*splicestrings)[k] = allelic_combinations(refstring,altstring,position,k,splicestringpool);
739 k++;
740 } else {
741 (*splicestrings)[k] = Splicestringpool_push(NULL,splicestringpool,refstring,position,k);
742 k++;
743 }
744 }
745
746 }
747 }
748 }
749
750 FREE(interval_structs);
751 FREE(splicesites1);
752 }
753 }
754 }
755
756 *nsplicesites = k;
757 splicesites[*nsplicesites] = (Univcoord_T) -1; /* Marker for comparison in identify_all_segments */
758 fprintf(stderr,"\n%d splicesites are valid...",*nsplicesites);
759
760 #ifdef DEBUG2
761 for (k = 0; k < *nsplicesites; k++) {
762 printf("%d: %u %s %08X\n",k,splicesites[k],Splicetype_string((*splicetypes)[k]),(*splicefrags_ref)[k]);
763 for (p = (*splicestrings)[k]; p != NULL; p = List_next(p)) {
764 splicestring = (Splicestring_T) List_head(p);
765 printf(" %u %u %u\n",splicestring->string,splicestring->splicesite,splicestring->splicesite_i);
766 }
767 }
768 #endif
769
770 if (ntoolong > 0) {
771 fprintf(stderr,"%d entries with distance > %d specified for local splice distance...",ntoolong,shortsplicedist);
772 }
773
774 return splicesites;
775 }
776
777
778 struct Cell_T {
779 int k; /* original k */
780 Univcoord_T pos; /* splicesite pos */
781 };
782
783 static int
Cell_position_cmp(const void * a,const void * b)784 Cell_position_cmp (const void *a, const void *b) {
785 struct Cell_T x = * (struct Cell_T *) a;
786 struct Cell_T y = * (struct Cell_T *) b;
787
788 if (x.pos < y.pos) {
789 return -1;
790 } else if (y.pos < x.pos) {
791 return +1;
792 } else {
793 return 0;
794 }
795 }
796
797
798 Univcoord_T *
Splicetrie_retrieve_via_introns(Genomecomp_T ** splicecomp,Splicetype_T ** splicetypes,Chrpos_T ** splicedists,List_T ** splicestrings,Genomecomp_T ** splicefrags_ref,Genomecomp_T ** splicefrags_alt,int * nsplicesites,IIT_T splicing_iit,int * splicing_divint_crosstable,Univ_IIT_T chromosome_iit,Genome_T genome,Genome_T genomealt,Splicestringpool_T splicestringpool)799 Splicetrie_retrieve_via_introns (
800 #ifdef GSNAP
801 Genomecomp_T **splicecomp,
802 #endif
803 Splicetype_T **splicetypes, Chrpos_T **splicedists,
804 List_T **splicestrings, Genomecomp_T **splicefrags_ref, Genomecomp_T **splicefrags_alt,
805 int *nsplicesites, IIT_T splicing_iit, int *splicing_divint_crosstable,
806 Univ_IIT_T chromosome_iit, Genome_T genome, Genome_T genomealt,
807 Splicestringpool_T splicestringpool) {
808 Univcoord_T *splicesites, chroffset, chrhigh, position;
809 Chrpos_T chrlength, chrpos;
810 Univcoord_T last_donor, last_antidonor, last_acceptor, last_antiacceptor;
811 int last_donor_k, last_antidonor_k, last_acceptor_k, last_antiacceptor_k;
812 Genomecomp_T refstring, altstring;
813 int *introns1;
814 int divno, nintrons1, i, k, j;
815 Chrnum_T chrnum;
816 Interval_T *intervals, interval;
817 char gbuffer_ref[17], gbuffer_alt[17], *chr;
818 bool saw_n_p, allocp;
819
820 struct Cell_T *cells;
821 Univcoord_T *temp_splicesites;
822 Splicetype_T *temp_splicetypes;
823 Chrpos_T *temp_splicedists;
824 List_T *temp_splicestrings, p;
825 UINT4 *temp_splicefrags_ref, *temp_splicefrags_alt;
826 Splicestring_T splicestring;
827
828 #ifdef GSNAP
829 int nblocks;
830 #endif
831
832 k = 0;
833 for (chrnum = 1; chrnum <= Univ_IIT_total_nintervals(chromosome_iit); chrnum++) {
834 if ((divno = splicing_divint_crosstable[chrnum]) > 0) {
835 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
836 /* chrlength = Univ_IIT_length(chromosome_iit,chrnum); */
837 introns1 = IIT_get_with_divno(&nintrons1,splicing_iit,divno,0U,chrlength-1U,/*sortp*/false);
838 if (nintrons1 > 0) {
839 intervals = (Interval_T *) CALLOC(nintrons1,sizeof(Interval_T));
840 for (i = 0; i < nintrons1; i++) {
841 intervals[i] = &(splicing_iit->intervals[divno][i]);
842 }
843
844 qsort(intervals,nintrons1,sizeof(Interval_T),Interval_cmp_low);
845 last_donor = last_antiacceptor = 0U;
846 for (i = 0; i < nintrons1; i++) {
847 interval = intervals[i];
848 chrpos = Interval_low(interval);
849 position = chrpos + chroffset;
850
851 if (position >= chrhigh) {
852 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
853 fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u. Discarding...",
854 chr,chrpos,chrlength);
855 if (allocp) FREE(chr);
856
857 } else if (Interval_sign(interval) > 0) {
858 if (position != last_donor) {
859 last_donor = position;
860 k++;
861 }
862 } else {
863 if (position != last_antiacceptor) {
864 last_antiacceptor = position;
865 k++;
866 }
867 }
868 }
869
870 qsort(intervals,nintrons1,sizeof(Interval_T),Interval_cmp_high);
871 last_acceptor = last_antidonor = 0U;
872 for (i = 0; i < nintrons1; i++) {
873 interval = intervals[i];
874 chrpos = Interval_high(interval) - INTRON_HIGH_TO_LOW;
875 position = chrpos + chroffset;
876
877 if (position >= chrhigh) {
878 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
879 fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u. Discarding...",
880 chr,chrpos,chrlength);
881 if (allocp) FREE(chr);
882
883 } else if (Interval_sign(interval) > 0) {
884 if (position != last_acceptor) {
885 last_acceptor = position;
886 k++;
887 }
888 } else {
889 if (position != last_antidonor) {
890 last_antidonor = position;
891 k++;
892 }
893 }
894 }
895
896 FREE(intervals);
897 FREE(introns1);
898 }
899 }
900 }
901
902 *nsplicesites = k;
903 debug(printf("total unique splicesites: %d\n",*nsplicesites));
904 fprintf(stderr,"%d unique splicesites...",*nsplicesites);
905
906 /* The above procedure determines number of unique splicesites */
907
908 if (*nsplicesites == 0) {
909 #ifdef GSNAP
910 *splicecomp = (Genomecomp_T *) NULL;
911 #endif
912 splicesites = (Univcoord_T *) NULL;
913 *splicetypes = (Splicetype_T *) NULL;
914 *splicedists = (Chrpos_T *) NULL;
915 *splicestrings = (List_T *) NULL;
916 *splicefrags_ref = (Genomecomp_T *) NULL;
917 *splicefrags_alt = (Genomecomp_T *) NULL;
918
919 return splicesites;
920 }
921
922 #ifdef GSNAP
923 nblocks = (Genome_totallength(genome)+31)/32U;
924 *splicecomp = (Genomecomp_T *) CALLOC(nblocks,sizeof(Genomecomp_T));
925 #endif
926 splicesites = (Univcoord_T *) CALLOC((*nsplicesites) + 1,sizeof(Univcoord_T));
927 *splicetypes = (Splicetype_T *) CALLOC(*nsplicesites,sizeof(Splicetype_T));
928 *splicedists = (Chrpos_T *) CALLOC(*nsplicesites,sizeof(Chrpos_T));
929 *splicestrings = (List_T *) CALLOC(*nsplicesites,sizeof(List_T));
930 *splicefrags_ref = (Genomecomp_T *) CALLOC(*nsplicesites,sizeof(Genomecomp_T));
931 if (genomealt == NULL) {
932 *splicefrags_alt = *splicefrags_ref;
933 } else {
934 *splicefrags_alt = (Genomecomp_T *) CALLOC(*nsplicesites,sizeof(Genomecomp_T));
935 }
936
937
938 k = 0;
939 for (chrnum = 1; chrnum <= Univ_IIT_total_nintervals(chromosome_iit); chrnum++) {
940 if ((divno = splicing_divint_crosstable[chrnum]) > 0) {
941 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
942 /* chrlength = Univ_IIT_length(chromosome_iit,chrnum); */
943 introns1 = IIT_get_with_divno(&nintrons1,splicing_iit,divno,0U,chrlength-1U,/*sortp*/false);
944 if (nintrons1 > 0) {
945 intervals = (Interval_T *) CALLOC(nintrons1,sizeof(Interval_T));
946 for (i = 0; i < nintrons1; i++) {
947 intervals[i] = &(splicing_iit->intervals[divno][i]);
948 }
949
950 qsort(intervals,nintrons1,sizeof(Interval_T),Interval_cmp_low);
951 last_donor = last_antiacceptor = 0U;
952 for (i = 0; i < nintrons1; i++) {
953 interval = intervals[i];
954 chrpos = Interval_low(interval);
955 position = chrpos + chroffset;
956
957 if (position >= chrhigh) {
958 #if 0
959 /* Warning given previously */
960 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
961 fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u. Discarding...",
962 chr,chrpos,chrlength);
963 if (allocp) FREE(chr);
964 #endif
965
966 } else if (Interval_sign(interval) > 0) {
967 if (position == last_donor) {
968 if (Interval_length(interval) > (*splicedists)[last_donor_k]) {
969 (*splicedists)[last_donor_k] = Interval_length(interval);
970 }
971
972 } else {
973 last_donor_k = k;
974 last_donor = splicesites[k] = position;
975 #ifdef GSNAP
976 assert(position/32U < (unsigned int) nblocks);
977 (*splicecomp)[position/32U] |= (1 << (position % 32));
978 #endif
979 (*splicetypes)[k] = DONOR;
980 (*splicedists)[k] = Interval_length(interval);
981
982 saw_n_p = false;
983 Genome_fill_buffer_simple(genome,position-16,16,gbuffer_ref);
984 refstring = compress16(&saw_n_p,gbuffer_ref);
985 (*splicefrags_ref)[k] = unshuffle16(&saw_n_p,gbuffer_ref);
986 if (genomealt) {
987 Genome_fill_buffer_simple_alt(genome,genomealt,position-16,16,gbuffer_alt);
988 altstring = compress16(&saw_n_p,gbuffer_alt);
989 (*splicefrags_alt)[k] = unshuffle16(&saw_n_p,gbuffer_alt);
990 }
991
992 if (saw_n_p == true) {
993 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
994 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
995 chr,chrpos);
996 if (allocp) FREE(chr);
997 } else if (genomealt) {
998 (*splicestrings)[k] = allelic_combinations(refstring,altstring,position,k,splicestringpool);
999 k++;
1000 } else {
1001 (*splicestrings)[k] = Splicestringpool_push(NULL,splicestringpool,refstring,position,k);
1002 k++;
1003 }
1004 }
1005
1006 } else {
1007 if (position == last_antiacceptor) {
1008 if (Interval_length(interval) > (*splicedists)[last_antiacceptor_k]) {
1009 (*splicedists)[last_antiacceptor_k] = Interval_length(interval);
1010 }
1011
1012 } else {
1013 last_antiacceptor_k = k;
1014 last_antiacceptor = splicesites[k] = position;
1015 #ifdef GSNAP
1016 assert(position/32U < (unsigned int) nblocks);
1017 (*splicecomp)[position/32U] |= (1 << (position % 32));
1018 #endif
1019 (*splicetypes)[k] = ANTIACCEPTOR;
1020 (*splicedists)[k] = Interval_length(interval);
1021
1022 saw_n_p = false;
1023 Genome_fill_buffer_simple(genome,position-16,16,gbuffer_ref);
1024 refstring = compress16(&saw_n_p,gbuffer_ref);
1025 (*splicefrags_ref)[k] = unshuffle16(&saw_n_p,gbuffer_ref);
1026 if (genomealt) {
1027 Genome_fill_buffer_simple_alt(genome,genomealt,position-16,16,gbuffer_alt);
1028 altstring = compress16(&saw_n_p,gbuffer_alt);
1029 (*splicefrags_alt)[k] = unshuffle16(&saw_n_p,gbuffer_alt);
1030 }
1031
1032 if (saw_n_p == true) {
1033 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
1034 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
1035 chr,chrpos);
1036 if (allocp) FREE(chr);
1037 } else if (genomealt) {
1038 (*splicestrings)[k] = allelic_combinations(refstring,altstring,position,k,splicestringpool);
1039 k++;
1040 } else {
1041 (*splicestrings)[k] = Splicestringpool_push(NULL,splicestringpool,refstring,position,k);
1042 k++;
1043 }
1044 }
1045
1046 }
1047 }
1048
1049 qsort(intervals,nintrons1,sizeof(Interval_T),Interval_cmp_high);
1050 last_acceptor = last_antidonor = 0U;
1051 for (i = 0; i < nintrons1; i++) {
1052 interval = intervals[i];
1053 chrpos = Interval_high(interval) - INTRON_HIGH_TO_LOW;
1054 position = chrpos + chroffset;
1055
1056 if (position >= chrhigh) {
1057 #if 0
1058 /* Warning given previously */
1059 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
1060 fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u. Discarding...",
1061 chr,chrpos,chrlength);
1062 if (allocp) FREE(chr);
1063 #endif
1064
1065 } else if (Interval_sign(interval) > 0) {
1066 if (position == last_acceptor) {
1067 if (Interval_length(interval) > (*splicedists)[last_acceptor_k]) {
1068 (*splicedists)[last_acceptor_k] = Interval_length(interval);
1069 }
1070
1071 } else {
1072 last_acceptor_k = k;
1073 last_acceptor = splicesites[k] = position;
1074 #ifdef GSNAP
1075 assert(position/32U < (unsigned int) nblocks);
1076 (*splicecomp)[position/32U] |= (1 << (position % 32));
1077 #endif
1078 (*splicetypes)[k] = ACCEPTOR;
1079 (*splicedists)[k] = Interval_length(interval);
1080
1081 saw_n_p = false;
1082 Genome_fill_buffer_simple(genome,position,16,gbuffer_ref);
1083 refstring = uint4_reverse(compress16(&saw_n_p,gbuffer_ref));
1084 (*splicefrags_ref)[k] = unshuffle16(&saw_n_p,gbuffer_ref);
1085 if (genomealt) {
1086 Genome_fill_buffer_simple_alt(genome,genomealt,position,16,gbuffer_alt);
1087 altstring = uint4_reverse(compress16(&saw_n_p,gbuffer_alt));
1088 (*splicefrags_alt)[k] = unshuffle16(&saw_n_p,gbuffer_alt);
1089 }
1090
1091 if (saw_n_p == true) {
1092 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
1093 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
1094 chr,chrpos);
1095 if (allocp) FREE(chr);
1096 } else if (genomealt) {
1097 (*splicestrings)[k] = allelic_combinations(refstring,altstring,position,k,splicestringpool);
1098 k++;
1099 } else {
1100 (*splicestrings)[k] = Splicestringpool_push(NULL,splicestringpool,refstring,position,k);
1101 k++;
1102 }
1103 }
1104
1105 } else {
1106 if (position == last_antidonor) {
1107 if (Interval_length(interval) > (*splicedists)[last_antidonor_k]) {
1108 (*splicedists)[last_antidonor_k] = Interval_length(interval);
1109 }
1110
1111 } else {
1112 last_antidonor_k = k;
1113 last_antidonor = splicesites[k] = position;
1114 #ifdef GSNAP
1115 assert(position/32U < (unsigned int) nblocks);
1116 (*splicecomp)[position/32U] |= (1 << (position % 32));
1117 #endif
1118 (*splicetypes)[k] = ANTIDONOR;
1119 (*splicedists)[k] = Interval_length(interval);
1120
1121 saw_n_p = false;
1122 Genome_fill_buffer_simple(genome,position,16,gbuffer_ref);
1123 refstring = uint4_reverse(compress16(&saw_n_p,gbuffer_ref));
1124 (*splicefrags_ref)[k] = unshuffle16(&saw_n_p,gbuffer_ref);
1125 if (genomealt) {
1126 Genome_fill_buffer_simple_alt(genome,genomealt,position,16,gbuffer_alt);
1127 altstring = uint4_reverse(compress16(&saw_n_p,gbuffer_alt));
1128 (*splicefrags_alt)[k] = unshuffle16(&saw_n_p,gbuffer_alt);
1129 }
1130
1131 if (saw_n_p == true) {
1132 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
1133 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
1134 chr,chrpos);
1135 if (allocp) FREE(chr);
1136 } else if (genomealt) {
1137 (*splicestrings)[k] = allelic_combinations(refstring,altstring,position,k,splicestringpool);
1138 k++;
1139 } else {
1140 (*splicestrings)[k] = Splicestringpool_push(NULL,splicestringpool,refstring,position,k);
1141 k++;
1142 }
1143 }
1144 }
1145 }
1146
1147 FREE(intervals);
1148 FREE(introns1);
1149 }
1150 }
1151 }
1152
1153 *nsplicesites = k;
1154 fprintf(stderr,"\n%d splicesites are valid...",*nsplicesites);
1155
1156
1157 /* Need to sort by individual splicesites */
1158 cells = (struct Cell_T *) CALLOC(*nsplicesites,sizeof(struct Cell_T));
1159 for (k = 0; k < *nsplicesites; k++) {
1160 cells[k].k = k;
1161 cells[k].pos = splicesites[k];
1162 }
1163 qsort(cells,*nsplicesites,sizeof(struct Cell_T),Cell_position_cmp);
1164
1165
1166 /* Save unordered information */
1167 temp_splicesites = splicesites;
1168 temp_splicetypes = *splicetypes;
1169 temp_splicedists = *splicedists;
1170 temp_splicestrings = *splicestrings;
1171 temp_splicefrags_ref = *splicefrags_ref;
1172 if (genomealt == NULL) {
1173 temp_splicefrags_alt = temp_splicefrags_ref;
1174 } else {
1175 temp_splicefrags_alt = *splicefrags_alt;
1176 }
1177
1178 /* Allocate ordered information */
1179 splicesites = (Univcoord_T *) CALLOC((*nsplicesites) + 1,sizeof(Univcoord_T));
1180 *splicetypes = (Splicetype_T *) CALLOC(*nsplicesites,sizeof(Splicetype_T));
1181 *splicedists = (Chrpos_T *) CALLOC(*nsplicesites,sizeof(Chrpos_T));
1182 *splicestrings = (List_T *) CALLOC(*nsplicesites,sizeof(List_T));
1183 *splicefrags_ref = (UINT4 *) CALLOC(*nsplicesites,sizeof(UINT4));
1184 if (genomealt == NULL) {
1185 *splicefrags_alt = *splicefrags_ref;
1186 } else {
1187 *splicefrags_alt = (UINT4 *) CALLOC(*nsplicesites,sizeof(UINT4));
1188 }
1189
1190
1191 /* Order all information */
1192 for (j = 0; j < *nsplicesites; j++) {
1193 k = cells[j].k;
1194 splicesites[j] = temp_splicesites[k];
1195 (*splicetypes)[j] = temp_splicetypes[k];
1196 (*splicedists)[j] = temp_splicedists[k];
1197 (*splicestrings)[j] = temp_splicestrings[k];
1198 for (p = (*splicestrings)[j]; p != NULL; p = List_next(p)) {
1199 /* Fix reference back to splicesite */
1200 splicestring = (Splicestring_T) List_head(p);
1201 splicestring->splicesite_i = j;
1202 }
1203 (*splicefrags_ref)[j] = temp_splicefrags_ref[k];
1204 (*splicefrags_alt)[j] = temp_splicefrags_alt[k];
1205 }
1206
1207 splicesites[*nsplicesites] = (Univcoord_T) -1; /* Marker for comparison in identify_all_segments */
1208
1209 FREE(cells);
1210 FREE(temp_splicesites);
1211 FREE(temp_splicetypes);
1212 FREE(temp_splicedists);
1213 FREE(temp_splicestrings);
1214 FREE(temp_splicefrags_ref);
1215 if (genomealt != NULL) {
1216 FREE(temp_splicefrags_alt);
1217 }
1218
1219
1220 #ifdef DEBUG2
1221 for (k = 0; k < *nsplicesites; k++) {
1222 printf("%d: %u %s %08X\n",k,splicesites[k],Splicetype_string((*splicetypes)[k]),(*splicefrags_ref)[k]);
1223 for (p = (*splicestrings)[k]; p != NULL; p = List_next(p)) {
1224 splicestring = (Splicestring_T) List_head(p);
1225 printf(" %u %u %u\n",splicestring->string,splicestring->splicesite,splicestring->splicesite_i);
1226 }
1227 }
1228 #endif
1229
1230 return splicesites;
1231 }
1232
1233
1234 #if 0
1235 static bool
1236 get_exons (Uintlist_T *donors, Uintlist_T *acceptors, char *annot) {
1237 Chrpos_T exonstart, exonend;
1238 char *p;
1239
1240 /* Skip header */
1241 p = annot;
1242 while (*p != '\0' && *p != '\n') {
1243 p++;
1244 }
1245 if (*p == '\n') p++;
1246
1247 *donors = *acceptors = (Uintlist_T) NULL;
1248 while (*p != '\0') {
1249 if (sscanf(p,"%u %u",&exonstart,&exonend) != 2) {
1250 return false;
1251 } else {
1252 *donors = Uintlist_push(*donors,exonend);
1253 *acceptors = Uintlist_push(*acceptors,exonstart);
1254 }
1255
1256 /* Advance to next exon to see if this is the last one */
1257 while (*p != '\0' && *p != '\n') p++;
1258 if (*p == '\n') p++;
1259 }
1260
1261 if (*donors == NULL || *acceptors == NULL) {
1262 return false;
1263 } else {
1264 *donors = Uintlist_pop(*donors,&exonend);
1265 *donors = Uintlist_reverse(*donors);
1266
1267 *acceptors = Uintlist_reverse(*acceptors);
1268 *acceptors = Uintlist_pop(*acceptors,&exonstart);
1269
1270 return true;
1271 }
1272 }
1273 #endif
1274
1275
1276 /************************************************************************/
1277
1278
1279
1280 typedef struct Trie_T *Trie_T;
1281 struct Trie_T {
1282 Splicestring_T leaf;
1283
1284 int nsites;
1285 int na;
1286 int nc;
1287 int ng;
1288 int nt;
1289
1290 Trie_T triea;
1291 Trie_T triec;
1292 Trie_T trieg;
1293 Trie_T triet;
1294 };
1295
1296
1297 #if 0
1298 static void
1299 Trie_free (Trie_T *old) {
1300 if ((*old) == NULL) {
1301 return;
1302 } else if ((*old)->leaf != NULL) {
1303 FREE(*old);
1304 return;
1305 } else {
1306 Trie_free(&(*old)->triea);
1307 Trie_free(&(*old)->triec);
1308 Trie_free(&(*old)->trieg);
1309 Trie_free(&(*old)->triet);
1310 FREE(*old);
1311 return;
1312 }
1313 }
1314 #endif
1315
1316 /************************************************************************
1317 * Building splicetries
1318 ************************************************************************/
1319
1320 #if 0
1321 static Trie_T
1322 Trie_new (Splicestring_T *sites, int nsites, int charpos) {
1323 Trie_T trie;
1324 int i, ptr;
1325
1326 if (nsites == 0) {
1327 return (Trie_T) NULL;
1328
1329 } else {
1330
1331 if (nsites == 1) {
1332 trie = (Trie_T) MALLOC(sizeof(*trie));
1333 trie->nsites = 1;
1334 trie->na = trie->nc = trie->ng = trie->nt = 0;
1335 trie->triea = trie->triec = trie->trieg = trie->triet = (Trie_T) NULL;
1336 trie->leaf = sites[0];
1337 /* trie->remainder = &(sites[0]->string[charpos]); */
1338 return trie;
1339
1340 } else if (charpos >= 16) {
1341 return (Trie_T) NULL;
1342
1343 } else {
1344 trie = (Trie_T) MALLOC(sizeof(*trie));
1345 trie->nsites = nsites;
1346 trie->na = trie->nc = trie->ng = trie->nt = 0;
1347 trie->leaf = (Splicestring_T) NULL;
1348 /* trie->remainder = (char *) NULL; */
1349
1350 for (i = 0; i < nsites; i++) {
1351 switch ((sites[i]->string >> (30 - 2*charpos)) & 0x03) {
1352 case RIGHT_A: trie->na++; break;
1353 case RIGHT_C: trie->nc++; break;
1354 case RIGHT_G: trie->ng++; break;
1355 case RIGHT_T: trie->nt++; break;
1356 default: abort();
1357 }
1358 }
1359
1360 ptr = 0;
1361 trie->triea = Trie_new(&(sites[ptr]),trie->na,charpos+1);
1362 ptr += trie->na;
1363
1364 trie->triec = Trie_new(&(sites[ptr]),trie->nc,charpos+1);
1365 ptr += trie->nc;
1366
1367 trie->trieg = Trie_new(&(sites[ptr]),trie->ng,charpos+1);
1368 ptr += trie->ng;
1369
1370 trie->triet = Trie_new(&(sites[ptr]),trie->nt,charpos+1);
1371 /* ptr += trie->nt; */
1372
1373 return trie;
1374 }
1375 }
1376 }
1377 #endif
1378
1379
1380 /* Based on original Trie_output_new, which used repeated calls to Uintlist_push */
1381 static int
Trie_output_size(int size,Splicestring_T * sites,int nsites,int charpos)1382 Trie_output_size (int size, Splicestring_T *sites, int nsites, int charpos) {
1383 int i, k;
1384 /* unsigned int posa, posc, posg, post; */
1385 int na, nc, ng, nt;
1386
1387 if (nsites == 0) {
1388 debug0(printf("nsites == 0, so not adding anything\n"));
1389
1390 } else if (nsites == 1) {
1391 debug0(printf("nsites == 1, so adding 1\n"));
1392 size += 1;
1393
1394 } else if (charpos >= 16) {
1395 if (nsites > MAX_DUPLICATES) {
1396 fprintf(stderr,"Warning: Splicetrie exceeded max duplicates value of %d\n",MAX_DUPLICATES);
1397 } else {
1398 size += 1;
1399
1400 for (i = 0; i < nsites; i++) {
1401 debug0(printf(" %d",sites[i]->splicesite_i));
1402 size += 1;
1403 }
1404 debug0(printf("\n"));
1405 }
1406
1407 } else {
1408 na = nc = ng = nt = 0;
1409 for (i = 0; i < nsites; i++) {
1410 switch ((sites[i]->string >> (30 - 2*charpos)) & 0x03) {
1411 case RIGHT_A: na++; break;
1412 case RIGHT_C: nc++; break;
1413 case RIGHT_G: ng++; break;
1414 case RIGHT_T: nt++; break;
1415 default: abort();
1416 }
1417 }
1418 debug0(printf("%d A, %d C, %d G, %d T\n",na,nc,ng,nt));
1419
1420 k = 0;
1421 size = Trie_output_size(size,&(sites[k]),na,charpos+1);
1422
1423 k += na;
1424 size = Trie_output_size(size,&(sites[k]),nc,charpos+1);
1425
1426 k += nc;
1427 size = Trie_output_size(size,&(sites[k]),ng,charpos+1);
1428
1429 k += ng;
1430 size = Trie_output_size(size,&(sites[k]),nt,charpos+1);
1431 /* k += nt; */
1432
1433 size += 5;
1434 }
1435
1436 return size;
1437 }
1438
1439
1440
1441 #ifdef USE_LIST
1442 /* Combination of Trie_new and Trie_output */
1443 /* Note that when *ptr = NULL_POINTER, we do not need to advance
1444 nprinted, because no entry is made into triecontents_list */
1445 static Uintlist_T
Trie_output_list(unsigned int * ptr,int * nprinted,Uintlist_T triecontents_list,Splicestring_T * sites,int nsites,int charpos)1446 Trie_output_list (unsigned int *ptr, int *nprinted, Uintlist_T triecontents_list,
1447 Splicestring_T *sites, int nsites, int charpos) {
1448 int i, k;
1449 unsigned int posa, posc, posg, post;
1450 int na, nc, ng, nt;
1451
1452 if (nsites == 0) {
1453 debug0(printf("nsites == 0, so NULL\n"));
1454 *ptr = NULL_POINTER;
1455
1456 } else if (nsites == 1) {
1457 debug0(printf("nsites == 1, so pushing %d\n",sites[0]->splicesite_i));
1458 *ptr = *nprinted;
1459 triecontents_list = Uintlist_push(triecontents_list,sites[0]->splicesite_i);
1460 *nprinted += 1;
1461
1462 } else if (charpos >= 16) {
1463 if (nsites > MAX_DUPLICATES) {
1464 fprintf(stderr,"Warning: Splicetrie exceeded max duplicates value of %d\n",MAX_DUPLICATES);
1465 *ptr = NULL_POINTER;
1466 } else {
1467 *ptr = *nprinted;
1468
1469 triecontents_list = Uintlist_push(triecontents_list,(Triecontent_T) (-nsites));
1470 *nprinted += 1;
1471 for (i = 0; i < nsites; i++) {
1472 debug0(printf(" %d",sites[i]->splicesite_i));
1473 triecontents_list = Uintlist_push(triecontents_list,sites[i]->splicesite_i);
1474 *nprinted += 1;
1475 }
1476 debug0(printf("\n"));
1477 }
1478
1479 } else {
1480 na = nc = ng = nt = 0;
1481 for (i = 0; i < nsites; i++) {
1482 switch ((sites[i]->string >> (30 - 2*charpos)) & 0x03) {
1483 case RIGHT_A: na++; break;
1484 case RIGHT_C: nc++; break;
1485 case RIGHT_G: ng++; break;
1486 case RIGHT_T: nt++; break;
1487 default: abort();
1488 }
1489 }
1490 debug0(printf("%d A, %d C, %d G, %d T\n",na,nc,ng,nt));
1491
1492 k = 0;
1493 triecontents_list = Trie_output_list(&posa,&(*nprinted),triecontents_list,
1494 &(sites[k]),na,charpos+1);
1495 k += na;
1496 triecontents_list = Trie_output_list(&posc,&(*nprinted),triecontents_list,
1497 &(sites[k]),nc,charpos+1);
1498 k += nc;
1499 triecontents_list = Trie_output_list(&posg,&(*nprinted),triecontents_list,
1500 &(sites[k]),ng,charpos+1);
1501 k += ng;
1502 triecontents_list = Trie_output_list(&post,&(*nprinted),triecontents_list,
1503 &(sites[k]),nt,charpos+1);
1504 /* k += nt; */
1505
1506 *ptr = *nprinted;
1507 triecontents_list = Uintlist_push(triecontents_list,INTERNAL_NODE);
1508
1509 if (posa == NULL_POINTER) {
1510 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1511 } else {
1512 triecontents_list = Uintlist_push(triecontents_list,*ptr - posa);
1513 }
1514 if (posc == NULL_POINTER) {
1515 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1516 } else {
1517 triecontents_list = Uintlist_push(triecontents_list,*ptr - posc);
1518 }
1519 if (posg == NULL_POINTER) {
1520 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1521 } else {
1522 triecontents_list = Uintlist_push(triecontents_list,*ptr - posg);
1523 }
1524 if (post == NULL_POINTER) {
1525 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1526 } else {
1527 triecontents_list = Uintlist_push(triecontents_list,*ptr - post);
1528 }
1529 *nprinted += 5;
1530 }
1531
1532 return triecontents_list;
1533 }
1534
1535 #else
1536
1537 /* ptr is for offsets */
1538 static unsigned int *
Trie_output_array(unsigned int * ptr,int * nprinted,unsigned int * contents,Splicestring_T * sites,int nsites,int charpos)1539 Trie_output_array (unsigned int *ptr, int *nprinted, unsigned int *contents,
1540 Splicestring_T *sites, int nsites, int charpos) {
1541 int i, k;
1542 unsigned int posa, posc, posg, post;
1543 int na, nc, ng, nt;
1544
1545 if (nsites == 0) {
1546 debug0(printf("nsites == 0, so NULL\n"));
1547 *ptr = NULL_POINTER;
1548
1549 } else if (nsites == 1) {
1550 debug0(printf("nsites == 1, so pushing %d\n",sites[0]->splicesite_i));
1551 *ptr = *nprinted;
1552 *contents++ = sites[0]->splicesite_i;
1553 *nprinted += 1;
1554
1555 } else if (charpos >= 16) {
1556 if (nsites > MAX_DUPLICATES) {
1557 fprintf(stderr,"Warning: Splicetrie exceeded max duplicates value of %d\n",MAX_DUPLICATES);
1558 *ptr = NULL_POINTER;
1559 } else {
1560 *ptr = *nprinted;
1561
1562 *contents++ = (Triecontent_T) (-nsites);
1563 *nprinted += 1;
1564 for (i = 0; i < nsites; i++) {
1565 debug0(printf(" %d",sites[i]->splicesite_i));
1566 *contents++ = sites[i]->splicesite_i;
1567 *nprinted += 1;
1568 }
1569 debug0(printf("\n"));
1570 }
1571
1572 } else {
1573 na = nc = ng = nt = 0;
1574 for (i = 0; i < nsites; i++) {
1575 switch ((sites[i]->string >> (30 - 2*charpos)) & 0x03) {
1576 case RIGHT_A: na++; break;
1577 case RIGHT_C: nc++; break;
1578 case RIGHT_G: ng++; break;
1579 case RIGHT_T: nt++; break;
1580 default: abort();
1581 }
1582 }
1583 debug0(printf("%d A, %d C, %d G, %d T\n",na,nc,ng,nt));
1584
1585 k = 0;
1586 contents = Trie_output_array(&posa,&(*nprinted),contents,&(sites[k]),na,charpos+1);
1587
1588 k += na;
1589 contents = Trie_output_array(&posc,&(*nprinted),contents,&(sites[k]),nc,charpos+1);
1590
1591 k += nc;
1592 contents = Trie_output_array(&posg,&(*nprinted),contents,&(sites[k]),ng,charpos+1);
1593
1594 k += ng;
1595 contents = Trie_output_array(&post,&(*nprinted),contents,&(sites[k]),nt,charpos+1);
1596 /* k += nt; */
1597
1598 *ptr = *nprinted;
1599 *contents++ = INTERNAL_NODE;
1600
1601 if (posa == NULL_POINTER) {
1602 *contents++ = EMPTY_POINTER;
1603 } else {
1604 *contents++ = *ptr - posa;
1605 }
1606 if (posc == NULL_POINTER) {
1607 *contents++ = EMPTY_POINTER;
1608 } else {
1609 *contents++ = *ptr - posc;
1610 }
1611 if (posg == NULL_POINTER) {
1612 *contents++ = EMPTY_POINTER;
1613 } else {
1614 *contents++ = *ptr - posg;
1615 }
1616 if (post == NULL_POINTER) {
1617 *contents++ = EMPTY_POINTER;
1618 } else {
1619 *contents++ = *ptr - post;
1620 }
1621 *nprinted += 5;
1622 }
1623
1624 return contents;
1625 }
1626 #endif
1627
1628
1629 #if 0
1630 static void
1631 Trie_print (Trie_T trie, int level) {
1632 int i;
1633
1634 if (trie == NULL) {
1635 printf("\n");
1636
1637 } else {
1638 if (trie->leaf != NULL) {
1639 printf(" %d@%u\n",(int) trie->leaf->splicesite_i,trie->leaf->splicesite);
1640
1641 } else {
1642 printf("\n");
1643 for (i = 0; i < level; i++) {
1644 printf("\t");
1645 }
1646 printf("A (%d)",trie->na);
1647 Trie_print(trie->triea,level+1);
1648
1649 for (i = 0; i < level; i++) {
1650 printf("\t");
1651 }
1652 printf("C (%d)",trie->nc);
1653 Trie_print(trie->triec,level+1);
1654
1655 for (i = 0; i < level; i++) {
1656 printf("\t");
1657 }
1658 printf("G (%d)",trie->ng);
1659 Trie_print(trie->trieg,level+1);
1660
1661 for (i = 0; i < level; i++) {
1662 printf("\t");
1663 }
1664 printf("T (%d)",trie->nt);
1665 Trie_print(trie->triet,level+1);
1666 }
1667 }
1668
1669 return;
1670 }
1671 #endif
1672
1673 #if 0
1674 static int
1675 Trie_print_compact (int *ptr, FILE *fp, Trie_T trie, int nprinted) {
1676 int posa, posc, posg, post;
1677
1678 if (trie == NULL) {
1679 *ptr = NULL_POINTER;
1680 return nprinted;
1681
1682 } else {
1683 if (trie->leaf != NULL) {
1684 printf("%d. %d@%u\n",nprinted,(int) trie->leaf->splicesite_i,trie->leaf->splicesite);
1685 *ptr = nprinted;
1686 return nprinted + 1;
1687
1688 } else {
1689 nprinted = Trie_print_compact(&posa,fp,trie->triea,nprinted);
1690 nprinted = Trie_print_compact(&posc,fp,trie->triec,nprinted);
1691 nprinted = Trie_print_compact(&posg,fp,trie->trieg,nprinted);
1692 nprinted = Trie_print_compact(&post,fp,trie->triet,nprinted);
1693
1694 *ptr = nprinted;
1695
1696 printf("%d. %u\n",nprinted,INTERNAL_NODE);
1697
1698 printf("relative: %d %d %d %d\n",
1699 *ptr - posa,*ptr - posc,*ptr - posg,*ptr - post);
1700 printf("absolute: %d %d %d %d\n",
1701 posa,posc,posg,post);
1702
1703 return nprinted + 3;
1704 }
1705 }
1706 }
1707 #endif
1708
1709
1710 #ifdef USE_2BYTE_RELOFFSETS
1711 #define TRIE_EMPTY_SIZE 3
1712 #else
1713 #define TRIE_EMPTY_SIZE 5
1714 #endif
1715
1716
1717 #ifdef USE_LIST
1718 static Uintlist_T
Trie_output_empty_list(unsigned int * ptr,int * nprinted,Uintlist_T triecontents_list)1719 Trie_output_empty_list (unsigned int *ptr, int *nprinted, Uintlist_T triecontents_list) {
1720 *ptr = (unsigned int) *nprinted;
1721 triecontents_list = Uintlist_push(triecontents_list,INTERNAL_NODE);
1722 #ifdef USE_2BYTE_RELOFFSETS
1723 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1724 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1725 *nprinted += 3;
1726 #else
1727 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1728 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1729 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1730 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1731 *nprinted += 5;
1732 #endif
1733 return triecontents_list;
1734 }
1735
1736 #else
1737 static unsigned int *
Trie_output_empty_array(unsigned int * ptr,int * nprinted,unsigned int * contents)1738 Trie_output_empty_array (unsigned int *ptr, int *nprinted, unsigned int *contents) {
1739 *ptr = (unsigned int) *nprinted;
1740 *contents++ = INTERNAL_NODE;
1741 #ifdef USE_2BYTE_RELOFFSETS
1742 *contents++ = EMPTY_POINTER;
1743 *contents++ = EMPTY_POINTER;
1744 *nprinted += 3;
1745 #else
1746 *contents++ = EMPTY_POINTER;
1747 *contents++ = EMPTY_POINTER;
1748 *contents++ = EMPTY_POINTER;
1749 *contents++ = EMPTY_POINTER;
1750 *nprinted += 5;
1751 #endif
1752 return contents;
1753 }
1754
1755 #endif
1756
1757
1758 #if 0
1759 static Uintlist_T
1760 Trie_output (int *ptr, int *nprinted, Uintlist_T triecontents_list, Trie_T trie) {
1761 int posa, posc, posg, post;
1762 #ifdef USE_2BYTE_RELOFFSETS
1763 unsigned int pointersAC, pointersGT;
1764 int reloffset;
1765 #endif
1766
1767 if (trie == NULL) {
1768 *ptr = NULL_POINTER;
1769
1770 } else if (trie->leaf != NULL) {
1771 *ptr = *nprinted;
1772 /* Entering splicesite_i, rather than position */
1773 triecontents_list = Uintlist_push(triecontents_list,trie->leaf->splicesite_i);
1774 *nprinted += 1;
1775
1776 } else {
1777 triecontents_list = Trie_output(&posa,&(*nprinted),triecontents_list,trie->triea);
1778 triecontents_list = Trie_output(&posc,&(*nprinted),triecontents_list,trie->triec);
1779 triecontents_list = Trie_output(&posg,&(*nprinted),triecontents_list,trie->trieg);
1780 triecontents_list = Trie_output(&post,&(*nprinted),triecontents_list,trie->triet);
1781
1782 *ptr = *nprinted;
1783 triecontents_list = Uintlist_push(triecontents_list,INTERNAL_NODE);
1784
1785
1786 #ifdef USE_2BYTE_RELOFFSETS
1787 if (posa == NULL_POINTER) {
1788 pointersAC = 0;
1789 } else if ((reloffset = *ptr - posa) >= 65536) {
1790 fprintf(stderr,"Reloffset A %d = %d - %d is too big for 2 bytes\n",reloffset,*ptr,posa);
1791 abort();
1792 } else {
1793 pointersAC = reloffset;
1794 }
1795
1796 if (posc == NULL_POINTER) {
1797 pointersAC = (pointersAC << 16);
1798 } else if ((reloffset = *ptr - posc) >= 65536) {
1799 fprintf(stderr,"Reloffset C %d = %d - %d is too big for 2 bytes\n",reloffset,*ptr,posc);
1800 abort();
1801 } else {
1802 pointersAC = (pointersAC << 16) + reloffset;
1803 }
1804
1805 if (posg == NULL_POINTER) {
1806 pointersGT = 0;
1807 } else if ((reloffset = *ptr - posg) >= 65536) {
1808 fprintf(stderr,"Reloffset G %d = %d - %d is too big for 2 bytes\n",reloffset,*ptr,posg);
1809 abort();
1810 } else {
1811 pointersGT = reloffset;
1812 }
1813
1814 if (post == NULL_POINTER) {
1815 pointersGT = (pointersGT << 16);
1816 } else if ((reloffset = *ptr - post) >= 65536) {
1817 fprintf(stderr,"Reloffset T %d = %d - %d is too big for 2 bytes\n",reloffset,*ptr,post);
1818 abort();
1819 } else {
1820 pointersGT = (pointersGT << 16) + reloffset;
1821 }
1822
1823 triecontents_list = Uintlist_push(triecontents_list,pointersAC);
1824 triecontents_list = Uintlist_push(triecontents_list,pointersGT);
1825
1826 *nprinted += 3;
1827 #else
1828 if (posa == NULL_POINTER) {
1829 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1830 } else {
1831 triecontents_list = Uintlist_push(triecontents_list,*ptr - posa);
1832 }
1833 if (posc == NULL_POINTER) {
1834 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1835 } else {
1836 triecontents_list = Uintlist_push(triecontents_list,*ptr - posc);
1837 }
1838 if (posg == NULL_POINTER) {
1839 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1840 } else {
1841 triecontents_list = Uintlist_push(triecontents_list,*ptr - posg);
1842 }
1843 if (post == NULL_POINTER) {
1844 triecontents_list = Uintlist_push(triecontents_list,EMPTY_POINTER);
1845 } else {
1846 triecontents_list = Uintlist_push(triecontents_list,*ptr - post);
1847 }
1848 *nprinted += 5;
1849 #endif
1850
1851 }
1852 return triecontents_list;
1853 }
1854 #endif
1855
1856
1857 void
Splicetrie_npartners(int ** nsplicepartners_skip,int ** nsplicepartners_obs,int ** nsplicepartners_max,Univcoord_T * splicesites,Splicetype_T * splicetypes,Chrpos_T * splicedists,List_T * splicestrings,int nsplicesites,Univ_IIT_T chromosome_iit,Chrpos_T max_distance,bool distances_observed_p)1858 Splicetrie_npartners (int **nsplicepartners_skip, int **nsplicepartners_obs, int **nsplicepartners_max,
1859 Univcoord_T *splicesites, Splicetype_T *splicetypes,
1860 Chrpos_T *splicedists, List_T *splicestrings, int nsplicesites,
1861 Univ_IIT_T chromosome_iit, Chrpos_T max_distance, bool distances_observed_p) {
1862 int nsites_skip, nsites_obs, nsites_max, j, j1;
1863 Univcoord_T chroffset, chrhigh, leftbound_obs, leftbound_max, rightbound_obs, rightbound_max;
1864 Univcoord_T leftbound_min, rightbound_min;
1865 Chrpos_T chrlength;
1866 Chrnum_T chrnum;
1867
1868
1869 (*nsplicepartners_skip) = (int *) CALLOC(nsplicesites,sizeof(int));
1870 if (distances_observed_p == true) {
1871 (*nsplicepartners_obs) = (int *) CALLOC(nsplicesites,sizeof(int));
1872 } else {
1873 (*nsplicepartners_obs) = (int *) NULL;
1874 }
1875 (*nsplicepartners_max) = (int *) CALLOC(nsplicesites,sizeof(int));
1876
1877 chrhigh = 0U;
1878 for (j = 0; j < nsplicesites; j++) {
1879 if (splicesites[j] > chrhigh) {
1880 chrnum = Univ_IIT_get_one(chromosome_iit,splicesites[j],splicesites[j]);
1881 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
1882 /* chrhigh += 1U; */
1883 }
1884
1885 switch (splicetypes[j]) {
1886 case DONOR:
1887 nsites_skip = nsites_obs = nsites_max = 0;
1888 j1 = j + 1;
1889
1890 if ((rightbound_min = splicesites[j] + MININTRONLEN) > chrhigh) {
1891 rightbound_min = chrhigh;
1892 }
1893 while (j1 < nsplicesites && splicesites[j1] < rightbound_min) {
1894 nsites_skip++;
1895 j1++;
1896 }
1897
1898 if (distances_observed_p) {
1899 if ((rightbound_obs = splicesites[j] + splicedists[j]) > chrhigh) {
1900 rightbound_obs = chrhigh;
1901 }
1902 while (j1 < nsplicesites && splicesites[j1] < rightbound_obs) {
1903 if (splicetypes[j1] == ACCEPTOR) {
1904 nsites_obs += List_length(splicestrings[j1]);
1905 }
1906 j1++;
1907 }
1908 }
1909
1910 if ((rightbound_max = splicesites[j] + max_distance) > chrhigh) {
1911 rightbound_max = chrhigh;
1912 }
1913 while (j1 < nsplicesites && splicesites[j1] < rightbound_max) {
1914 if (splicetypes[j1] == ACCEPTOR) {
1915 nsites_max += List_length(splicestrings[j1]);
1916 }
1917 j1++;
1918 }
1919
1920 break;
1921
1922 case ACCEPTOR:
1923 nsites_skip = nsites_obs = nsites_max = 0;
1924 j1 = j - 1;
1925
1926 if (splicesites[j] < chroffset + MININTRONLEN) {
1927 leftbound_min = chroffset;
1928 } else {
1929 leftbound_min = splicesites[j] - MININTRONLEN;
1930 }
1931 while (j1 >= 0 && splicesites[j1] > leftbound_min) {
1932 nsites_skip++;
1933 j1--;
1934 }
1935
1936 if (distances_observed_p) {
1937 if (splicesites[j] < chroffset + splicedists[j]) {
1938 leftbound_obs = chroffset;
1939 } else {
1940 leftbound_obs = splicesites[j] - splicedists[j];
1941 }
1942 while (j1 >= 0 && splicesites[j1] > leftbound_obs) {
1943 if (splicetypes[j1] == DONOR) {
1944 nsites_obs += List_length(splicestrings[j1]);
1945 }
1946 j1--;
1947 }
1948 }
1949
1950 if (splicesites[j] < chroffset + max_distance) {
1951 leftbound_max = chroffset;
1952 } else {
1953 leftbound_max = splicesites[j] - max_distance;
1954 }
1955 while (j1 >= 0 && splicesites[j1] > leftbound_max) {
1956 if (splicetypes[j1] == DONOR) {
1957 nsites_max += List_length(splicestrings[j1]);
1958 }
1959 j1--;
1960 }
1961
1962 break;
1963
1964 case ANTIDONOR:
1965 nsites_skip = nsites_obs = nsites_max = 0;
1966 j1 = j - 1;
1967
1968 if (splicesites[j] < chroffset + MININTRONLEN) {
1969 leftbound_min = chroffset;
1970 } else {
1971 leftbound_min = splicesites[j] - MININTRONLEN;
1972 }
1973 while (j1 >= 0 && splicesites[j1] > leftbound_min) {
1974 nsites_skip++;
1975 j1--;
1976 }
1977
1978 if (distances_observed_p) {
1979 if (splicesites[j] < chroffset + splicedists[j]) {
1980 leftbound_obs = chroffset;
1981 } else {
1982 leftbound_obs = splicesites[j] - splicedists[j];
1983 }
1984 while (j1 >= 0 && splicesites[j1] > leftbound_obs) {
1985 if (splicetypes[j1] == ANTIACCEPTOR) {
1986 nsites_obs += List_length(splicestrings[j1]);
1987 }
1988 j1--;
1989 }
1990 }
1991
1992 if (splicesites[j] < chroffset + max_distance) {
1993 leftbound_max = chroffset;
1994 } else {
1995 leftbound_max = splicesites[j] - max_distance;
1996 }
1997 while (j1 >= 0 && splicesites[j1] > leftbound_max) {
1998 if (splicetypes[j1] == ANTIACCEPTOR) {
1999 nsites_max += List_length(splicestrings[j1]);
2000 }
2001 j1--;
2002 }
2003
2004 break;
2005
2006 case ANTIACCEPTOR:
2007 nsites_skip = nsites_obs = nsites_max = 0;
2008 j1 = j + 1;
2009
2010 if ((rightbound_min = splicesites[j] + MININTRONLEN) > chrhigh) {
2011 rightbound_min = chrhigh;
2012 }
2013 while (j1 < nsplicesites && splicesites[j1] < rightbound_min) {
2014 nsites_skip++;
2015 j1++;
2016 }
2017
2018 if (distances_observed_p) {
2019 if ((rightbound_obs = splicesites[j] + splicedists[j]) > chrhigh) {
2020 rightbound_obs = chrhigh;
2021 }
2022 while (j1 < nsplicesites && splicesites[j1] < rightbound_obs) {
2023 if (splicetypes[j1] == ANTIDONOR) {
2024 nsites_obs += List_length(splicestrings[j1]);
2025 }
2026 j1++;
2027 }
2028 }
2029
2030 if ((rightbound_max = splicesites[j] + max_distance) > chrhigh) {
2031 rightbound_max = chrhigh;
2032 }
2033 while (j1 < nsplicesites && splicesites[j1] < rightbound_max) {
2034 if (splicetypes[j1] == ANTIDONOR) {
2035 nsites_max += List_length(splicestrings[j1]);
2036 }
2037 j1++;
2038 }
2039
2040 break;
2041
2042 default:
2043 fprintf(stderr,"Unexpected splicetype %d\n",splicetypes[j]);
2044 abort();
2045 }
2046
2047 (*nsplicepartners_skip)[j] = nsites_skip;
2048 if (distances_observed_p) {
2049 (*nsplicepartners_obs)[j] = nsites_obs;
2050 }
2051 (*nsplicepartners_max)[j] = nsites_max;
2052 }
2053
2054 return;
2055 }
2056
2057
2058 #define MAX_SITES_ALLOCATED 1000
2059
2060 static void
build_via_splicesites_size(int * trie_obs_size,int * trie_max_size,int * nsplicepartners_skip,int * nsplicepartners_obs,int * nsplicepartners_max,Splicetype_T * splicetypes,List_T * splicestrings,int nsplicesites,bool distances_observed_p)2061 build_via_splicesites_size (int *trie_obs_size, int *trie_max_size,
2062 int *nsplicepartners_skip, int *nsplicepartners_obs, int *nsplicepartners_max,
2063 Splicetype_T *splicetypes, List_T *splicestrings, int nsplicesites,
2064 bool distances_observed_p) {
2065 List_T p;
2066 int nsites, j, j1;
2067 Splicestring_T *sites, sites_allocated[MAX_SITES_ALLOCATED];
2068 int npartners;
2069
2070 for (j = 0; j < nsplicesites; j++) {
2071 switch (splicetypes[j]) {
2072 case DONOR:
2073 j1 = j + 1 + nsplicepartners_skip[j];
2074
2075 if (distances_observed_p) {
2076 if ((npartners = nsplicepartners_obs[j]) == 0) {
2077 #if 0
2078 triecontents_obs_list = Trie_output_empty_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list);
2079 #else
2080 (*trie_obs_size) += TRIE_EMPTY_SIZE;
2081 #endif
2082
2083 } else {
2084 if (npartners > MAX_SITES_ALLOCATED) {
2085 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2086 } else {
2087 sites = &(sites_allocated[0]);
2088 }
2089 nsites = 0;
2090 while (nsites < nsplicepartners_obs[j]) {
2091 if (splicetypes[j1] == ACCEPTOR) {
2092 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2093 sites[nsites++] = (Splicestring_T) List_head(p);
2094 }
2095 }
2096 j1++;
2097 }
2098 assert(nsites == nsplicepartners_obs[j]);
2099 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2100 #if 0
2101 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2102 sites,nsites,/*charpos*/0);
2103 #else
2104 *trie_obs_size = Trie_output_size(*trie_obs_size,sites,nsites,/*charpos*/0);
2105 #endif
2106 if (npartners > MAX_SITES_ALLOCATED) {
2107 FREE(sites);
2108 }
2109 }
2110 }
2111
2112 if ((npartners = nsplicepartners_max[j]) == 0) {
2113 #if 0
2114 triecontents_max_list = Trie_output_empty_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list);
2115 #else
2116 (*trie_max_size) += TRIE_EMPTY_SIZE;
2117 #endif
2118
2119 } else {
2120 if (npartners > MAX_SITES_ALLOCATED) {
2121 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2122 } else {
2123 sites = &(sites_allocated[0]);
2124 }
2125 nsites = 0;
2126 while (nsites < nsplicepartners_max[j]) {
2127 if (splicetypes[j1] == ACCEPTOR) {
2128 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2129 sites[nsites++] = (Splicestring_T) List_head(p);
2130 }
2131 }
2132 j1++;
2133 }
2134 assert(nsites == nsplicepartners_max[j]);
2135 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2136 #if 0
2137 triecontents_max_list = Trie_output_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list,
2138 sites,nsites,/*charpos*/0);
2139 #else
2140 *trie_max_size = Trie_output_size(*trie_max_size,sites,nsites,/*charpos*/0);
2141 #endif
2142 if (npartners > MAX_SITES_ALLOCATED) {
2143 FREE(sites);
2144 }
2145 }
2146
2147 break;
2148
2149 case ACCEPTOR:
2150
2151 j1 = j - 1 - nsplicepartners_skip[j];
2152
2153 if (distances_observed_p) {
2154 if ((npartners = nsplicepartners_obs[j]) == 0) {
2155 #if 0
2156 triecontents_obs_list = Trie_output_empty_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list);
2157 #else
2158 (*trie_obs_size) += TRIE_EMPTY_SIZE;
2159 #endif
2160
2161 } else {
2162 if (npartners > MAX_SITES_ALLOCATED) {
2163 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2164 } else {
2165 sites = &(sites_allocated[0]);
2166 }
2167 nsites = 0;
2168 while (nsites < nsplicepartners_obs[j]) {
2169 if (splicetypes[j1] == DONOR) {
2170 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2171 sites[nsites++] = (Splicestring_T) List_head(p);
2172 }
2173 }
2174 j1--;
2175 }
2176 assert(nsites == nsplicepartners_obs[j]);
2177 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2178 #if 0
2179 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2180 sites,nsites,/*charpos*/0);
2181 #else
2182 *trie_obs_size = Trie_output_size(*trie_obs_size,sites,nsites,/*charpos*/0);
2183 #endif
2184 if (npartners > MAX_SITES_ALLOCATED) {
2185 FREE(sites);
2186 }
2187 }
2188 }
2189
2190 if ((npartners = nsplicepartners_max[j]) == 0) {
2191 #if 0
2192 triecontents_max_list = Trie_output_empty_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list);
2193 #else
2194 (*trie_max_size) += TRIE_EMPTY_SIZE;
2195 #endif
2196
2197 } else {
2198 if (npartners > MAX_SITES_ALLOCATED) {
2199 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2200 } else {
2201 sites = &(sites_allocated[0]);
2202 }
2203 nsites = 0;
2204 while (nsites < nsplicepartners_max[j]) {
2205 if (splicetypes[j1] == DONOR) {
2206 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2207 sites[nsites++] = (Splicestring_T) List_head(p);
2208 }
2209 }
2210 j1--;
2211 }
2212 assert(nsites == nsplicepartners_max[j]);
2213 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2214 #if 0
2215 triecontents_max_list = Trie_output_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list,
2216 sites,nsites,/*charpos*/0);
2217 #else
2218 *trie_max_size = Trie_output_size(*trie_max_size,sites,nsites,/*charpos*/0);
2219 #endif
2220 if (npartners > MAX_SITES_ALLOCATED) {
2221 FREE(sites);
2222 }
2223 }
2224
2225 break;
2226
2227 case ANTIDONOR:
2228
2229 j1 = j - 1 - nsplicepartners_skip[j];
2230
2231 if (distances_observed_p) {
2232 if ((npartners = nsplicepartners_obs[j]) == 0) {
2233 #if 0
2234 triecontents_obs_list = Trie_output_empty_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list);
2235 #else
2236 (*trie_obs_size) += TRIE_EMPTY_SIZE;
2237 #endif
2238
2239 } else {
2240 if (npartners > MAX_SITES_ALLOCATED) {
2241 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2242 } else {
2243 sites = &(sites_allocated)[0];
2244 }
2245 nsites = 0;
2246 while (nsites < nsplicepartners_obs[j]) {
2247 if (splicetypes[j1] == ANTIACCEPTOR) {
2248 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2249 sites[nsites++] = (Splicestring_T) List_head(p);
2250 }
2251 }
2252 j1--;
2253 }
2254 assert(nsites == nsplicepartners_obs[j]);
2255 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2256 #if 0
2257 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2258 sites,nsites,/*charpos*/0);
2259 #else
2260 *trie_obs_size = Trie_output_size(*trie_obs_size,sites,nsites,/*charpos*/0);
2261 #endif
2262 if (npartners > MAX_SITES_ALLOCATED) {
2263 FREE(sites);
2264 }
2265 }
2266 }
2267
2268 if ((npartners = nsplicepartners_max[j]) == 0) {
2269 #if 0
2270 triecontents_max_list = Trie_output_empty_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list);
2271 #else
2272 (*trie_max_size) += TRIE_EMPTY_SIZE;
2273 #endif
2274
2275 } else {
2276 if (npartners > MAX_SITES_ALLOCATED) {
2277 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2278 } else {
2279 sites = &(sites_allocated[0]);
2280 }
2281 nsites = 0;
2282 while (nsites < nsplicepartners_max[j]) {
2283 if (splicetypes[j1] == ANTIACCEPTOR) {
2284 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2285 sites[nsites++] = (Splicestring_T) List_head(p);
2286 }
2287 }
2288 j1--;
2289 }
2290 assert(nsites == nsplicepartners_max[j]);
2291 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2292 #if 0
2293 triecontents_max_list = Trie_output_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list,
2294 sites,nsites,/*charpos*/0);
2295 #else
2296 *trie_max_size = Trie_output_size(*trie_max_size,sites,nsites,/*charpos*/0);
2297 #endif
2298 if (npartners > MAX_SITES_ALLOCATED) {
2299 FREE(sites);
2300 }
2301 }
2302
2303 break;
2304
2305 case ANTIACCEPTOR:
2306
2307 j1 = j + 1 + nsplicepartners_skip[j];
2308
2309 if (distances_observed_p) {
2310 if ((npartners = nsplicepartners_obs[j]) == 0) {
2311 #if 0
2312 triecontents_obs_list = Trie_output_empty_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list);
2313 #else
2314 (*trie_obs_size) += TRIE_EMPTY_SIZE;
2315 #endif
2316
2317 } else {
2318 if (npartners > MAX_SITES_ALLOCATED) {
2319 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2320 } else {
2321 sites = &(sites_allocated[0]);
2322 }
2323 nsites = 0;
2324 while (nsites < nsplicepartners_obs[j]) {
2325 if (splicetypes[j1] == ANTIDONOR) {
2326 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2327 sites[nsites++] = (Splicestring_T) List_head(p);
2328 }
2329 }
2330 j1++;
2331 }
2332 assert(nsites == nsplicepartners_obs[j]);
2333 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2334 #if 0
2335 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2336 sites,nsites,/*charpos*/0);
2337 #else
2338 *trie_obs_size = Trie_output_size(*trie_obs_size,sites,nsites,/*charpos*/0);
2339 #endif
2340 if (npartners > MAX_SITES_ALLOCATED) {
2341 FREE(sites);
2342 }
2343 }
2344 }
2345
2346 if ((npartners = nsplicepartners_max[j]) == 0) {
2347 #if 0
2348 triecontents_max_list = Trie_output_empty_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list);
2349 #else
2350 (*trie_max_size) += TRIE_EMPTY_SIZE;
2351 #endif
2352
2353 } else {
2354 if (npartners > MAX_SITES_ALLOCATED) {
2355 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2356 } else {
2357 sites = &(sites_allocated[0]);
2358 }
2359 nsites = 0;
2360 while (nsites < nsplicepartners_max[j]) {
2361 if (splicetypes[j1] == ANTIDONOR) {
2362 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2363 sites[nsites++] = (Splicestring_T) List_head(p);
2364 }
2365 }
2366 j1++;
2367 }
2368 assert(nsites == nsplicepartners_max[j]);
2369 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2370 #if 0
2371 triecontents_max_list = Trie_output_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list,
2372 sites,nsites,/*charpos*/0);
2373 #else
2374 *trie_max_size = Trie_output_size(*trie_max_size,sites,nsites,/*charpos*/0);
2375 #endif
2376 if (npartners > MAX_SITES_ALLOCATED) {
2377 FREE(sites);
2378 }
2379 }
2380
2381 break;
2382
2383 default:
2384 fprintf(stderr,"Unexpected splicetype %d\n",splicetypes[j]);
2385 abort();
2386 }
2387 }
2388
2389 return;
2390 }
2391
2392
2393
2394 void
Splicetrie_build_via_splicesites(Triecontent_T ** triecontents_obs,Trieoffset_T ** trieoffsets_obs,Triecontent_T ** triecontents_max,Trieoffset_T ** trieoffsets_max,int * nsplicepartners_skip,int * nsplicepartners_obs,int * nsplicepartners_max,Splicetype_T * splicetypes,List_T * splicestrings,int nsplicesites)2395 Splicetrie_build_via_splicesites (Triecontent_T **triecontents_obs, Trieoffset_T **trieoffsets_obs,
2396 Triecontent_T **triecontents_max, Trieoffset_T **trieoffsets_max,
2397 int *nsplicepartners_skip, int *nsplicepartners_obs, int *nsplicepartners_max,
2398 Splicetype_T *splicetypes, List_T *splicestrings, int nsplicesites) {
2399 #ifdef USE_LIST
2400 Uintlist_T triecontents_obs_list = NULL, triecontents_max_list = NULL;
2401 #else
2402 unsigned int *triecontents_obs_ptr, *triecontents_max_ptr;
2403 #endif
2404 List_T p;
2405 int nsites, j, j1;
2406 Splicestring_T *sites, sites_allocated[MAX_SITES_ALLOCATED];
2407 int npartners;
2408 int nprinted_obs = 0, nprinted_max = 0;
2409 int trie_obs_size = 0, trie_max_size = 0;
2410 bool distances_observed_p;
2411
2412 if (nsplicepartners_obs == NULL) {
2413 distances_observed_p = false;
2414 } else {
2415 distances_observed_p = true;
2416 }
2417
2418 if (distances_observed_p == true) {
2419 *trieoffsets_obs = (Trieoffset_T *) CALLOC(nsplicesites,sizeof(Trieoffset_T));
2420 } else {
2421 *trieoffsets_obs = (Trieoffset_T *) NULL;
2422 }
2423 *trieoffsets_max = (Trieoffset_T *) CALLOC(nsplicesites,sizeof(Trieoffset_T));
2424
2425 build_via_splicesites_size(&trie_obs_size,&trie_max_size,
2426 nsplicepartners_skip,nsplicepartners_obs,nsplicepartners_max,
2427 splicetypes,splicestrings,nsplicesites,distances_observed_p);
2428 #ifndef USE_LIST
2429 if (distances_observed_p) {
2430 triecontents_obs_ptr = *triecontents_obs = (Triecontent_T *) MALLOC(trie_obs_size * sizeof(Triecontent_T));
2431 } else {
2432 *triecontents_obs = (Triecontent_T *) NULL;
2433 }
2434 triecontents_max_ptr = *triecontents_max = (Triecontent_T *) MALLOC(trie_max_size * sizeof(Triecontent_T));
2435 #endif
2436
2437
2438 for (j = 0; j < nsplicesites; j++) {
2439 switch (splicetypes[j]) {
2440 case DONOR:
2441 debug(
2442 if (distances_observed_p == true) {
2443 printf("donor #%d (%d partners obs, %d partners max):",
2444 j,nsplicepartners_obs[j],nsplicepartners_max[j]);
2445 } else {
2446 printf("donor #%d (%d partners max):",
2447 j,nsplicepartners_max[j]);
2448 });
2449
2450 j1 = j + 1 + nsplicepartners_skip[j];
2451
2452 if (distances_observed_p) {
2453 if ((npartners = nsplicepartners_obs[j]) == 0) {
2454 #ifdef USE_LIST
2455 triecontents_obs_list = Trie_output_empty_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list);
2456 #else
2457 triecontents_obs_ptr = Trie_output_empty_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr);
2458 #endif
2459
2460 } else {
2461 if (npartners > MAX_SITES_ALLOCATED) {
2462 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2463 } else {
2464 sites = &(sites_allocated[0]);
2465 }
2466 nsites = 0;
2467 while (nsites < nsplicepartners_obs[j]) {
2468 if (splicetypes[j1] == ACCEPTOR) {
2469 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2470 debug(printf(" %d",j1));
2471 sites[nsites++] = (Splicestring_T) List_head(p);
2472 }
2473 }
2474 j1++;
2475 }
2476 assert(nsites == nsplicepartners_obs[j]);
2477 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2478 #ifdef USE_LIST
2479 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2480 sites,nsites,/*charpos*/0);
2481 #else
2482 triecontents_obs_ptr = Trie_output_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr,
2483 sites,nsites,/*charpos*/0);
2484 #endif
2485 if (npartners > MAX_SITES_ALLOCATED) {
2486 FREE(sites);
2487 }
2488 }
2489 }
2490
2491 if ((npartners = nsplicepartners_max[j]) == 0) {
2492 #ifdef USE_LIST
2493 triecontents_max_list = Trie_output_empty_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list);
2494 #else
2495 triecontents_max_ptr = Trie_output_empty_array(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_ptr);
2496 #endif
2497
2498 } else {
2499 if (npartners > MAX_SITES_ALLOCATED) {
2500 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2501 } else {
2502 sites = &(sites_allocated[0]);
2503 }
2504 nsites = 0;
2505 while (nsites < nsplicepartners_max[j]) {
2506 if (splicetypes[j1] == ACCEPTOR) {
2507 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2508 debug(printf(" %d",j1));
2509 sites[nsites++] = (Splicestring_T) List_head(p);
2510 }
2511 }
2512 j1++;
2513 }
2514 assert(nsites == nsplicepartners_max[j]);
2515 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2516 #ifdef USE_LIST
2517 triecontents_max_list = Trie_output_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list,
2518 sites,nsites,/*charpos*/0);
2519 #else
2520 triecontents_max_ptr = Trie_output_array(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_ptr,
2521 sites,nsites,/*charpos*/0);
2522 #endif
2523 if (npartners > MAX_SITES_ALLOCATED) {
2524 FREE(sites);
2525 }
2526 }
2527
2528 debug(printf("\n"));
2529 break;
2530
2531 case ACCEPTOR:
2532 debug(
2533 if (distances_observed_p == true) {
2534 printf("acceptor #%d (%d partners obs, %d partners max):",
2535 j,nsplicepartners_obs[j],nsplicepartners_max[j]);
2536 } else {
2537 printf("acceptor #%d (%d partners max):",
2538 j,nsplicepartners_max[j]);
2539 });
2540
2541 j1 = j - 1 - nsplicepartners_skip[j];
2542
2543 if (distances_observed_p) {
2544 if ((npartners = nsplicepartners_obs[j]) == 0) {
2545 #ifdef USE_LIST
2546 triecontents_obs_list = Trie_output_empty_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list);
2547 #else
2548 triecontents_obs_ptr = Trie_output_empty_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr);
2549 #endif
2550
2551 } else {
2552 if (npartners > MAX_SITES_ALLOCATED) {
2553 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2554 } else {
2555 sites = &(sites_allocated[0]);
2556 }
2557 nsites = 0;
2558 while (nsites < nsplicepartners_obs[j]) {
2559 if (splicetypes[j1] == DONOR) {
2560 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2561 debug(printf(" %d",j1));
2562 sites[nsites++] = (Splicestring_T) List_head(p);
2563 }
2564 }
2565 j1--;
2566 }
2567 assert(nsites == nsplicepartners_obs[j]);
2568 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2569 #ifdef USE_LIST
2570 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2571 sites,nsites,/*charpos*/0);
2572 #else
2573 triecontents_obs_ptr = Trie_output_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr,
2574 sites,nsites,/*charpos*/0);
2575 #endif
2576 if (npartners > MAX_SITES_ALLOCATED) {
2577 FREE(sites);
2578 }
2579 }
2580 }
2581
2582 if ((npartners = nsplicepartners_max[j]) == 0) {
2583 #ifdef USE_LIST
2584 triecontents_max_list = Trie_output_empty_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list);
2585 #else
2586 triecontents_max_ptr = Trie_output_empty_array(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_ptr);
2587 #endif
2588
2589 } else {
2590 if (npartners > MAX_SITES_ALLOCATED) {
2591 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2592 } else {
2593 sites = &(sites_allocated[0]);
2594 }
2595 nsites = 0;
2596 while (nsites < nsplicepartners_max[j]) {
2597 if (splicetypes[j1] == DONOR) {
2598 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2599 debug(printf(" %d",j1));
2600 sites[nsites++] = (Splicestring_T) List_head(p);
2601 }
2602 }
2603 j1--;
2604 }
2605 assert(nsites == nsplicepartners_max[j]);
2606 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2607 #ifdef USE_LIST
2608 triecontents_max_list = Trie_output_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list,
2609 sites,nsites,/*charpos*/0);
2610 #else
2611 triecontents_max_ptr = Trie_output_array(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_ptr,
2612 sites,nsites,/*charpos*/0);
2613 #endif
2614 if (npartners > MAX_SITES_ALLOCATED) {
2615 FREE(sites);
2616 }
2617 }
2618
2619 debug(printf("\n"));
2620 break;
2621
2622 case ANTIDONOR:
2623 debug(
2624 if (distances_observed_p == true) {
2625 printf("antidonor #%d (%d partners obs, %d partners max):",
2626 j,nsplicepartners_obs[j],nsplicepartners_max[j]);
2627 } else {
2628 printf("antidonor #%d (%d partners max):",
2629 j,nsplicepartners_max[j]);
2630 });
2631
2632 j1 = j - 1 - nsplicepartners_skip[j];
2633
2634 if (distances_observed_p) {
2635 if ((npartners = nsplicepartners_obs[j]) == 0) {
2636 #ifdef USE_LIST
2637 triecontents_obs_list = Trie_output_empty_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list);
2638 #else
2639 triecontents_obs_ptr = Trie_output_empty_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr);
2640 #endif
2641
2642 } else {
2643 if (npartners > MAX_SITES_ALLOCATED) {
2644 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2645 } else {
2646 sites = &(sites_allocated[0]);
2647 }
2648 nsites = 0;
2649 while (nsites < nsplicepartners_obs[j]) {
2650 if (splicetypes[j1] == ANTIACCEPTOR) {
2651 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2652 debug(printf(" %d",j1));
2653 sites[nsites++] = (Splicestring_T) List_head(p);
2654 }
2655 }
2656 j1--;
2657 }
2658 assert(nsites == nsplicepartners_obs[j]);
2659 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2660 #ifdef USE_LIST
2661 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2662 sites,nsites,/*charpos*/0);
2663 #else
2664 triecontents_obs_ptr = Trie_output_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr,
2665 sites,nsites,/*charpos*/0);
2666 #endif
2667 if (npartners > MAX_SITES_ALLOCATED) {
2668 FREE(sites);
2669 }
2670 }
2671 }
2672
2673 if ((npartners = nsplicepartners_max[j]) == 0) {
2674 #ifdef USE_LIST
2675 triecontents_max_list = Trie_output_empty_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list);
2676 #else
2677 triecontents_max_ptr = Trie_output_empty_array(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_ptr);
2678 #endif
2679
2680 } else {
2681 if (npartners > MAX_SITES_ALLOCATED) {
2682 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2683 } else {
2684 sites = &(sites_allocated[0]);
2685 }
2686 nsites = 0;
2687 while (nsites < nsplicepartners_max[j]) {
2688 if (splicetypes[j1] == ANTIACCEPTOR) {
2689 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2690 debug(printf(" %d",j1));
2691 sites[nsites++] = (Splicestring_T) List_head(p);
2692 }
2693 }
2694 j1--;
2695 }
2696 assert(nsites == nsplicepartners_max[j]);
2697 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2698 #ifdef USE_LIST
2699 triecontents_max_list = Trie_output_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list,
2700 sites,nsites,/*charpos*/0);
2701 #else
2702 triecontents_max_ptr = Trie_output_array(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_ptr,
2703 sites,nsites,/*charpos*/0);
2704 #endif
2705 if (npartners > MAX_SITES_ALLOCATED) {
2706 FREE(sites);
2707 }
2708 }
2709
2710 debug(printf("\n"));
2711 break;
2712
2713 case ANTIACCEPTOR:
2714 debug(
2715 if (distances_observed_p == true) {
2716 printf("antiacceptor #%d (%d partners obs, %d partners max):",
2717 j,nsplicepartners_obs[j],nsplicepartners_max[j]);
2718 } else {
2719 printf("antiacceptor #%d (%d partners max):",
2720 j,nsplicepartners_max[j]);
2721 });
2722
2723 j1 = j + 1 + nsplicepartners_skip[j];
2724
2725 if (distances_observed_p) {
2726 if ((npartners = nsplicepartners_obs[j]) == 0) {
2727 #ifdef USE_LIST
2728 triecontents_obs_list = Trie_output_empty_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list);
2729 #else
2730 triecontents_obs_ptr = Trie_output_empty_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr);
2731 #endif
2732
2733 } else {
2734 if (npartners > MAX_SITES_ALLOCATED) {
2735 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2736 } else {
2737 sites = &(sites_allocated[0]);
2738 }
2739 nsites = 0;
2740 while (nsites < nsplicepartners_obs[j]) {
2741 if (splicetypes[j1] == ANTIDONOR) {
2742 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2743 debug(printf(" %d",j1));
2744 sites[nsites++] = (Splicestring_T) List_head(p);
2745 }
2746 }
2747 j1++;
2748 }
2749 assert(nsites == nsplicepartners_obs[j]);
2750 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2751 #ifdef USE_LIST
2752 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2753 sites,nsites,/*charpos*/0);
2754 #else
2755 triecontents_obs_ptr = Trie_output_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr,
2756 sites,nsites,/*charpos*/0);
2757 #endif
2758 if (npartners > MAX_SITES_ALLOCATED) {
2759 FREE(sites);
2760 }
2761 }
2762 }
2763
2764 if ((npartners = nsplicepartners_max[j]) == 0) {
2765 #ifdef USE_LIST
2766 triecontents_max_list = Trie_output_empty_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list);
2767 #else
2768 triecontents_max_ptr = Trie_output_empty_array(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_ptr);
2769 #endif
2770
2771 } else {
2772 if (npartners > MAX_SITES_ALLOCATED) {
2773 sites = (Splicestring_T *) CALLOC(npartners,sizeof(Splicestring_T));
2774 } else {
2775 sites = &(sites_allocated[0]);
2776 }
2777 nsites = 0;
2778 while (nsites < nsplicepartners_max[j]) {
2779 if (splicetypes[j1] == ANTIDONOR) {
2780 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2781 debug(printf(" %d",j1));
2782 sites[nsites++] = (Splicestring_T) List_head(p);
2783 }
2784 }
2785 j1++;
2786 }
2787 assert(nsites == nsplicepartners_max[j]);
2788 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2789 #ifdef USE_LIST
2790 triecontents_max_list = Trie_output_list(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_list,
2791 sites,nsites,/*charpos*/0);
2792 #else
2793 triecontents_max_ptr = Trie_output_array(&((*trieoffsets_max)[j]),&nprinted_max,triecontents_max_ptr,
2794 sites,nsites,/*charpos*/0);
2795 #endif
2796 if (npartners > MAX_SITES_ALLOCATED) {
2797 FREE(sites);
2798 }
2799 }
2800
2801 debug(printf("\n"));
2802 break;
2803
2804 default:
2805 fprintf(stderr,"Unexpected splicetype %d\n",splicetypes[j]);
2806 abort();
2807 }
2808 }
2809
2810
2811 assert(nprinted_obs == trie_obs_size);
2812 assert(nprinted_max == trie_max_size);
2813
2814
2815 if (distances_observed_p) {
2816 fprintf(stderr,"splicetrie_obs has %d entries...",nprinted_obs);
2817 #ifdef USE_LIST
2818 triecontents_obs_list = Uintlist_reverse(triecontents_obs_list);
2819 *triecontents_obs = Uintlist_to_array(&nprinted_obs,triecontents_obs_list);
2820 Uintlist_free(&triecontents_obs_list);
2821 #endif
2822 } else {
2823 *triecontents_obs = (Triecontent_T *) NULL;
2824 }
2825
2826
2827 fprintf(stderr,"splicetrie_max has %d entries...",nprinted_max);
2828 #ifdef USE_LIST
2829 triecontents_max_list = Uintlist_reverse(triecontents_max_list);
2830 *triecontents_max = Uintlist_to_array(&nprinted_max,triecontents_max_list);
2831 Uintlist_free(&triecontents_max_list);
2832 #endif
2833
2834 return;
2835 }
2836
2837
2838 #ifdef DEBUG
2839 /* Splicetype is for the anchor splice */
2840 static void
dump_sites(Splicestring_T * sites,int nsites,Splicetype_T splicetype)2841 dump_sites (Splicestring_T *sites, int nsites, Splicetype_T splicetype) {
2842 int i;
2843 char gbuffer[17];
2844
2845 gbuffer[16] = '\0';
2846
2847 for (i = 0; i < nsites; i++) {
2848 printf("%d %u",sites[i]->splicesite_i,sites[i]->splicesite);
2849 if (splicetype == DONOR || splicetype == ANTIACCEPTOR) {
2850 splicefrag_nt_rightward(gbuffer,sites[i]->string);
2851 printf(" %s (rightward)\n",gbuffer);
2852 } else {
2853 splicefrag_nt_leftward(gbuffer,sites[i]->string);
2854 printf(" %s (leftward)\n",gbuffer);
2855 }
2856 }
2857 return;
2858 }
2859 #endif
2860
2861
2862
2863 static void
build_via_introns_size(int * trie_obs_size,Univcoord_T * splicesites,Splicetype_T * splicetypes,List_T * splicestrings,int nsplicesites,Univ_IIT_T chromosome_iit,IIT_T splicing_iit,int * splicing_divint_crosstable)2864 build_via_introns_size (int *trie_obs_size, Univcoord_T *splicesites, Splicetype_T *splicetypes,
2865 List_T *splicestrings, int nsplicesites,
2866 Univ_IIT_T chromosome_iit, IIT_T splicing_iit, int *splicing_divint_crosstable) {
2867 List_T p;
2868 int nsites, j, j1;
2869 Splicestring_T *sites, sites_allocated[MAX_SITES_ALLOCATED];
2870
2871 Univcoord_T chroffset, chrhigh;
2872 Chrpos_T chrlength;
2873 Chrpos_T *coords;
2874 int ncoords, k;
2875 Chrnum_T chrnum;
2876 int divno;
2877
2878
2879 chrhigh = 0U;
2880 for (j = 0; j < nsplicesites; j++) {
2881 if (splicesites[j] > chrhigh) {
2882 chrnum = Univ_IIT_get_one(chromosome_iit,splicesites[j],splicesites[j]);
2883 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
2884 /* chrhigh += 1U; */
2885
2886 divno = splicing_divint_crosstable[chrnum];
2887 assert(divno > 0);
2888 }
2889
2890 switch (splicetypes[j]) {
2891 case DONOR:
2892 nsites = 0;
2893 coords = IIT_get_highs_for_low(&ncoords,splicing_iit,divno,splicesites[j]-chroffset);
2894
2895 j1 = j + 1;
2896 k = 0;
2897 while (j1 < nsplicesites && k < ncoords) {
2898 if (splicetypes[j1] == ACCEPTOR && splicesites[j1] == coords[k] - INTRON_HIGH_TO_LOW + chroffset) {
2899 nsites += List_length(splicestrings[j1]);
2900 k++;
2901 }
2902 j1++;
2903 }
2904 /* assertion may not hold if last few introns are invalidated */
2905 /* assert(k == ncoords); */
2906
2907 if (nsites == 0) {
2908 #if 0
2909 (*trieoffsets_obs)[j] = NULL_POINTER;
2910 #endif
2911
2912 } else {
2913 if (nsites > MAX_SITES_ALLOCATED) {
2914 sites = (Splicestring_T *) CALLOC(nsites,sizeof(Splicestring_T));
2915 } else {
2916 sites = &(sites_allocated[0]);
2917 }
2918 nsites = 0;
2919 j1 = j + 1;
2920 k = 0;
2921 while (j1 < nsplicesites && k < ncoords) {
2922 if (splicetypes[j1] == ACCEPTOR && splicesites[j1] == coords[k] - INTRON_HIGH_TO_LOW + chroffset) {
2923 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2924 sites[nsites++] = (Splicestring_T) List_head(p);
2925 }
2926 k++;
2927 }
2928 j1++;
2929 }
2930
2931 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2932 #if 0
2933 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2934 sites,nsites,/*charpos*/0);
2935 #else
2936 *trie_obs_size = Trie_output_size(*trie_obs_size,sites,nsites,/*charpos*/0);
2937 #endif
2938 if (nsites > MAX_SITES_ALLOCATED) {
2939 FREE(sites);
2940 }
2941 }
2942
2943 FREE(coords);
2944 break;
2945
2946 case ACCEPTOR:
2947 nsites = 0;
2948 coords = IIT_get_lows_for_high(&ncoords,splicing_iit,divno,splicesites[j] + INTRON_HIGH_TO_LOW - chroffset);
2949
2950 j1 = j - 1;
2951 k = 0;
2952 while (j1 >= 0 && k < ncoords) {
2953 if (splicetypes[j1] == DONOR && splicesites[j1] == coords[k] + chroffset) {
2954 nsites += List_length(splicestrings[j1]);
2955 k++;
2956 }
2957 j1--;
2958 }
2959 /* assertion may not hold if last few introns are invalidated */
2960 /* assert(k == ncoords); */
2961
2962 if (nsites == 0) {
2963 #if 0
2964 (*trieoffsets_obs)[j] = NULL_POINTER;
2965 #endif
2966
2967 } else {
2968 if (nsites > MAX_SITES_ALLOCATED) {
2969 sites = (Splicestring_T *) CALLOC(nsites,sizeof(Splicestring_T));
2970 } else {
2971 sites = &(sites_allocated[0]);
2972 }
2973 nsites = 0;
2974 j1 = j - 1;
2975 k = 0;
2976 while (j1 >= 0 && k < ncoords) {
2977 if (splicetypes[j1] == DONOR && splicesites[j1] == coords[k] + chroffset) {
2978 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
2979 sites[nsites++] = (Splicestring_T) List_head(p);
2980 }
2981 k++;
2982 }
2983 j1--;
2984 }
2985
2986 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
2987 #if 0
2988 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
2989 sites,nsites,/*charpos*/0);
2990 #else
2991 *trie_obs_size = Trie_output_size(*trie_obs_size,sites,nsites,/*charpos*/0);
2992 #endif
2993 if (nsites > MAX_SITES_ALLOCATED) {
2994 FREE(sites);
2995 }
2996 }
2997
2998 FREE(coords);
2999 break;
3000
3001 case ANTIDONOR:
3002 nsites = 0;
3003 coords = IIT_get_lows_for_high(&ncoords,splicing_iit,divno,splicesites[j] + INTRON_HIGH_TO_LOW - chroffset);
3004
3005 j1 = j - 1;
3006 k = 0;
3007 while (j1 >= 0 && k < ncoords) {
3008 if (splicetypes[j1] == ANTIACCEPTOR && splicesites[j1] == coords[k] + chroffset) {
3009 nsites += List_length(splicestrings[j1]);
3010 k++;
3011 }
3012 j1--;
3013 }
3014 /* assertion may not hold if last few introns are invalidated */
3015 /* assert(k == ncoords); */
3016
3017 if (nsites == 0) {
3018 #if 0
3019 (*trieoffsets_obs)[j] = NULL_POINTER;
3020 #endif
3021
3022 } else {
3023 if (nsites > MAX_SITES_ALLOCATED) {
3024 sites = (Splicestring_T *) CALLOC(nsites,sizeof(Splicestring_T));
3025 } else {
3026 sites = &(sites_allocated[0]);
3027 }
3028 nsites = 0;
3029 j1 = j - 1;
3030 k = 0;
3031 while (j1 >= 0 && k < ncoords) {
3032 if (splicetypes[j1] == ANTIACCEPTOR && splicesites[j1] == coords[k] + chroffset) {
3033 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
3034 sites[nsites++] = (Splicestring_T) List_head(p);
3035 }
3036 k++;
3037 }
3038 j1--;
3039 }
3040
3041 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
3042 #if 0
3043 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
3044 sites,nsites,/*charpos*/0);
3045 #else
3046 *trie_obs_size = Trie_output_size(*trie_obs_size,sites,nsites,/*charpos*/0);
3047 #endif
3048 if (nsites > MAX_SITES_ALLOCATED) {
3049 FREE(sites);
3050 }
3051 }
3052
3053 FREE(coords);
3054 break;
3055
3056 case ANTIACCEPTOR:
3057 nsites = 0;
3058 coords = IIT_get_highs_for_low(&ncoords,splicing_iit,divno,splicesites[j]-chroffset);
3059
3060 j1 = j + 1;
3061 k = 0;
3062 while (j1 < nsplicesites && k < ncoords) {
3063 if (splicetypes[j1] == ANTIDONOR && splicesites[j1] == coords[k] - INTRON_HIGH_TO_LOW + chroffset) {
3064 nsites += List_length(splicestrings[j1]);
3065 k++;
3066 }
3067 j1++;
3068 }
3069 /* assertion may not hold if last few introns are invalidated */
3070 /* assert(k == ncoords); */
3071
3072 if (nsites == 0) {
3073 #if 0
3074 (*trieoffsets_obs)[j] = NULL_POINTER;
3075 #endif
3076
3077 } else {
3078 if (nsites > MAX_SITES_ALLOCATED) {
3079 sites = (Splicestring_T *) CALLOC(nsites,sizeof(Splicestring_T));
3080 } else {
3081 sites = &(sites_allocated[0]);
3082 }
3083 nsites = 0;
3084 j1 = j + 1;
3085 k = 0;
3086 while (j1 < nsplicesites && k < ncoords) {
3087 if (splicetypes[j1] == ANTIDONOR && splicesites[j1] == coords[k] - INTRON_HIGH_TO_LOW + chroffset) {
3088 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
3089 sites[nsites++] = (Splicestring_T) List_head(p);
3090 }
3091 k++;
3092 }
3093 j1++;
3094 }
3095
3096 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
3097 #if 0
3098 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
3099 sites,nsites,/*charpos*/0);
3100 #else
3101 *trie_obs_size = Trie_output_size(*trie_obs_size,sites,nsites,/*charpos*/0);
3102 #endif
3103 if (nsites > MAX_SITES_ALLOCATED) {
3104 FREE(sites);
3105 }
3106 }
3107
3108 FREE(coords);
3109 break;
3110
3111 default:
3112 fprintf(stderr,"Unexpected splicetype %d\n",splicetypes[j]);
3113 abort();
3114 }
3115 }
3116
3117 return;
3118 }
3119
3120
3121 void
Splicetrie_build_via_introns(Triecontent_T ** triecontents_obs,Trieoffset_T ** trieoffsets_obs,Univcoord_T * splicesites,Splicetype_T * splicetypes,List_T * splicestrings,int nsplicesites,Univ_IIT_T chromosome_iit,IIT_T splicing_iit,int * splicing_divint_crosstable)3122 Splicetrie_build_via_introns (Triecontent_T **triecontents_obs, Trieoffset_T **trieoffsets_obs,
3123 Univcoord_T *splicesites, Splicetype_T *splicetypes,
3124 List_T *splicestrings, int nsplicesites,
3125 Univ_IIT_T chromosome_iit, IIT_T splicing_iit, int *splicing_divint_crosstable) {
3126 #ifdef USE_LIST
3127 Uintlist_T triecontents_obs_list = NULL;
3128 #else
3129 unsigned int *triecontents_obs_ptr;
3130 #endif
3131 List_T p;
3132 int nsites, j, j1;
3133 Splicestring_T *sites, sites_allocated[MAX_SITES_ALLOCATED];
3134 int nprinted_obs = 0;
3135 int trie_obs_size = 0;
3136
3137 Univcoord_T chroffset, chrhigh;
3138 Chrpos_T chrlength;
3139 Chrpos_T *coords;
3140 int ncoords, k;
3141 Chrnum_T chrnum;
3142 int divno;
3143
3144
3145 *trieoffsets_obs = (Trieoffset_T *) CALLOC(nsplicesites,sizeof(Trieoffset_T));
3146
3147 build_via_introns_size(&trie_obs_size,splicesites,splicetypes,splicestrings,nsplicesites,
3148 chromosome_iit,splicing_iit,splicing_divint_crosstable);
3149 #ifndef USE_LIST
3150 triecontents_obs_ptr = *triecontents_obs = (Triecontent_T *) MALLOC(trie_obs_size * sizeof(Triecontent_T));
3151 #endif
3152
3153
3154 chrhigh = 0U;
3155 for (j = 0; j < nsplicesites; j++) {
3156 if (splicesites[j] > chrhigh) {
3157 chrnum = Univ_IIT_get_one(chromosome_iit,splicesites[j],splicesites[j]);
3158 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
3159 /* chrhigh += 1U; */
3160
3161 divno = splicing_divint_crosstable[chrnum];
3162 assert(divno > 0);
3163 }
3164
3165 switch (splicetypes[j]) {
3166 case DONOR:
3167 nsites = 0;
3168 coords = IIT_get_highs_for_low(&ncoords,splicing_iit,divno,splicesites[j]-chroffset);
3169
3170 j1 = j + 1;
3171 k = 0;
3172 while (j1 < nsplicesites && k < ncoords) {
3173 if (splicetypes[j1] == ACCEPTOR && splicesites[j1] == coords[k] - INTRON_HIGH_TO_LOW + chroffset) {
3174 nsites += List_length(splicestrings[j1]);
3175 k++;
3176 }
3177 j1++;
3178 }
3179 /* assertion may not hold if last few introns are invalidated */
3180 /* assert(k == ncoords); */
3181
3182 if (nsites == 0) {
3183 (*trieoffsets_obs)[j] = NULL_POINTER;
3184
3185 } else {
3186 if (nsites > MAX_SITES_ALLOCATED) {
3187 sites = (Splicestring_T *) CALLOC(nsites,sizeof(Splicestring_T));
3188 } else {
3189 sites = &(sites_allocated[0]);
3190 }
3191 nsites = 0;
3192 j1 = j + 1;
3193 k = 0;
3194 while (j1 < nsplicesites && k < ncoords) {
3195 if (splicetypes[j1] == ACCEPTOR && splicesites[j1] == coords[k] - INTRON_HIGH_TO_LOW + chroffset) {
3196 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
3197 sites[nsites++] = (Splicestring_T) List_head(p);
3198 }
3199 k++;
3200 }
3201 j1++;
3202 }
3203
3204 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
3205 #ifdef USE_LIST
3206 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
3207 sites,nsites,/*charpos*/0);
3208 #else
3209 triecontents_obs_ptr = Trie_output_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr,
3210 sites,nsites,/*charpos*/0);
3211 #endif
3212 if (nsites > MAX_SITES_ALLOCATED) {
3213 FREE(sites);
3214 }
3215 }
3216
3217 FREE(coords);
3218 break;
3219
3220 case ACCEPTOR:
3221 nsites = 0;
3222 coords = IIT_get_lows_for_high(&ncoords,splicing_iit,divno,splicesites[j] + INTRON_HIGH_TO_LOW - chroffset);
3223
3224 j1 = j - 1;
3225 k = 0;
3226 while (j1 >= 0 && k < ncoords) {
3227 if (splicetypes[j1] == DONOR && splicesites[j1] == coords[k] + chroffset) {
3228 nsites += List_length(splicestrings[j1]);
3229 k++;
3230 }
3231 j1--;
3232 }
3233 /* assertion may not hold if last few introns are invalidated */
3234 /* assert(k == ncoords); */
3235
3236 if (nsites == 0) {
3237 (*trieoffsets_obs)[j] = NULL_POINTER;
3238
3239 } else {
3240 if (nsites > MAX_SITES_ALLOCATED) {
3241 sites = (Splicestring_T *) CALLOC(nsites,sizeof(Splicestring_T));
3242 } else {
3243 sites = &(sites_allocated[0]);
3244 }
3245 nsites = 0;
3246 j1 = j - 1;
3247 k = 0;
3248 while (j1 >= 0 && k < ncoords) {
3249 if (splicetypes[j1] == DONOR && splicesites[j1] == coords[k] + chroffset) {
3250 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
3251 sites[nsites++] = (Splicestring_T) List_head(p);
3252 }
3253 k++;
3254 }
3255 j1--;
3256 }
3257
3258 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
3259 #ifdef USE_LIST
3260 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
3261 sites,nsites,/*charpos*/0);
3262 #else
3263 triecontents_obs_ptr = Trie_output_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr,
3264 sites,nsites,/*charpos*/0);
3265 #endif
3266 if (nsites > MAX_SITES_ALLOCATED) {
3267 FREE(sites);
3268 }
3269 }
3270
3271 FREE(coords);
3272 break;
3273
3274 case ANTIDONOR:
3275 nsites = 0;
3276 coords = IIT_get_lows_for_high(&ncoords,splicing_iit,divno,splicesites[j] + INTRON_HIGH_TO_LOW - chroffset);
3277
3278 j1 = j - 1;
3279 k = 0;
3280 while (j1 >= 0 && k < ncoords) {
3281 if (splicetypes[j1] == ANTIACCEPTOR && splicesites[j1] == coords[k] + chroffset) {
3282 nsites += List_length(splicestrings[j1]);
3283 k++;
3284 }
3285 j1--;
3286 }
3287 /* assertion may not hold if last few introns are invalidated */
3288 /* assert(k == ncoords); */
3289
3290 if (nsites == 0) {
3291 (*trieoffsets_obs)[j] = NULL_POINTER;
3292
3293 } else {
3294 if (nsites > MAX_SITES_ALLOCATED) {
3295 sites = (Splicestring_T *) CALLOC(nsites,sizeof(Splicestring_T));
3296 } else {
3297 sites = &(sites_allocated[0]);
3298 }
3299 nsites = 0;
3300 j1 = j - 1;
3301 k = 0;
3302 while (j1 >= 0 && k < ncoords) {
3303 if (splicetypes[j1] == ANTIACCEPTOR && splicesites[j1] == coords[k] + chroffset) {
3304 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
3305 sites[nsites++] = (Splicestring_T) List_head(p);
3306 }
3307 k++;
3308 }
3309 j1--;
3310 }
3311
3312 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
3313 #ifdef USE_LIST
3314 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
3315 sites,nsites,/*charpos*/0);
3316 #else
3317 triecontents_obs_ptr = Trie_output_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr,
3318 sites,nsites,/*charpos*/0);
3319 #endif
3320 if (nsites > MAX_SITES_ALLOCATED) {
3321 FREE(sites);
3322 }
3323 }
3324
3325 FREE(coords);
3326 break;
3327
3328 case ANTIACCEPTOR:
3329 nsites = 0;
3330 coords = IIT_get_highs_for_low(&ncoords,splicing_iit,divno,splicesites[j]-chroffset);
3331
3332 j1 = j + 1;
3333 k = 0;
3334 while (j1 < nsplicesites && k < ncoords) {
3335 if (splicetypes[j1] == ANTIDONOR && splicesites[j1] == coords[k] - INTRON_HIGH_TO_LOW + chroffset) {
3336 nsites += List_length(splicestrings[j1]);
3337 k++;
3338 }
3339 j1++;
3340 }
3341 /* assertion may not hold if last few introns are invalidated */
3342 /* assert(k == ncoords); */
3343
3344 if (nsites == 0) {
3345 (*trieoffsets_obs)[j] = NULL_POINTER;
3346
3347 } else {
3348 if (nsites > MAX_SITES_ALLOCATED) {
3349 sites = (Splicestring_T *) CALLOC(nsites,sizeof(Splicestring_T));
3350 } else {
3351 sites = &(sites_allocated[0]);
3352 }
3353 nsites = 0;
3354 j1 = j + 1;
3355 k = 0;
3356 while (j1 < nsplicesites && k < ncoords) {
3357 if (splicetypes[j1] == ANTIDONOR && splicesites[j1] == coords[k] - INTRON_HIGH_TO_LOW + chroffset) {
3358 for (p = splicestrings[j1]; p != NULL; p = List_next(p)) {
3359 sites[nsites++] = (Splicestring_T) List_head(p);
3360 }
3361 k++;
3362 }
3363 j1++;
3364 }
3365
3366 qsort(sites,nsites,sizeof(Splicestring_T),Splicestring_cmp);
3367 #ifdef USE_LIST
3368 triecontents_obs_list = Trie_output_list(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_list,
3369 sites,nsites,/*charpos*/0);
3370 #else
3371 triecontents_obs_ptr = Trie_output_array(&((*trieoffsets_obs)[j]),&nprinted_obs,triecontents_obs_ptr,
3372 sites,nsites,/*charpos*/0);
3373 #endif
3374 if (nsites > MAX_SITES_ALLOCATED) {
3375 FREE(sites);
3376 }
3377 }
3378
3379 FREE(coords);
3380 break;
3381
3382 default:
3383 fprintf(stderr,"Unexpected splicetype %d\n",splicetypes[j]);
3384 abort();
3385 }
3386 }
3387
3388 assert(nprinted_obs == trie_obs_size);
3389
3390 fprintf(stderr,"splicetrie_obs has %d entries...",nprinted_obs);
3391 #ifdef USE_LIST
3392 triecontents_obs_list = Uintlist_reverse(triecontents_obs_list);
3393 *triecontents_obs = Uintlist_to_array(&nprinted_obs,triecontents_obs_list);
3394 Uintlist_free(&triecontents_obs_list);
3395 #endif
3396
3397 return;
3398 }
3399
3400