1 static char rcsid[] = "$Id: knownsplicing.c 218286 2019-01-23 16:46:55Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "knownsplicing.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <math.h> /* For qsort */
11
12 #include "assert.h"
13 #include "mem.h"
14 #include "univcoord.h"
15
16 #include "iitdef.h"
17 #include "interval.h"
18 #include "genome128_hr.h"
19
20
21 #define SPLICEDIST_EXTRA 10 /* Reported intron length does not include dinucleotides */
22
23
24 #ifdef DEBUG
25 #define debug(x) x
26 #else
27 #define debug(x)
28 #endif
29
30
31 /* Full dump */
32 #ifdef DEBUG2
33 #define debug2(x) x
34 #else
35 #define debug2(x)
36 #endif
37
38 /* Splicecomp */
39 #ifdef DEBUG4
40 #define debug4(x) x
41 #else
42 #define debug4(x)
43 #endif
44
45
46 #define clear_start(diff,startdiscard) (diff & (~0U << (startdiscard)))
47 #define clear_end(diff,enddiscard) (diff & ~(~0U << (enddiscard)))
48
49
50 static Genomecomp_T *splicecomp;
51
52 void
Knownsplicing_setup(Genomecomp_T * splicecomp_in)53 Knownsplicing_setup (Genomecomp_T *splicecomp_in) {
54 splicecomp = splicecomp_in;
55
56 return;
57 }
58
59
60 /* Previously was in splicetrie.c */
61 /* Modified from count_mismatches_limit */
62 bool
Knownsplicing_splicesite_p(Univcoord_T left,int pos5,int pos3)63 Knownsplicing_splicesite_p (Univcoord_T left, int pos5, int pos3) {
64 int startdiscard, enddiscard;
65 Univcoord_T startblocki, endblocki;
66 Genomecomp_T *endblock, *ptr;
67 Genomecomp_T splicesitep;
68
69 startblocki = (left+pos5)/32U;
70 endblocki = (left+pos3)/32U;
71 startdiscard = (left+pos5) % 32;
72 enddiscard = (left+pos3) % 32;
73
74 if (endblocki == startblocki) {
75 debug4(printf("** Single block **\n"));
76 splicesitep = splicecomp[startblocki];
77 splicesitep = clear_start(splicesitep,startdiscard);
78 splicesitep = clear_end(splicesitep,enddiscard);
79
80 return (splicesitep ? true : false);
81
82 } else if (endblocki == startblocki + 1) {
83 /* Only two blocks to check */
84
85 if (32 - startdiscard >= enddiscard) {
86 /* Two blocks to check and more bits counted in startblock */
87 debug4(printf("* Two blocks, start block first **\n"));
88
89 /* 1/2: Startblock */
90 splicesitep = splicecomp[startblocki];
91 splicesitep = clear_start(splicesitep,startdiscard);
92 if (splicesitep) {
93 return true;
94 }
95
96 /* 2/2: Endblock */
97 splicesitep = splicecomp[endblocki];
98 splicesitep = clear_end(splicesitep,enddiscard);
99
100 return (splicesitep ? true : false);
101
102 } else {
103 /* Two blocks to check and more bits counted in endblock */
104 debug4(printf("** Two blocks, end block first **\n"));
105
106 /* 1/2: Endblock */
107 splicesitep = splicecomp[endblocki];
108 splicesitep = clear_end(splicesitep,enddiscard);
109 if (splicesitep) {
110 return true;
111 }
112
113 /* 2/2: Startblock */
114 splicesitep = splicecomp[startblocki];
115 splicesitep = clear_start(splicesitep,startdiscard);
116 return (splicesitep ? true : false);
117 }
118
119 } else {
120 /* More than 2 blocks to check */
121 debug4(printf("** More than two blocks **\n"));
122
123 ptr = &(splicecomp[startblocki+1]);
124 endblock = &(splicecomp[endblocki]);
125
126 while (ptr < endblock) {
127 if (*ptr) {
128 return true;
129 }
130 ptr++;
131 }
132
133 if (enddiscard >= 32 - startdiscard) {
134 /* More bits in end block */
135 debug4(printf("** Final block, end block first **\n"));
136
137 /* n/n: Go first to end block */
138 splicesitep = *ptr;
139 splicesitep = clear_end(splicesitep,enddiscard);
140 if (splicesitep) {
141 return true;
142 }
143
144 /* 1/n: Go second to start block */
145 splicesitep = splicecomp[startblocki];
146 splicesitep = clear_start(splicesitep,startdiscard);
147 /* debug4(printf("adding start mask %08X\n",clear_start_mask(startdiscard))); */
148
149 return (splicesitep ? true : false);
150
151 } else {
152 /* 1/n: Go first to start block */
153 debug4(printf("** Final block, start block first **\n"));
154
155 splicesitep = splicecomp[startblocki];
156 splicesitep = clear_start(splicesitep,startdiscard);
157 if (splicesitep) {
158 return true;
159 }
160
161 /* n/n: Go second to end block */
162 splicesitep = splicecomp[endblocki];
163 splicesitep = clear_end(splicesitep,enddiscard);
164
165 return (splicesitep ? true : false);
166 }
167 }
168 }
169
170
171 /* 87654321 */
172 #define UINT4_LEFT_A 0x00000000
173 #define UINT4_LEFT_C 0x40000000
174 #define UINT4_LEFT_G 0x80000000
175 #define UINT4_LEFT_T 0xC0000000
176 #define UINT4_HIGH2 0xC0000000
177
178
179 /* Puts leftmost character into lowest bits */
180 /* For right splicestrings, we want the leftmost character in the highest bits */
181
182 #if 0
183 static Genomecomp_T
184 compress16 (bool *saw_n_p, char *buffer) {
185 Genomecomp_T low = 0U;
186 int c;
187 int i;
188
189 /* *saw_n_p = false; -- Want to check both ref and alt, so rely on caller to set */
190 for (i = 0; i < 16; i++) {
191 c = buffer[i];
192 low >>= 2;
193 switch (c) {
194 case 'A': break;
195 case 'C': low |= UINT4_LEFT_C; break;
196 case 'G': low |= UINT4_LEFT_G; break;
197 case 'T': low |= UINT4_LEFT_T; break;
198 default: *saw_n_p = true; break;
199 }
200 }
201
202 return low;
203 }
204 #endif
205
206
207 static bool
look_for_n(bool saw_n_p,char * buffer)208 look_for_n (bool saw_n_p, char *buffer) {
209 int c;
210 int i;
211
212 /* saw_n_p = false; -- Want to check both ref and alt, so rely on caller to set */
213 for (i = 0; i < 16; i++) {
214 c = buffer[i];
215 switch (c) {
216 case 'A': case 'C': case 'G': case 'T': break;
217 default: saw_n_p = true; break;
218 }
219 }
220
221 return saw_n_p;
222 }
223
224
225 #if 0
226 static Genomecomp_T
227 uint4_reverse (Genomecomp_T forward) {
228 Genomecomp_T reverse = 0U;
229 int c;
230 int i;
231
232 for (i = 0; i < 16; i++) {
233 c = forward & 0x03;
234 reverse <<= 2;
235 reverse |= c;
236 forward >>= 2;
237 }
238
239 return reverse;
240 }
241 #endif
242
243
244 Univcoord_T *
Knownsplicing_retrieve_via_splicesites(bool * distances_observed_p,Genomecomp_T ** splicecomp,Splicetype_T ** splicetypes,Chrpos_T ** splicedists,int * nsplicesites,IIT_T splicing_iit,int * splicing_divint_crosstable,int donor_typeint,int acceptor_typeint,Univ_IIT_T chromosome_iit,Genome_T genome,Genome_T genomealt,Chrpos_T shortsplicedist)245 Knownsplicing_retrieve_via_splicesites (bool *distances_observed_p, Genomecomp_T **splicecomp,
246 Splicetype_T **splicetypes, Chrpos_T **splicedists,
247 int *nsplicesites, IIT_T splicing_iit, int *splicing_divint_crosstable,
248 int donor_typeint, int acceptor_typeint, Univ_IIT_T chromosome_iit,
249 Genome_T genome, Genome_T genomealt, Chrpos_T shortsplicedist) {
250 Univcoord_T *splicesites, chroffset, chrhigh, position;
251 Chrpos_T chrlength, chrpos;
252 Univcoord_T last_donor, last_antidonor, last_acceptor, last_antiacceptor;
253 int last_donor_k, last_antidonor_k, last_acceptor_k, last_antiacceptor_k;
254 int distance;
255 int *splicesites1;
256 int divno, nsplicesites1, i, k;
257 Chrnum_T chrnum;
258 Interval_T *intervals, interval;
259 struct Interval_T *interval_structs;
260 char gbuffer_ref[17], gbuffer_alt[17], *chr;
261 char *restofheader;
262 /* char *annot; */
263 bool firstp = true, saw_n_p, allocp, alloc_header_p;
264 int ntoolong = 0;
265
266 #ifdef DEBUG2
267 List_T p;
268 #endif
269
270 int nblocks;
271
272 k = 0;
273 for (chrnum = 1; chrnum <= Univ_IIT_total_nintervals(chromosome_iit); chrnum++) {
274 if ((divno = splicing_divint_crosstable[chrnum]) > 0) {
275 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
276 /* chrlength = Univ_IIT_length(chromosome_iit,chrnum); */
277 splicesites1 = IIT_get_with_divno(&nsplicesites1,splicing_iit,divno,
278 0U,chrlength-1U,/*sortp*/false);
279 if (nsplicesites1 > 0) {
280 if (firstp == true) {
281 /* annot = */ IIT_annotation(&restofheader,splicing_iit,splicesites1[0],&alloc_header_p);
282 if (restofheader[0] == '\0') {
283 fprintf(stderr,"splice distances absent...");
284 *distances_observed_p = false;
285 } else if (sscanf(restofheader,"%d",&distance) < 1) {
286 fprintf(stderr,"splice distances absent...");
287 *distances_observed_p = false;
288 } else {
289 fprintf(stderr,"splice distances present...");
290 *distances_observed_p = true;
291 }
292 if (alloc_header_p == true) {
293 FREE(restofheader);
294 }
295 firstp = false;
296 }
297
298 intervals = (Interval_T *) CALLOC(nsplicesites1,sizeof(Interval_T));
299 for (i = 0; i < nsplicesites1; i++) {
300 intervals[i] = &(splicing_iit->intervals[divno][i]);
301 }
302 qsort(intervals,nsplicesites1,sizeof(Interval_T),Interval_cmp_low);
303
304 last_donor = last_antidonor = last_acceptor = last_antiacceptor = 0U;
305 for (i = 0; i < nsplicesites1; i++) {
306 interval = intervals[i];
307 chrpos = Interval_low(interval);
308 position = chrpos + chroffset;
309
310 if (position >= chrhigh) {
311 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
312 fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u. Discarding...",
313 chr,chrpos,chrlength);
314 if (allocp) FREE(chr);
315
316 } else if (Interval_type(interval) == donor_typeint) {
317 if (Interval_sign(interval) > 0) {
318 if (position != last_donor) {
319 last_donor = position;
320 k++;
321 }
322 } else {
323 if (position != last_antidonor) {
324 last_antidonor = position;
325 k++;
326 }
327 }
328 } else if (Interval_type(interval) == acceptor_typeint) {
329 if (Interval_sign(interval) > 0) {
330 if (position != last_acceptor) {
331 last_acceptor = position;
332 k++;
333 }
334 } else {
335 if (position != last_antiacceptor) {
336 last_antiacceptor = position;
337 k++;
338 }
339 }
340 }
341 }
342 FREE(intervals);
343 FREE(splicesites1);
344 }
345 }
346 }
347
348 *nsplicesites = k;
349 debug(printf("total unique splicesites: %d\n",*nsplicesites));
350 fprintf(stderr,"%d unique splicesites...",*nsplicesites);
351
352 /* The above procedure determines number of unique splicesites */
353
354 if (*nsplicesites == 0) {
355 *splicecomp = (Genomecomp_T *) NULL;
356 splicesites = (Univcoord_T *) NULL;
357 *splicetypes = (Splicetype_T *) NULL;
358 *splicedists = (Chrpos_T *) NULL;
359 return splicesites;
360 }
361
362 nblocks = (Genome_totallength(genome)+31)/32U;
363 *splicecomp = (Genomecomp_T *) CALLOC(nblocks,sizeof(Genomecomp_T));
364 splicesites = (Univcoord_T *) CALLOC((*nsplicesites) + 1,sizeof(Univcoord_T));
365 *splicetypes = (Splicetype_T *) CALLOC(*nsplicesites,sizeof(Splicetype_T));
366 *splicedists = (Chrpos_T *) CALLOC(*nsplicesites,sizeof(Chrpos_T));
367
368 /* Use interval_structs instead of intervals, because we want to
369 copy information and avoid creating many Interval_T objects */
370
371 k = 0;
372 for (chrnum = 1; chrnum <= Univ_IIT_total_nintervals(chromosome_iit); chrnum++) {
373 if ((divno = splicing_divint_crosstable[chrnum]) > 0) {
374 Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
375 /* chrlength = Univ_IIT_length(chromosome_iit,chrnum); */
376 splicesites1 = IIT_get_with_divno(&nsplicesites1,splicing_iit,divno,
377 0U,chrlength-1U,/*sortp*/false);
378 if (nsplicesites1 > 0) {
379 interval_structs = (struct Interval_T *) CALLOC(nsplicesites1,sizeof(struct Interval_T));
380 for (i = 0; i < nsplicesites1; i++) {
381 /* intervals[i] = &(splicing_iit->intervals[divno][i]); */
382 /* Copy so we can store distance information in Interval_high */
383 Interval_copy_existing(&(interval_structs[i]),&(splicing_iit->intervals[divno][i]));
384 if (*distances_observed_p == false) {
385 /* No, want to have essentially zero distance */
386 /* Interval_store_length(intervals[i],shortsplicedist); */
387 } else {
388 /* annot = */ IIT_annotation(&restofheader,splicing_iit,splicesites1[i],&alloc_header_p);
389 if (sscanf(restofheader,"%d",&distance) != 1) {
390 fprintf(stderr,"splicesites file missing distance in entry %s...exiting\n",
391 IIT_label(splicing_iit,splicesites1[i],&allocp));
392 exit(9);
393 } else if (distance < 0) {
394 fprintf(stderr,"splicesites file has a negative distance %d in entry %s...exiting\n",
395 distance,IIT_label(splicing_iit,splicesites1[i],&allocp));
396 exit(9);
397 } else if (distance > (int) shortsplicedist) {
398 ntoolong++;
399 Interval_store_length(&(interval_structs[i]),distance + SPLICEDIST_EXTRA); /* Previously stored shortsplicedist */
400 } else {
401 Interval_store_length(&(interval_structs[i]),distance + SPLICEDIST_EXTRA);
402 }
403 if (alloc_header_p == true) {
404 FREE(restofheader);
405 }
406 }
407 }
408
409 qsort(interval_structs,nsplicesites1,sizeof(struct Interval_T),Interval_cmp_low_struct);
410
411 last_donor = last_antidonor = last_acceptor = last_antiacceptor = 0U;
412 for (i = 0; i < nsplicesites1; i++) {
413 interval = &(interval_structs[i]);
414 chrpos = Interval_low(interval);
415 position = chrpos + chroffset;
416
417 if (position >= chrhigh) {
418 #if 0
419 /* Warning given previously */
420 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
421 fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u. Discarding...",
422 chr,chrpos,chrlength);
423 if (allocp) FREE(chr);
424 #endif
425
426 } else if (Interval_type(interval) == donor_typeint) {
427 if (Interval_sign(interval) > 0) {
428 if (position == last_donor) {
429 if (Interval_length(interval) > (*splicedists)[last_donor_k]) {
430 (*splicedists)[last_donor_k] = Interval_length(interval);
431 }
432
433 } else {
434 last_donor_k = k;
435 last_donor = splicesites[k] = position;
436
437 assert(position/32U < (unsigned int) nblocks);
438 (*splicecomp)[position/32U] |= (1 << (position % 32));
439 (*splicetypes)[k] = DONOR;
440 (*splicedists)[k] = Interval_length(interval);
441
442 saw_n_p = false;
443 Genome_fill_buffer_simple(genome,position-16,16,gbuffer_ref);
444 saw_n_p = look_for_n(saw_n_p,gbuffer_ref);
445 if (genomealt) {
446 Genome_fill_buffer_simple_alt(genome,genomealt,position-16,16,gbuffer_alt);
447 saw_n_p = look_for_n(saw_n_p,gbuffer_alt);
448 }
449
450 if (saw_n_p == true) {
451 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
452 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
453 chr,chrpos);
454 if (allocp) FREE(chr);
455
456 } else {
457 k++;
458 }
459 }
460
461 } else {
462 if (position == last_antidonor) {
463 if (Interval_length(interval) > (*splicedists)[last_antidonor_k]) {
464 (*splicedists)[last_antidonor_k] = Interval_length(interval);
465 }
466
467 } else {
468 last_antidonor_k = k;
469 last_antidonor = splicesites[k] = position;
470
471 assert(position/32U < (unsigned int) nblocks);
472 (*splicecomp)[position/32U] |= (1 << (position % 32));
473 (*splicetypes)[k] = ANTIDONOR;
474 (*splicedists)[k] = Interval_length(interval);
475
476 saw_n_p = false;
477 Genome_fill_buffer_simple(genome,position,16,gbuffer_ref);
478 saw_n_p = look_for_n(saw_n_p,gbuffer_ref);
479 if (genomealt) {
480 Genome_fill_buffer_simple_alt(genome,genomealt,position,16,gbuffer_alt);
481 saw_n_p = look_for_n(saw_n_p,gbuffer_alt);
482 }
483
484 if (saw_n_p == true) {
485 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
486 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
487 chr,chrpos);
488 if (allocp) FREE(chr);
489 } else {
490 k++;
491 }
492 }
493 }
494
495 } else if (Interval_type(interval) == acceptor_typeint) {
496 if (Interval_sign(interval) > 0) {
497 if (position == last_acceptor) {
498 if (Interval_length(interval) > (*splicedists)[last_acceptor_k]) {
499 (*splicedists)[last_acceptor_k] = Interval_length(interval);
500 }
501
502 } else {
503 last_acceptor_k = k;
504 last_acceptor = splicesites[k] = position;
505
506 assert(position/32U < (unsigned int) nblocks);
507 (*splicecomp)[position/32U] |= (1 << (position % 32));
508 (*splicetypes)[k] = ACCEPTOR;
509 (*splicedists)[k] = Interval_length(interval);
510
511 saw_n_p = false;
512 Genome_fill_buffer_simple(genome,position,16,gbuffer_ref);
513 saw_n_p = look_for_n(saw_n_p,gbuffer_ref);
514 if (genomealt) {
515 Genome_fill_buffer_simple_alt(genome,genomealt,position,16,gbuffer_alt);
516 saw_n_p = look_for_n(saw_n_p,gbuffer_alt);
517 }
518
519 if (saw_n_p == true) {
520 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
521 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
522 chr,chrpos);
523 if (allocp) FREE(chr);
524 } else {
525 k++;
526 }
527 }
528
529 } else {
530 if (position == last_antiacceptor) {
531 if (Interval_length(interval) > (*splicedists)[last_antiacceptor_k]) {
532 (*splicedists)[last_antiacceptor_k] = Interval_length(interval);
533 }
534
535 } else {
536 last_antiacceptor_k = k;
537 last_antiacceptor = splicesites[k] = position;
538
539 assert(position/32U < (unsigned int) nblocks);
540 (*splicecomp)[position/32U] |= (1 << (position % 32));
541 (*splicetypes)[k] = ANTIACCEPTOR;
542 (*splicedists)[k] = Interval_length(interval);
543
544 saw_n_p = false;
545 Genome_fill_buffer_simple(genome,position-16,16,gbuffer_ref);
546 saw_n_p = look_for_n(saw_n_p,gbuffer_ref);
547 if (genomealt) {
548 Genome_fill_buffer_simple_alt(genome,genomealt,position-16,16,gbuffer_alt);
549 saw_n_p = look_for_n(saw_n_p,gbuffer_alt);
550 }
551
552 if (saw_n_p == true) {
553 chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
554 fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u. Discarding...",
555 chr,chrpos);
556 if (allocp) FREE(chr);
557 } else {
558 k++;
559 }
560 }
561
562 }
563 }
564 }
565
566 FREE(interval_structs);
567 FREE(splicesites1);
568 }
569 }
570 }
571
572 *nsplicesites = k;
573 splicesites[*nsplicesites] = (Univcoord_T) -1; /* Marker for comparison in identify_all_segments */
574 fprintf(stderr,"\n%d splicesites are valid...",*nsplicesites);
575
576 #ifdef DEBUG2
577 for (k = 0; k < *nsplicesites; k++) {
578 printf("%d: %u %s\n",k,splicesites[k],Splicetype_string((*splicetypes)[k]));
579 }
580 #endif
581
582 if (ntoolong > 0) {
583 fprintf(stderr,"%d entries with distance > %d specified for local splice distance...",ntoolong,shortsplicedist);
584 }
585
586 return splicesites;
587 }
588
589