1 static char rcsid[] = "$Id: knownsplicing.c 218286 2019-01-23 16:46:55Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "knownsplicing.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <math.h>		/* For qsort */
11 
12 #include "assert.h"
13 #include "mem.h"
14 #include "univcoord.h"
15 
16 #include "iitdef.h"
17 #include "interval.h"
18 #include "genome128_hr.h"
19 
20 
21 #define SPLICEDIST_EXTRA 10 /* Reported intron length does not include dinucleotides */
22 
23 
24 #ifdef DEBUG
25 #define debug(x) x
26 #else
27 #define debug(x)
28 #endif
29 
30 
31 /* Full dump */
32 #ifdef DEBUG2
33 #define debug2(x) x
34 #else
35 #define debug2(x)
36 #endif
37 
38 /* Splicecomp */
39 #ifdef DEBUG4
40 #define debug4(x) x
41 #else
42 #define debug4(x)
43 #endif
44 
45 
46 #define clear_start(diff,startdiscard) (diff & (~0U << (startdiscard)))
47 #define clear_end(diff,enddiscard) (diff & ~(~0U << (enddiscard)))
48 
49 
50 static Genomecomp_T *splicecomp;
51 
52 void
Knownsplicing_setup(Genomecomp_T * splicecomp_in)53 Knownsplicing_setup (Genomecomp_T *splicecomp_in) {
54   splicecomp = splicecomp_in;
55 
56   return;
57 }
58 
59 
60 /* Previously was in splicetrie.c */
61 /* Modified from count_mismatches_limit */
62 bool
Knownsplicing_splicesite_p(Univcoord_T left,int pos5,int pos3)63 Knownsplicing_splicesite_p (Univcoord_T left, int pos5, int pos3) {
64   int startdiscard, enddiscard;
65   Univcoord_T startblocki, endblocki;
66   Genomecomp_T *endblock, *ptr;
67   Genomecomp_T splicesitep;
68 
69   startblocki = (left+pos5)/32U;
70   endblocki = (left+pos3)/32U;
71   startdiscard = (left+pos5) % 32;
72   enddiscard = (left+pos3) % 32;
73 
74   if (endblocki == startblocki) {
75     debug4(printf("** Single block **\n"));
76     splicesitep = splicecomp[startblocki];
77     splicesitep = clear_start(splicesitep,startdiscard);
78     splicesitep = clear_end(splicesitep,enddiscard);
79 
80     return (splicesitep ? true : false);
81 
82   } else if (endblocki == startblocki + 1) {
83     /* Only two blocks to check */
84 
85     if (32 - startdiscard >= enddiscard) {
86       /* Two blocks to check and more bits counted in startblock */
87       debug4(printf("* Two blocks, start block first **\n"));
88 
89       /* 1/2: Startblock */
90       splicesitep = splicecomp[startblocki];
91       splicesitep = clear_start(splicesitep,startdiscard);
92       if (splicesitep) {
93 	return true;
94       }
95 
96       /* 2/2: Endblock */
97       splicesitep = splicecomp[endblocki];
98       splicesitep = clear_end(splicesitep,enddiscard);
99 
100       return (splicesitep ? true : false);
101 
102     } else {
103       /* Two blocks to check and more bits counted in endblock */
104       debug4(printf("** Two blocks, end block first **\n"));
105 
106       /* 1/2: Endblock */
107       splicesitep = splicecomp[endblocki];
108       splicesitep = clear_end(splicesitep,enddiscard);
109       if (splicesitep) {
110 	return true;
111       }
112 
113       /* 2/2: Startblock */
114       splicesitep = splicecomp[startblocki];
115       splicesitep = clear_start(splicesitep,startdiscard);
116       return (splicesitep ? true : false);
117     }
118 
119   } else {
120     /* More than 2 blocks to check */
121     debug4(printf("** More than two blocks **\n"));
122 
123     ptr = &(splicecomp[startblocki+1]);
124     endblock = &(splicecomp[endblocki]);
125 
126     while (ptr < endblock) {
127       if (*ptr) {
128 	return true;
129       }
130       ptr++;
131     }
132 
133     if (enddiscard >= 32 - startdiscard) {
134       /* More bits in end block */
135       debug4(printf("** Final block, end block first **\n"));
136 
137       /* n/n: Go first to end block */
138       splicesitep = *ptr;
139       splicesitep = clear_end(splicesitep,enddiscard);
140       if (splicesitep) {
141 	return true;
142       }
143 
144       /* 1/n: Go second to start block */
145       splicesitep = splicecomp[startblocki];
146       splicesitep = clear_start(splicesitep,startdiscard);
147       /* debug4(printf("adding start mask %08X\n",clear_start_mask(startdiscard))); */
148 
149       return (splicesitep ? true : false);
150 
151     } else {
152       /* 1/n: Go first to start block */
153       debug4(printf("** Final block, start block first **\n"));
154 
155       splicesitep = splicecomp[startblocki];
156       splicesitep = clear_start(splicesitep,startdiscard);
157       if (splicesitep) {
158 	return true;
159       }
160 
161       /* n/n: Go second to end block */
162       splicesitep = splicecomp[endblocki];
163       splicesitep = clear_end(splicesitep,enddiscard);
164 
165       return (splicesitep ? true : false);
166     }
167   }
168 }
169 
170 
171 /*               87654321 */
172 #define UINT4_LEFT_A 0x00000000
173 #define UINT4_LEFT_C 0x40000000
174 #define UINT4_LEFT_G 0x80000000
175 #define UINT4_LEFT_T 0xC0000000
176 #define UINT4_HIGH2  0xC0000000
177 
178 
179 /* Puts leftmost character into lowest bits */
180 /* For right splicestrings, we want the leftmost character in the highest bits */
181 
182 #if 0
183 static Genomecomp_T
184 compress16 (bool *saw_n_p, char *buffer) {
185   Genomecomp_T low = 0U;
186   int c;
187   int i;
188 
189   /* *saw_n_p = false; -- Want to check both ref and alt, so rely on caller to set */
190   for (i = 0; i < 16; i++) {
191     c = buffer[i];
192     low >>= 2;
193     switch (c) {
194     case 'A': break;
195     case 'C': low |= UINT4_LEFT_C; break;
196     case 'G': low |= UINT4_LEFT_G; break;
197     case 'T': low |= UINT4_LEFT_T; break;
198     default: *saw_n_p = true; break;
199     }
200   }
201 
202   return low;
203 }
204 #endif
205 
206 
207 static bool
look_for_n(bool saw_n_p,char * buffer)208 look_for_n (bool saw_n_p, char *buffer) {
209   int c;
210   int i;
211 
212   /* saw_n_p = false; -- Want to check both ref and alt, so rely on caller to set */
213   for (i = 0; i < 16; i++) {
214     c = buffer[i];
215     switch (c) {
216     case 'A': case 'C': case 'G': case 'T': break;
217     default: saw_n_p = true; break;
218     }
219   }
220 
221   return saw_n_p;
222 }
223 
224 
225 #if 0
226 static Genomecomp_T
227 uint4_reverse (Genomecomp_T forward) {
228   Genomecomp_T reverse = 0U;
229   int c;
230   int i;
231 
232   for (i = 0; i < 16; i++) {
233     c = forward & 0x03;
234     reverse <<= 2;
235     reverse |= c;
236     forward >>= 2;
237   }
238 
239   return reverse;
240 }
241 #endif
242 
243 
244 Univcoord_T *
Knownsplicing_retrieve_via_splicesites(bool * distances_observed_p,Genomecomp_T ** splicecomp,Splicetype_T ** splicetypes,Chrpos_T ** splicedists,int * nsplicesites,IIT_T splicing_iit,int * splicing_divint_crosstable,int donor_typeint,int acceptor_typeint,Univ_IIT_T chromosome_iit,Genome_T genome,Genome_T genomealt,Chrpos_T shortsplicedist)245 Knownsplicing_retrieve_via_splicesites (bool *distances_observed_p, Genomecomp_T **splicecomp,
246 					Splicetype_T **splicetypes, Chrpos_T **splicedists,
247 					int *nsplicesites, IIT_T splicing_iit, int *splicing_divint_crosstable,
248 					int donor_typeint, int acceptor_typeint, Univ_IIT_T chromosome_iit,
249 					Genome_T genome, Genome_T genomealt, Chrpos_T shortsplicedist) {
250   Univcoord_T *splicesites, chroffset, chrhigh, position;
251   Chrpos_T chrlength, chrpos;
252   Univcoord_T last_donor, last_antidonor, last_acceptor, last_antiacceptor;
253   int last_donor_k, last_antidonor_k, last_acceptor_k, last_antiacceptor_k;
254   int distance;
255   int *splicesites1;
256   int divno, nsplicesites1, i, k;
257   Chrnum_T chrnum;
258   Interval_T *intervals, interval;
259   struct Interval_T *interval_structs;
260   char gbuffer_ref[17], gbuffer_alt[17], *chr;
261   char *restofheader;
262   /* char *annot; */
263   bool firstp = true, saw_n_p, allocp, alloc_header_p;
264   int ntoolong = 0;
265 
266 #ifdef DEBUG2
267   List_T p;
268 #endif
269 
270   int nblocks;
271 
272   k = 0;
273   for (chrnum = 1; chrnum <= Univ_IIT_total_nintervals(chromosome_iit); chrnum++) {
274     if ((divno = splicing_divint_crosstable[chrnum]) > 0) {
275       Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
276       /* chrlength = Univ_IIT_length(chromosome_iit,chrnum); */
277       splicesites1 = IIT_get_with_divno(&nsplicesites1,splicing_iit,divno,
278 					0U,chrlength-1U,/*sortp*/false);
279       if (nsplicesites1 > 0) {
280 	if (firstp == true) {
281 	  /* annot = */ IIT_annotation(&restofheader,splicing_iit,splicesites1[0],&alloc_header_p);
282 	  if (restofheader[0] == '\0') {
283 	    fprintf(stderr,"splice distances absent...");
284 	    *distances_observed_p = false;
285 	  } else if (sscanf(restofheader,"%d",&distance) < 1) {
286 	    fprintf(stderr,"splice distances absent...");
287 	    *distances_observed_p = false;
288 	  } else {
289 	    fprintf(stderr,"splice distances present...");
290 	    *distances_observed_p = true;
291 	  }
292 	  if (alloc_header_p == true) {
293 	    FREE(restofheader);
294 	  }
295 	  firstp = false;
296 	}
297 
298 	intervals = (Interval_T *) CALLOC(nsplicesites1,sizeof(Interval_T));
299 	for (i = 0; i < nsplicesites1; i++) {
300 	  intervals[i] = &(splicing_iit->intervals[divno][i]);
301 	}
302 	qsort(intervals,nsplicesites1,sizeof(Interval_T),Interval_cmp_low);
303 
304 	last_donor = last_antidonor = last_acceptor = last_antiacceptor = 0U;
305 	for (i = 0; i < nsplicesites1; i++) {
306 	  interval = intervals[i];
307 	  chrpos = Interval_low(interval);
308 	  position = chrpos + chroffset;
309 
310 	  if (position >= chrhigh) {
311 	    chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
312 	    fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u.  Discarding...",
313 		    chr,chrpos,chrlength);
314 	    if (allocp) FREE(chr);
315 
316 	  } else if (Interval_type(interval) == donor_typeint) {
317 	    if (Interval_sign(interval) > 0) {
318 	      if (position != last_donor) {
319 		last_donor = position;
320 		k++;
321 	      }
322 	    } else {
323 	      if (position != last_antidonor) {
324 		last_antidonor = position;
325 		k++;
326 	      }
327 	    }
328 	  } else if (Interval_type(interval) == acceptor_typeint) {
329 	    if (Interval_sign(interval) > 0) {
330 	      if (position != last_acceptor) {
331 		last_acceptor = position;
332 		k++;
333 	      }
334 	    } else {
335 	      if (position != last_antiacceptor) {
336 		last_antiacceptor = position;
337 		k++;
338 	      }
339 	    }
340 	  }
341 	}
342 	FREE(intervals);
343 	FREE(splicesites1);
344       }
345     }
346   }
347 
348   *nsplicesites = k;
349   debug(printf("total unique splicesites: %d\n",*nsplicesites));
350   fprintf(stderr,"%d unique splicesites...",*nsplicesites);
351 
352   /* The above procedure determines number of unique splicesites */
353 
354   if (*nsplicesites == 0) {
355     *splicecomp = (Genomecomp_T *) NULL;
356     splicesites = (Univcoord_T *) NULL;
357     *splicetypes = (Splicetype_T *) NULL;
358     *splicedists = (Chrpos_T *) NULL;
359     return splicesites;
360   }
361 
362   nblocks = (Genome_totallength(genome)+31)/32U;
363   *splicecomp = (Genomecomp_T *) CALLOC(nblocks,sizeof(Genomecomp_T));
364   splicesites = (Univcoord_T *) CALLOC((*nsplicesites) + 1,sizeof(Univcoord_T));
365   *splicetypes = (Splicetype_T *) CALLOC(*nsplicesites,sizeof(Splicetype_T));
366   *splicedists = (Chrpos_T *) CALLOC(*nsplicesites,sizeof(Chrpos_T));
367 
368   /* Use interval_structs instead of intervals, because we want to
369      copy information and avoid creating many Interval_T objects */
370 
371   k = 0;
372   for (chrnum = 1; chrnum <= Univ_IIT_total_nintervals(chromosome_iit); chrnum++) {
373     if ((divno = splicing_divint_crosstable[chrnum]) > 0) {
374       Univ_IIT_interval_bounds(&chroffset,&chrhigh,&chrlength,chromosome_iit,chrnum,/*circular_typeint*/-1);
375       /* chrlength = Univ_IIT_length(chromosome_iit,chrnum); */
376       splicesites1 = IIT_get_with_divno(&nsplicesites1,splicing_iit,divno,
377 					0U,chrlength-1U,/*sortp*/false);
378       if (nsplicesites1 > 0) {
379 	interval_structs = (struct Interval_T *) CALLOC(nsplicesites1,sizeof(struct Interval_T));
380 	for (i = 0; i < nsplicesites1; i++) {
381 	  /* intervals[i] = &(splicing_iit->intervals[divno][i]); */
382 	  /* Copy so we can store distance information in Interval_high */
383 	  Interval_copy_existing(&(interval_structs[i]),&(splicing_iit->intervals[divno][i]));
384 	  if (*distances_observed_p == false) {
385 	    /* No, want to have essentially zero distance */
386 	    /* Interval_store_length(intervals[i],shortsplicedist); */
387 	  } else {
388 	    /* annot = */ IIT_annotation(&restofheader,splicing_iit,splicesites1[i],&alloc_header_p);
389 	    if (sscanf(restofheader,"%d",&distance) != 1) {
390 	      fprintf(stderr,"splicesites file missing distance in entry %s...exiting\n",
391 		      IIT_label(splicing_iit,splicesites1[i],&allocp));
392 	      exit(9);
393 	    } else if (distance < 0) {
394 	      fprintf(stderr,"splicesites file has a negative distance %d in entry %s...exiting\n",
395 		      distance,IIT_label(splicing_iit,splicesites1[i],&allocp));
396 	      exit(9);
397 	    } else if (distance > (int) shortsplicedist) {
398 	      ntoolong++;
399 	      Interval_store_length(&(interval_structs[i]),distance + SPLICEDIST_EXTRA);  /* Previously stored shortsplicedist */
400 	    } else {
401 	      Interval_store_length(&(interval_structs[i]),distance + SPLICEDIST_EXTRA);
402 	    }
403 	    if (alloc_header_p == true) {
404 	      FREE(restofheader);
405 	    }
406 	  }
407 	}
408 
409 	qsort(interval_structs,nsplicesites1,sizeof(struct Interval_T),Interval_cmp_low_struct);
410 
411 	last_donor = last_antidonor = last_acceptor = last_antiacceptor = 0U;
412 	for (i = 0; i < nsplicesites1; i++) {
413 	  interval = &(interval_structs[i]);
414 	  chrpos = Interval_low(interval);
415 	  position = chrpos + chroffset;
416 
417 	  if (position >= chrhigh) {
418 #if 0
419 	    /* Warning given previously */
420 	    chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
421 	    fprintf(stderr,"\nSplice site %s:%u extends beyond chromosome length %u.  Discarding...",
422 		    chr,chrpos,chrlength);
423 	    if (allocp) FREE(chr);
424 #endif
425 
426 	  } else if (Interval_type(interval) == donor_typeint) {
427 	    if (Interval_sign(interval) > 0) {
428 	      if (position == last_donor) {
429 		if (Interval_length(interval) > (*splicedists)[last_donor_k]) {
430 		  (*splicedists)[last_donor_k] = Interval_length(interval);
431 		}
432 
433 	      } else {
434 		last_donor_k = k;
435 		last_donor = splicesites[k] = position;
436 
437 		assert(position/32U < (unsigned int) nblocks);
438 		(*splicecomp)[position/32U] |= (1 << (position % 32));
439 		(*splicetypes)[k] = DONOR;
440 		(*splicedists)[k] = Interval_length(interval);
441 
442 		saw_n_p = false;
443 		Genome_fill_buffer_simple(genome,position-16,16,gbuffer_ref);
444 		saw_n_p = look_for_n(saw_n_p,gbuffer_ref);
445 		if (genomealt) {
446 		  Genome_fill_buffer_simple_alt(genome,genomealt,position-16,16,gbuffer_alt);
447 		  saw_n_p = look_for_n(saw_n_p,gbuffer_alt);
448 		}
449 
450 		if (saw_n_p == true) {
451 		  chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
452 		  fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u.  Discarding...",
453 			  chr,chrpos);
454 		  if (allocp) FREE(chr);
455 
456 		} else {
457 		  k++;
458 		}
459 	      }
460 
461 	    } else {
462 	      if (position == last_antidonor) {
463 		if (Interval_length(interval) > (*splicedists)[last_antidonor_k]) {
464 		  (*splicedists)[last_antidonor_k] = Interval_length(interval);
465 		}
466 
467 	      } else {
468 		last_antidonor_k = k;
469 		last_antidonor = splicesites[k] = position;
470 
471 		assert(position/32U < (unsigned int) nblocks);
472 		(*splicecomp)[position/32U] |= (1 << (position % 32));
473 		(*splicetypes)[k] = ANTIDONOR;
474 		(*splicedists)[k] = Interval_length(interval);
475 
476 		saw_n_p = false;
477 		Genome_fill_buffer_simple(genome,position,16,gbuffer_ref);
478 		saw_n_p = look_for_n(saw_n_p,gbuffer_ref);
479 		if (genomealt) {
480 		  Genome_fill_buffer_simple_alt(genome,genomealt,position,16,gbuffer_alt);
481 		  saw_n_p = look_for_n(saw_n_p,gbuffer_alt);
482 		}
483 
484 		if (saw_n_p == true) {
485 		  chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
486 		  fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u.  Discarding...",
487 			  chr,chrpos);
488 		  if (allocp) FREE(chr);
489 		} else {
490 		  k++;
491 		}
492 	      }
493 	    }
494 
495 	  } else if (Interval_type(interval) == acceptor_typeint) {
496 	    if (Interval_sign(interval) > 0) {
497 	      if (position == last_acceptor) {
498 		if (Interval_length(interval) > (*splicedists)[last_acceptor_k]) {
499 		  (*splicedists)[last_acceptor_k] = Interval_length(interval);
500 		}
501 
502 	      } else {
503 		last_acceptor_k = k;
504 		last_acceptor = splicesites[k] = position;
505 
506 		assert(position/32U < (unsigned int) nblocks);
507 		(*splicecomp)[position/32U] |= (1 << (position % 32));
508 		(*splicetypes)[k] = ACCEPTOR;
509 		(*splicedists)[k] = Interval_length(interval);
510 
511 		saw_n_p = false;
512 		Genome_fill_buffer_simple(genome,position,16,gbuffer_ref);
513 		saw_n_p = look_for_n(saw_n_p,gbuffer_ref);
514 		if (genomealt) {
515 		  Genome_fill_buffer_simple_alt(genome,genomealt,position,16,gbuffer_alt);
516 		  saw_n_p = look_for_n(saw_n_p,gbuffer_alt);
517 		}
518 
519 		if (saw_n_p == true) {
520 		  chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
521 		  fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u.  Discarding...",
522 			  chr,chrpos);
523 		  if (allocp) FREE(chr);
524 		} else {
525 		  k++;
526 		}
527 	      }
528 
529 	    } else {
530 	      if (position == last_antiacceptor) {
531 		if (Interval_length(interval) > (*splicedists)[last_antiacceptor_k]) {
532 		  (*splicedists)[last_antiacceptor_k] = Interval_length(interval);
533 		}
534 
535 	      } else {
536 		last_antiacceptor_k = k;
537 		last_antiacceptor = splicesites[k] = position;
538 
539 		assert(position/32U < (unsigned int) nblocks);
540 		(*splicecomp)[position/32U] |= (1 << (position % 32));
541 		(*splicetypes)[k] = ANTIACCEPTOR;
542 		(*splicedists)[k] = Interval_length(interval);
543 
544 		saw_n_p = false;
545 		Genome_fill_buffer_simple(genome,position-16,16,gbuffer_ref);
546 		saw_n_p = look_for_n(saw_n_p,gbuffer_ref);
547 		if (genomealt) {
548 		  Genome_fill_buffer_simple_alt(genome,genomealt,position-16,16,gbuffer_alt);
549 		  saw_n_p = look_for_n(saw_n_p,gbuffer_alt);
550 		}
551 
552 		if (saw_n_p == true) {
553 		  chr = Univ_IIT_label(chromosome_iit,chrnum,&allocp);
554 		  fprintf(stderr,"\nNon-standard nucleotide N near splice site %s:%u.  Discarding...",
555 			  chr,chrpos);
556 		  if (allocp) FREE(chr);
557 		} else {
558 		  k++;
559 		}
560 	      }
561 
562 	    }
563 	  }
564 	}
565 
566 	FREE(interval_structs);
567 	FREE(splicesites1);
568       }
569     }
570   }
571 
572   *nsplicesites = k;
573   splicesites[*nsplicesites] = (Univcoord_T) -1; /* Marker for comparison in identify_all_segments */
574   fprintf(stderr,"\n%d splicesites are valid...",*nsplicesites);
575 
576 #ifdef DEBUG2
577   for (k = 0; k < *nsplicesites; k++) {
578     printf("%d: %u %s\n",k,splicesites[k],Splicetype_string((*splicetypes)[k]));
579   }
580 #endif
581 
582   if (ntoolong > 0) {
583     fprintf(stderr,"%d entries with distance > %d specified for local splice distance...",ntoolong,shortsplicedist);
584   }
585 
586   return splicesites;
587 }
588 
589