1 static char rcsid[] = "$Id: iit-read-univ.c 222811 2020-06-03 22:02:06Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "iit-read-univ.h"
7 #include "parserange.h"
8 
9 #ifdef WORDS_BIGENDIAN
10 #include "bigendian.h"
11 #else
12 #include "littleendian.h"
13 #endif
14 
15 #include <stdlib.h>		/* For qsort */
16 #include <string.h>		/* For memset */
17 #include <strings.h>
18 #include <ctype.h>		/* For isspace */
19 #ifdef HAVE_UNISTD_H
20 #include <unistd.h>		/* For mmap on Linux */
21 #endif
22 #ifdef HAVE_SYS_TYPES_H
23 #include <sys/types.h>		/* For open, fstat, and mmap */
24 #endif
25 /* Not sure why this was included
26 #include <sys/param.h>
27 */
28 #ifdef HAVE_FCNTL_H
29 #include <fcntl.h>		/* For open */
30 #endif
31 #ifdef HAVE_SYS_STAT_H
32 #include <sys/stat.h>		/* For open and fstat */
33 #endif
34 #include <sys/mman.h>		/* For mmap and madvise */
35 #include <math.h>		/* For qsort */
36 #include <errno.h>		/* For perror */
37 #include "assert.h"
38 #include "except.h"
39 #include "mem.h"
40 #include "access.h"
41 #include "fopen.h"
42 #include "uintlist.h"
43 #include "intlist.h"
44 
45 
46 /* Note: if sizeof(int) or sizeof(unsigned int) are not 4, then the below code is faulty */
47 
48 
49 /* Integer interval tree. */
50 
51 /*
52  * n intervals;
53  *   specified by their indices e[1..n]
54  *   and endpoint-access function:
55  *                low  (e[i])
56  *                high (e[i])
57  *        is_contained (x, e[i])
58  *   eg:
59  *        interval e[i]          ... "[" low (e[i]) "," high (e[i]) ")"
60  *        is_contained (x, e[i]) ... (    (low (e[i]) <= x
61  *                                    and (x < high (e[i]))
62  */
63 
64 /*--------------------------------------------------------------------------*/
65 
66 #ifdef DEBUG
67 #define debug(x) x
68 #else
69 #define debug(x)
70 #endif
71 
72 /* Timing */
73 #ifdef DEBUG1
74 #define debug1(x) x
75 #else
76 #define debug1(x)
77 #endif
78 
79 /* Flanking */
80 #ifdef DEBUG2
81 #define debug2(x) x
82 #else
83 #define debug2(x)
84 #endif
85 
86 /* Binary search */
87 #ifdef DEBUG3
88 #define debug3(x) x
89 #else
90 #define debug3(x)
91 #endif
92 
93 /* Straddle */
94 #ifdef DEBUG4
95 #define debug4(x) x
96 #else
97 #define debug4(x)
98 #endif
99 
100 
101 /* Cannot use version in iitdef.h, because that uses Chrpos_T for the
102    value, and utility functions need to have Univcoord_T (or UINT8 if
103    available). */
104 typedef struct Univ_FNode_T *Univ_FNode_T;
105 struct Univ_FNode_T {
106   Univ_IIT_coord_T value;
107   int a;
108   int b;
109   int leftindex;
110   int rightindex;
111 };
112 
113 #define T Univ_IIT_T
114 struct T {
115   bool coord_values_8p;
116 
117   int fd;
118   Access_T access;		/* access type */
119 
120 #ifdef HAVE_PTHREAD
121   pthread_mutex_t read_mutex;
122 #endif
123 
124   int ntypes;			/* Always >= 1 */
125 
126   int total_nintervals;
127   int nnodes;
128 
129   int *sigmas;			/* Ordering for IIT */
130   int *omegas;			/* Ordering for IIT */
131 
132   struct Univ_FNode_T *nodes;
133   struct Univinterval_T *intervals;
134 
135   UINT4 *typepointers;
136   char *typestrings;
137 
138   size_t labelorder_offset;
139   size_t labelorder_length; /* mmap length (mmap uses size_t, not off_t) */
140   char *labelorder_mmap;
141 
142   size_t labelpointers_offset;
143   size_t labelpointers_length; /* mmap length (mmap uses size_t, not off_t) */
144   char *labelpointers_mmap;
145 
146   size_t label_offset;
147   size_t label_length;		/* mmap length (mmap uses size_t, not off_t) */
148   char *label_mmap;
149 
150   size_t annotpointers_offset;
151   size_t annotpointers_length; /* mmap length (mmap uses size_t, not off_t) */
152   char *annotpointers_mmap;
153 
154   size_t annot_offset;
155   size_t annot_length;		/* mmap length (mmap uses size_t, not off_t) */
156   char *annot_mmap;
157 
158   int *labelorder;
159   UINT4 *labelpointers;
160   char *labels;
161 
162   UINT4 *annotpointers;
163   char *annotations;
164 
165   void **datapointers;
166 };
167 
168 
169 
170 static void
file_move_absolute(int fd,size_t offset,size_t objsize,Univcoord_T n)171 file_move_absolute (int fd, size_t offset, size_t objsize, Univcoord_T n) {
172   off_t position = offset + n*objsize;
173 
174   if (lseek(fd,position,SEEK_SET) < 0) {
175     perror("Error in gmap, file_move_label");
176     exit(9);
177   }
178   return;
179 }
180 
181 
182 bool
Univ_IIT_coord_values_8p(T this)183 Univ_IIT_coord_values_8p (T this) {
184   return this->coord_values_8p;
185 }
186 
187 int
Univ_IIT_total_nintervals(T this)188 Univ_IIT_total_nintervals (T this) {
189   return this->total_nintervals;
190 }
191 
192 int
Univ_IIT_ntypes(T this)193 Univ_IIT_ntypes (T this) {
194   return this->ntypes;
195 }
196 
197 Univcoord_T
Univ_IIT_length(T this,int index)198 Univ_IIT_length (T this, int index) {
199   Univinterval_T interval;
200 
201   interval = &(this->intervals[index-1]);
202   return Univinterval_length(interval);
203 }
204 
205 
206 /* Assumes intervals are stored using universal coordinates */
207 Univcoord_T
Univ_IIT_genomelength(T chromosome_iit,bool with_circular_alias_p)208 Univ_IIT_genomelength (T chromosome_iit, bool with_circular_alias_p) {
209   Univcoord_T max = 0U, this_max;
210   Univinterval_T interval;
211   int i;
212   int circular_typeint;
213 
214   circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
215 
216   for (i = 0; i < chromosome_iit->total_nintervals; i++) {
217     interval = &(chromosome_iit->intervals[i]);
218     if (with_circular_alias_p == true && Univinterval_type(interval) == circular_typeint) {
219       this_max = Univinterval_high(interval) + Univinterval_length(interval);
220     } else {
221       this_max = Univinterval_high(interval);
222     }
223     if (this_max > max) {
224       max = this_max;
225     }
226   }
227 
228   /* Convert from zero-based coordinate */
229   return max+1U;
230 }
231 
232 
233 bool *
Univ_IIT_circularp(bool * any_circular_p,T chromosome_iit)234 Univ_IIT_circularp (bool *any_circular_p, T chromosome_iit) {
235   bool *circularp;
236   Univinterval_T interval;
237   int chrnum, nchromosomes;
238   int circular_typeint;
239 
240   nchromosomes = chromosome_iit->total_nintervals;
241   circularp = (bool *) CALLOC(nchromosomes+1,sizeof(bool));
242 
243   *any_circular_p = false;
244   circularp[0] = false;		/* chrnum of 0 indicates translocation */
245   if ((circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular")) >= 0) {
246     for (chrnum = 0; chrnum < nchromosomes; chrnum++) {
247       interval = &(chromosome_iit->intervals[chrnum]);
248       if (Univinterval_type(interval) == circular_typeint) {
249 	*any_circular_p = true;
250 	circularp[chrnum+1] = true;
251       }
252     }
253   }
254 
255   return circularp;
256 }
257 
258 
259 bool *
Univ_IIT_altlocp(Univcoord_T ** alias_starts,Univcoord_T ** alias_ends,T chromosome_iit,T altscaffold_iit)260 Univ_IIT_altlocp (Univcoord_T **alias_starts, Univcoord_T **alias_ends, T chromosome_iit, T altscaffold_iit) {
261   bool *altlocp, alloc_header_p;
262   char *chr, *primary_chr, *restofheader, *p;
263   int chri, nchromosomes, alti, primary_chrnum;
264   bool revcomp;
265   Univcoord_T genomicstart, chroffset;
266   Chrpos_T genomiclength, chrstart, chrend, chrlength;
267   UINT4 start;
268   int nprimary = 0, naltloc = 0;
269   int circular_typeint;
270   Univinterval_T interval;
271 
272 
273   nchromosomes = chromosome_iit->total_nintervals;
274   altlocp = (bool *) CALLOC(nchromosomes+1,sizeof(bool));
275   *alias_starts = (Univcoord_T *) CALLOC(nchromosomes+1,sizeof(Univcoord_T));
276   *alias_ends = (Univcoord_T *) CALLOC(nchromosomes+1,sizeof(Univcoord_T));
277 
278   altlocp[0] = false;		/* chrnum of 0 indicates translocation */
279   circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
280 
281   /* Can go through all chromosomes in chromosome_iit, since it contains both primary and alternate loci */
282   for (chri = 0; chri < nchromosomes; chri++) {
283 #ifdef WORDS_BIGENDIAN
284     /* chromosome_iit should be version 1 */
285     start = Bigendian_convert_uint(chromosome_iit->labelpointers[chri]);
286 #else
287     start = chromosome_iit->labelpointers[chri];
288 #endif
289     chr = &(chromosome_iit->labels[start]);
290 
291     if ((alti = Univ_IIT_find_one(altscaffold_iit,chr)) >= 0) {
292       /* Don't need to assign result, since annot is never allocated */
293       Univ_IIT_annotation(&restofheader,altscaffold_iit,alti/*+1*/,&alloc_header_p);
294       p = restofheader;
295       while (isspace((int) *p)) {
296 	p++;
297       }
298 
299       if (naltloc > 10) {
300 	/* Don't print */
301       } else if (naltloc == 10) {
302 	fprintf(stderr,"More than 10 alternate loci.  Will stop printing.\n");
303       } else {
304 	fprintf(stderr,"Alternate locus %s maps to primary region %s\n",chr,p);
305       }
306 
307       Parserange_universal_iit(&primary_chr,&revcomp,&genomicstart,&genomiclength,
308 			       &chrstart,&chrend,&chroffset,&chrlength,
309 			       p,chromosome_iit,/*contig_iit*/NULL);
310       /* fprintf(stderr," => %s %u..%u\n",primary_chr,genomicstart,genomicstart+genomiclength); */
311 
312       if (circular_typeint < 0) {
313 	altlocp[chri+1] = true;
314 	(*alias_starts)[chri+1] = genomicstart;
315 	(*alias_ends)[chri+1] = genomicstart + genomiclength;
316 	naltloc++;
317 
318       } else {
319 	primary_chrnum = Univ_IIT_find_one(chromosome_iit,primary_chr);
320 	interval = &(chromosome_iit->intervals[primary_chrnum-1]);
321 	if (Univinterval_type(interval) != circular_typeint) {
322 	  altlocp[chri+1] = true;
323 	  (*alias_starts)[chri+1] = genomicstart;
324 	  (*alias_ends)[chri+1] = genomicstart + genomiclength;
325 	  naltloc++;
326 
327 	} else {
328 	  fprintf(stderr,"Alternate locus %s maps to region %s, which is circular and not allowed.  Ignoring.\n",chr,p);
329 	  nprimary++;
330 	}
331       }
332 
333       if (alloc_header_p) {
334 	FREE(restofheader);
335       }
336 
337       FREE(primary_chr);
338 
339     } else {
340       /* printf("Chromosome %s is not an alt locus\n",chr); */
341       nprimary++;
342     }
343   }
344 
345   fprintf(stderr,"Found %d alternate loci, and %d primary contigs (not mapping to another contig)\n",
346 	  naltloc,nprimary);
347 
348   return altlocp;
349 }
350 
351 
352 Univinterval_T
Univ_IIT_interval(T this,int index)353 Univ_IIT_interval (T this, int index) {
354   assert(index <= this->total_nintervals);
355   return &(this->intervals[index-1]); /* Convert to 0-based */
356 }
357 
358 Univcoord_T
Univ_IIT_interval_low(T this,int index)359 Univ_IIT_interval_low (T this, int index) {
360   Univinterval_T interval;
361 
362   assert(index <= this->total_nintervals);
363   interval = &(this->intervals[index-1]);
364   return Univinterval_low(interval);
365 }
366 
367 Univcoord_T
Univ_IIT_interval_high(T this,int index)368 Univ_IIT_interval_high (T this, int index) {
369   Univinterval_T interval;
370 
371   assert(index <= this->total_nintervals);
372   interval = &(this->intervals[index-1]);
373   return Univinterval_high(interval);
374 }
375 
376 Univcoord_T
Univ_IIT_interval_length(T this,int index)377 Univ_IIT_interval_length (T this, int index) {
378   Univinterval_T interval;
379 
380   assert(index <= this->total_nintervals);
381   interval = &(this->intervals[index-1]);
382   return Univinterval_length(interval);
383 }
384 
385 int
Univ_IIT_interval_type(T this,int index)386 Univ_IIT_interval_type (T this, int index) {
387   Univinterval_T interval;
388 
389   assert(index <= this->total_nintervals);
390   interval = &(this->intervals[index-1]);
391   return Univinterval_type(interval);
392 }
393 
394 
395 Univcoord_T
Univ_IIT_next_chrbound(T this,int index,int circular_typeint)396 Univ_IIT_next_chrbound (T this, int index, int circular_typeint) {
397   Univinterval_T interval;
398 
399   assert(index <= this->total_nintervals);
400   interval = &(this->intervals[index-1]);
401   if (Univinterval_type(interval) == circular_typeint) {
402     return Univinterval_high(interval) + Univinterval_length(interval);
403   } else {
404     return Univinterval_high(interval);
405   }
406 }
407 
408 /* chrhigh is one past the highest position in the chromosome */
409 void
Univ_IIT_interval_bounds(Univcoord_T * low,Univcoord_T * high,Chrpos_T * length,T this,int index,int circular_typeint)410 Univ_IIT_interval_bounds (Univcoord_T *low, Univcoord_T *high, Chrpos_T *length, T this,
411 			  int index, int circular_typeint) {
412   Univinterval_T interval;
413 
414   assert(index > 0);
415   assert(index <= this->total_nintervals);
416 
417   interval = &(this->intervals[index-1]);
418   *low = Univinterval_low(interval);
419   *length = Univinterval_length(interval);
420   if (Univinterval_type(interval) == circular_typeint) {
421     *high = Univinterval_high(interval) + 1 + (*length);
422   } else {
423     *high = Univinterval_high(interval) + 1;
424   }
425   return;
426 }
427 
428 
429 static void
Univ_IIT_interval_bounds_small(Trcoord_T * low,Trcoord_T * high,Chrpos_T * length,T this,int index)430 Univ_IIT_interval_bounds_small (Trcoord_T *low, Trcoord_T *high, Chrpos_T *length, T this,
431 				int index) {
432   Univinterval_T interval;
433 
434   assert(index > 0);
435   assert(index <= this->total_nintervals);
436 
437   interval = &(this->intervals[index-1]);
438   *low = Univinterval_low(interval);
439   *length = Univinterval_length(interval);
440   *high = Univinterval_high(interval) + 1;
441   return;
442 }
443 
444 
445 void
Univ_IIT_intervals_setup(Univcoord_T ** chroffsets,Univcoord_T ** chrhighs,Chrpos_T ** chrlengths,T this,int nchromosomes,int circular_typeint)446 Univ_IIT_intervals_setup (Univcoord_T **chroffsets, Univcoord_T **chrhighs, Chrpos_T **chrlengths,
447 			  T this, int nchromosomes, int circular_typeint) {
448   Univinterval_T interval;
449   Univcoord_T *chroffsets_ptr, *chrhighs_ptr;
450   Chrpos_T *chrlengths_ptr, length;
451   int index;
452 
453   chroffsets_ptr = *chroffsets = (Univcoord_T *) CALLOC(nchromosomes,sizeof(Univcoord_T));
454   chrhighs_ptr = *chrhighs = (Univcoord_T *) CALLOC(nchromosomes,sizeof(Univcoord_T));
455   chrlengths_ptr = *chrlengths = (Chrpos_T *) CALLOC(nchromosomes,sizeof(Chrpos_T));
456 
457   for (index = 1; index <= nchromosomes; index++) {
458     interval = &(this->intervals[index-1]);
459     *chroffsets_ptr++ = Univinterval_low(interval);
460     length = *chrlengths_ptr++ = Univinterval_length(interval);
461     if (Univinterval_type(interval) == circular_typeint) {
462       *chrhighs_ptr++ = Univinterval_high(interval) + 1 + length;
463     } else {
464       *chrhighs_ptr++ = Univinterval_high(interval) + 1;
465     }
466   }
467 
468   return;
469 }
470 
471 
472 /* divint_crosstable maps from chromosome_iit chrnums to iit divints */
473 int *
Univ_IIT_divint_crosstable(T chromosome_iit,IIT_T iit)474 Univ_IIT_divint_crosstable (T chromosome_iit, IIT_T iit) {
475   int *crosstable;
476   int chrnum, nchromosomes;
477   char *chr;
478   UINT4 start;
479 
480   nchromosomes = chromosome_iit->total_nintervals;
481   crosstable = (int *) CALLOC(nchromosomes+1,sizeof(int));
482 
483   for (chrnum = 0; chrnum < nchromosomes; chrnum++) {
484 #ifdef WORDS_BIGENDIAN
485     /* chromosome_iit should be version 1 */
486     start = Bigendian_convert_uint(chromosome_iit->labelpointers[chrnum]);
487 #else
488     start = chromosome_iit->labelpointers[chrnum];
489 #endif
490     chr = &(chromosome_iit->labels[start]);
491 
492     /* upon lookup, chrnum from Univ_IIT_get_one(chromosome_iit)
493        is 1-based, so we need to store in chrnum+1 */
494     crosstable[chrnum+1] = IIT_divint(iit,chr);
495 #if 0
496     if (crosstable[chrnum+1] < 0) {
497       fprintf(stderr,"Note: No splicesites are provided in chr %s\n",chr);
498     } else {
499       fprintf(stderr,"chrnum %d maps to splicesite divint %d\n",chrnum,crosstable[chrnum]);
500     }
501 #endif
502   }
503 
504 #if 0
505   for (chrnum = 0; chrnum < nchromosomes; chrnum++) {
506     printf("Genomic chrnum %d => IIT chrnum %d\n",chrnum+1,crosstable[chrnum+1]);
507   }
508 #endif
509 
510   return crosstable;
511 }
512 
513 
514 /* chrnum_crosstable maps from iit divints to chromosome_iit chrnums */
515 int *
Univ_IIT_chrnum_crosstable(T chromosome_iit,IIT_T iit)516 Univ_IIT_chrnum_crosstable (T chromosome_iit, IIT_T iit) {
517   int *crosstable;
518   int divint, ndivs;
519   char *chr;
520 
521   ndivs = IIT_ndivs(iit);
522   crosstable = (int *) CALLOC(ndivs+1,sizeof(int));
523 
524   for (divint = 1; divint < ndivs; divint++) {
525     chr = IIT_divstring(iit,divint);
526 
527     /* upon lookup, chrnum is 1-based, so we need to store in chrnum+1 */
528     crosstable[divint] = Univ_IIT_find_one(chromosome_iit,chr);
529   }
530 
531 #if 0
532   for (divint = 0; divint < ndivs; divint++) {
533     printf("IIT divint %d => IIT chrnum %d\n",divint+1,crosstable[divint+1]);
534   }
535 #endif
536 
537   return crosstable;
538 }
539 
540 
541 
542 /* The iit file has a '\0' after each string, so functions know where
543    it ends */
544 char *
Univ_IIT_typestring(T this,int type)545 Univ_IIT_typestring (T this, int type) {
546   UINT4 start;
547 
548   start = this->typepointers[type];
549   return &(this->typestrings[start]);
550 }
551 
552 int
Univ_IIT_typeint(T this,char * typestring)553 Univ_IIT_typeint (T this, char *typestring) {
554   int i = 0;
555   UINT4 start;
556 
557   while (i < this->ntypes) {
558     start = this->typepointers[i];
559     if (!strcmp(typestring,&(this->typestrings[start]))) {
560       return i;
561     }
562     i++;
563   }
564 
565   return -1;
566 }
567 
568 char *
Univ_IIT_label(T this,int index,bool * allocp)569 Univ_IIT_label (T this, int index, bool *allocp) {
570   int recno;
571   UINT4 start;
572 
573   recno = index - 1; /* Convert to 0-based */
574 
575 #ifdef WORDS_BIGENDIAN
576   start = Bigendian_convert_uint(this->labelpointers[recno]);
577 #else
578   start = this->labelpointers[recno];
579 #endif
580   *allocp = false;
581   return &(this->labels[start]);
582 }
583 
584 
585 static char EMPTY_STRING[1] = {'\0'};
586 
587 /* The iit file has a '\0' after each string, so functions know where
588    it ends */
589 /* Note: annotation itself is never allocated */
590 char *
Univ_IIT_annotation(char ** restofheader,T this,int index,bool * alloc_header_p)591 Univ_IIT_annotation (char **restofheader, T this, int index, bool *alloc_header_p) {
592   int recno;
593   UINT4 start;
594   char *annotation, *p;
595   int len;
596 
597   recno = index - 1; /* Convert to 0-based */
598 #ifdef WORDS_BIGENDIAN
599   start = Bigendian_convert_uint(this->annotpointers[recno]);
600 #else
601   start = this->annotpointers[recno];
602 #endif
603 
604   annotation =  &(this->annotations[start]);
605   if (annotation[0] == '\0') {
606     *restofheader = annotation; /* Both are empty strings */
607 
608     *alloc_header_p = false;
609     return annotation;
610 
611   } else if (annotation[0] == '\n') {
612     *restofheader = EMPTY_STRING;
613 
614     *alloc_header_p = false;
615     return &(annotation[1]);
616 
617   } else {
618     p = annotation;
619     while (*p != '\0' && *p != '\n') p++;
620     len = (p - annotation)/sizeof(char);
621     *restofheader = (char *) MALLOC((1+len+1)*sizeof(char));
622     *restofheader[0] = ' ';
623     strncpy(&((*restofheader)[1]),annotation,len);
624     (*restofheader)[1+len] = '\0';
625 
626     if (*p == '\n') p++;
627 
628     *alloc_header_p = true;
629     return p;
630   }
631 }
632 
633 
634 void
Univ_IIT_dump_typestrings(FILE * fp,T this)635 Univ_IIT_dump_typestrings (FILE *fp, T this) {
636   int type;
637   UINT4 start;
638 
639   for (type = 0; type < this->ntypes; type++) {
640     start = this->typepointers[type];
641     fprintf(fp,"%d\t%s\n",type,&(this->typestrings[start]));
642   }
643   return;
644 }
645 
646 void
Univ_IIT_dump(T this)647 Univ_IIT_dump (T this) {
648   int index = 0, i;
649   Univinterval_T interval;
650   Univcoord_T startpos, endpos;
651   char *label, *annotation, *restofheader;
652   bool allocp;
653 
654   for (i = 0; i < this->total_nintervals; i++) {
655     interval = &(this->intervals[i]);
656     label = Univ_IIT_label(this,index+1,&allocp);
657     printf(">%s",label);
658     if (allocp == true) {
659       FREE(label);
660     }
661     startpos = Univinterval_low(interval);
662     endpos = startpos + Univinterval_length(interval) - 1U;
663 
664     printf(" %llu..%llu",(unsigned long long) startpos,(unsigned long long) endpos);
665 
666     if (Univinterval_type(interval) > 0) {
667       printf(" %s",Univ_IIT_typestring(this,Univinterval_type(interval)));
668     }
669 
670     annotation = Univ_IIT_annotation(&restofheader,this,index+1,&allocp);
671     printf("%s\n",restofheader);
672     printf("%s",annotation);
673     if (allocp) {
674       FREE(restofheader);
675     }
676 
677     index++;
678   }
679 
680   return;
681 }
682 
683 void
Univ_IIT_dump_table(T this,bool zerobasedp)684 Univ_IIT_dump_table (T this, bool zerobasedp) {
685   int index = 0, i;
686   Univinterval_T interval;
687   Univcoord_T startpos, endpos;
688   char *label;
689   bool allocp;
690 
691   for (i = 0; i < this->total_nintervals; i++) {
692     interval = &(this->intervals[i]);
693     label = Univ_IIT_label(this,index+1,&allocp);
694     printf("%s\t",label);
695     if (allocp == true) {
696       FREE(label);
697     }
698     startpos = Univinterval_low(interval);
699     endpos = startpos + Univinterval_length(interval) - 1U;
700 
701     if (zerobasedp) {
702       printf("%llu..%llu\t",(unsigned long long) startpos,(unsigned long long) endpos);
703     } else {
704       printf("%llu..%llu\t",(unsigned long long) startpos+1,(unsigned long long) endpos+1);
705     }
706 
707     printf("%u",Univinterval_length(interval));
708     if (Univinterval_type(interval) > 0) {
709       printf("\t%s",Univ_IIT_typestring(this,Univinterval_type(interval)));
710     }
711     printf("\n");
712 
713     index++;
714   }
715 
716   return;
717 }
718 
719 void
Univ_IIT_dump_fai(T this)720 Univ_IIT_dump_fai (T this) {
721   int index = 0, i;
722   Univinterval_T interval;
723   char *label;
724   bool allocp;
725 
726   for (i = 0; i < this->total_nintervals; i++) {
727     interval = &(this->intervals[i]);
728     label = Univ_IIT_label(this,index+1,&allocp);
729     printf("%s %u\n",label,Univinterval_length(interval));
730     if (allocp == true) {
731       FREE(label);
732     }
733     index++;
734   }
735 
736   return;
737 }
738 
739 
740 #ifdef USE_MPI
741 /* For chromosome.iit file, which is stored in version 1 */
742 void
Univ_IIT_dump_sam(MPI_File fp,T this,char * sam_read_group_id,char * sam_read_group_name,char * sam_read_group_library,char * sam_read_group_platform)743 Univ_IIT_dump_sam (MPI_File fp, T this, char *sam_read_group_id, char *sam_read_group_name,
744 		   char *sam_read_group_library, char *sam_read_group_platform) {
745   int index = 0, i;
746   Univinterval_T interval;
747   Chrpos_T interval_length;
748   char *label, buffer[20];
749   bool allocp;
750   int circular_typeint;
751 
752   if (this == NULL) {
753     return;
754   } else {
755     circular_typeint = Univ_IIT_typeint(this,"circular");
756   }
757 
758   for (i = 0; i < this->total_nintervals; i++) {
759     interval = &(this->intervals[i]);
760     label = Univ_IIT_label(this,index+1,&allocp);
761     MPI_File_write_shared(fp,"@SQ\tSN:",strlen("@SQ\tSN:"),MPI_CHAR,MPI_STATUS_IGNORE);
762     MPI_File_write_shared(fp,label,strlen(label),MPI_CHAR,MPI_STATUS_IGNORE);
763     if (allocp == true) {
764       FREE(label);
765     }
766     /* startpos = Univinterval_low(interval); */
767     /* endpos = startpos + Univinterval_length(interval) - 1U; */
768 
769     interval_length = Univinterval_length(interval);
770     sprintf(buffer,"%u",interval_length);
771     MPI_File_write_shared(fp,"\tLN:%s",strlen("\tLN:")+strlen(buffer),MPI_CHAR,MPI_STATUS_IGNORE);
772     if (Univinterval_type(interval) == circular_typeint) {
773       MPI_File_write_shared(fp,"\ttp:circular",strlen("\ttp:circular"),MPI_CHAR,MPI_STATUS_IGNORE);
774     }
775     MPI_File_write_shared(fp,"\n",1,MPI_CHAR,MPI_STATUS_IGNORE);
776 
777     index++;
778   }
779 
780   if (sam_read_group_id != NULL) {
781     MPI_File_write_shared(fp,"@RG\tID:",strlen("@RG\tID:"),MPI_CHAR,MPI_STATUS_IGNORE);
782     MPI_File_write_shared(fp,sam_read_group_id,strlen(sam_read_group_id),MPI_CHAR,MPI_STATUS_IGNORE);
783 
784     if (sam_read_group_platform != NULL) {
785       MPI_File_write_shared(fp,"\tPL:",strlen("\tPL:"),MPI_CHAR,MPI_STATUS_IGNORE);
786       MPI_File_write_shared(fp,sam_read_group_platform,strlen(sam_read_group_platform),MPI_CHAR,MPI_STATUS_IGNORE);
787     }
788     if (sam_read_group_library != NULL) {
789       MPI_File_write_shared(fp,"\tLB:",strlen("\tLB:"),MPI_CHAR,MPI_STATUS_IGNORE);
790       MPI_File_write_shared(fp,sam_read_group_library,strlen(sam_read_group_library),MPI_CHAR,MPI_STATUS_IGNORE);
791     }
792     MPI_File_write_shared(fp,"\tSM:",strlen("\tSM:"),MPI_CHAR,MPI_STATUS_IGNORE);
793     MPI_File_write_shared(fp,sam_read_group_name,strlen(sam_read_group_name),MPI_CHAR,MPI_STATUS_IGNORE);
794     MPI_File_write_shared(fp,"\n",1,MPI_CHAR,MPI_STATUS_IGNORE);
795   }
796 
797   return;
798 }
799 
800 
801 int
Univ_IIT_reserve_sam(T this,char * sam_read_group_id,char * sam_read_group_name,char * sam_read_group_library,char * sam_read_group_platform)802 Univ_IIT_reserve_sam (T this, char *sam_read_group_id, char *sam_read_group_name,
803 		      char *sam_read_group_library, char *sam_read_group_platform) {
804   int nchars = 0;
805   int index = 0, i;
806   Univinterval_T interval;
807   Chrpos_T interval_length;
808   char *label, buffer[20];
809   bool allocp;
810   int circular_typeint;
811 
812   if (this == NULL) {
813     return 0;
814   } else {
815     circular_typeint = Univ_IIT_typeint(this,"circular");
816   }
817 
818   for (i = 0; i < this->total_nintervals; i++) {
819     interval = &(this->intervals[i]);
820     label = Univ_IIT_label(this,index+1,&allocp);
821     nchars += strlen("@SQ\tSN:");
822     nchars += strlen(label);
823     if (allocp == true) {
824       FREE(label);
825     }
826     /* startpos = Univinterval_low(interval); */
827     /* endpos = startpos + Univinterval_length(interval) - 1U; */
828 
829     interval_length = Univinterval_length(interval);
830     sprintf(buffer,"%u",interval_length);
831     nchars += strlen("\tLN:")+strlen(buffer);
832     if (Univinterval_type(interval) == circular_typeint) {
833       nchars += strlen("\ttp:circular");
834     }
835     nchars += strlen("\n");
836 
837     index++;
838   }
839 
840   if (sam_read_group_id != NULL) {
841     nchars += strlen("@RG\tID:");
842     nchars += strlen(sam_read_group_id);
843 
844     if (sam_read_group_platform != NULL) {
845       nchars += strlen("\tPL:");
846       nchars += strlen(sam_read_group_platform);
847     }
848     if (sam_read_group_library != NULL) {
849       nchars += strlen("\tLB:");
850       nchars += strlen(sam_read_group_library);
851     }
852     nchars += strlen("\tSM:");
853     nchars += strlen(sam_read_group_name);
854     nchars += strlen("\n");
855   }
856 
857   return nchars;
858 }
859 
860 
861 #else
862 /* For chromosome.iit file, which is stored in version 1 */
863 void
Univ_IIT_dump_sam(FILE * fp,T this,char * sam_read_group_id,char * sam_read_group_name,char * sam_read_group_library,char * sam_read_group_platform)864 Univ_IIT_dump_sam (FILE *fp, T this, char *sam_read_group_id, char *sam_read_group_name,
865 		   char *sam_read_group_library, char *sam_read_group_platform) {
866   int index = 0, i;
867   Univinterval_T interval;
868   char *label;
869   bool allocp;
870   int circular_typeint;
871 
872   if (this == NULL) {
873     return;
874   } else {
875     circular_typeint = Univ_IIT_typeint(this,"circular");
876   }
877 
878   for (i = 0; i < this->total_nintervals; i++) {
879     interval = &(this->intervals[i]);
880     label = Univ_IIT_label(this,index+1,&allocp);
881     fprintf(fp,"@SQ\tSN:%s",label);
882     if (allocp == true) {
883       FREE(label);
884     }
885     /* startpos = Univinterval_low(interval); */
886     /* endpos = startpos + Univinterval_length(interval) - 1U; */
887 
888     fprintf(fp,"\tLN:%u",Univinterval_length(interval));
889     if (Univinterval_type(interval) == circular_typeint) {
890       fprintf(fp,"\ttp:circular");
891     }
892     fprintf(fp,"\n");
893 
894     index++;
895   }
896 
897   if (sam_read_group_id != NULL) {
898     fprintf(fp,"@RG\tID:%s",sam_read_group_id);
899     if (sam_read_group_platform != NULL) {
900       fprintf(fp,"\tPL:%s",sam_read_group_platform);
901     }
902     if (sam_read_group_library != NULL) {
903       fprintf(fp,"\tLB:%s",sam_read_group_library);
904     }
905     fprintf(fp,"\tSM:%s",sam_read_group_name);
906     fprintf(fp,"\n");
907   }
908 
909   return;
910 }
911 #endif
912 
913 
914 
915 Chrpos_T *
Univ_IIT_chrlengths(T this)916 Univ_IIT_chrlengths (T this) {
917   Chrpos_T *chrlengths;
918   int i;
919   Univinterval_T interval;
920 
921   chrlengths = (Chrpos_T *) MALLOC(this->total_nintervals * sizeof(Chrpos_T));
922   for (i = 0; i < this->total_nintervals; i++) {
923     interval = &(this->intervals[i]);
924     chrlengths[i] = Univinterval_length(interval);
925   }
926 
927   return chrlengths;
928 }
929 
930 
931 void
Univ_IIT_dump_labels(FILE * fp,T this)932 Univ_IIT_dump_labels (FILE *fp, T this) {
933   int i;
934   UINT4 start;
935   char *label;
936 
937   for (i = 0; i < this->total_nintervals; i++) {
938 #ifdef WORDS_BIGENDIAN
939     start = Bigendian_convert_uint(this->labelpointers[i]);
940 #else
941     start = this->labelpointers[i];
942 #endif
943     label = &(this->labels[start]);
944     fprintf(fp,"%s ",label);
945   }
946   fprintf(fp,"\n");
947   return;
948 }
949 
950 
951 
952 /* The iit file has a '\0' after each string, so functions know where
953    it ends */
954 char
Univ_IIT_annotation_firstchar(T this,int index)955 Univ_IIT_annotation_firstchar (T this, int index) {
956   int recno;
957   UINT4 start;
958 
959   recno = index - 1; /* Convert to 0-based */
960 
961 #ifdef WORDS_BIGENDIAN
962   start = Bigendian_convert_uint(this->annotpointers[recno]);
963 #else
964   start = this->annotpointers[recno];
965 #endif
966 
967   return this->annotations[start];
968 }
969 
970 
971 
972 
973 /* For contig.iit file, which is stored in version 1 */
974 void
Univ_IIT_dump_contigs(T this,T chromosome_iit,bool directionalp)975 Univ_IIT_dump_contigs (T this, T chromosome_iit, bool directionalp) {
976   int index = 0, i, chromosome_index;
977   Univinterval_T interval;
978   Univcoord_T startpos, endpos, chroffset;
979   Chrpos_T chrstart, chrend;
980   char *label, firstchar, *chrstring;
981   bool allocp;
982 
983   for (i = 0; i < this->total_nintervals; i++) {
984     interval = &(this->intervals[i]);
985     label = Univ_IIT_label(this,index+1,&allocp);
986     printf("%s\t",label);
987     if (allocp == true) {
988       FREE(label);
989     }
990     startpos = Univinterval_low(interval);
991     endpos = startpos + Univinterval_length(interval) - 1U;
992 
993     chromosome_index = Univ_IIT_get_one(chromosome_iit,startpos,startpos);
994     chroffset = Univinterval_low(Univ_IIT_interval(chromosome_iit,chromosome_index));
995     chrstart = startpos - chroffset;
996     chrend = endpos - chroffset;
997 
998     chrstring = Univ_IIT_label(chromosome_iit,chromosome_index,&allocp);
999 
1000     if (directionalp == false) {
1001       printf("%llu..%llu\t",(unsigned long long) startpos+1U,(unsigned long long) endpos+1U);
1002       printf("%s:%u..%u\t",chrstring,chrstart+1U,chrend+1U);
1003 
1004     } else {
1005       firstchar = Univ_IIT_annotation_firstchar(this,index+1);
1006       if (firstchar == '-') {
1007 	printf("%llu..%llu\t",(unsigned long long) endpos+1U,(unsigned long long) startpos+1U);
1008 	printf("%s:%u..%u\t",chrstring,chrend+1U,chrstart+1U);
1009       } else {
1010 	printf("%llu..%llu\t",(unsigned long long) startpos+1U,(unsigned long long) endpos+1U);
1011 	printf("%s:%u..%u\t",chrstring,chrstart+1U,chrend+1U);
1012       }
1013     }
1014     if (allocp == true) {
1015       FREE(chrstring);
1016     }
1017 
1018     printf("%u",Univinterval_length(interval));
1019     if (Univinterval_type(interval) > 0) {
1020       printf("\t%s",Univ_IIT_typestring(this,Univinterval_type(interval)));
1021     }
1022     printf("\n");
1023 
1024     index++;
1025   }
1026 
1027   return;
1028 }
1029 
1030 
1031 /************************************************************************
1032  * For file format, see iit-write-univ.c
1033  ************************************************************************/
1034 
1035 void
Univ_IIT_free(T * old)1036 Univ_IIT_free (T *old) {
1037 
1038   if (*old != NULL) {
1039     if ((*old)->access == MMAPPED) {
1040 #ifdef HAVE_MMAP
1041       munmap((void *) (*old)->annot_mmap,(*old)->annot_length);
1042       munmap((void *) (*old)->annotpointers_mmap,(*old)->annotpointers_length);
1043       munmap((void *) (*old)->label_mmap,(*old)->label_length);
1044       munmap((void *) (*old)->labelpointers_mmap,(*old)->labelpointers_length);
1045       munmap((void *) (*old)->labelorder_mmap,(*old)->labelorder_length);
1046 #endif
1047       close((*old)->fd);
1048 
1049     } else if ((*old)->access == FILEIO) {
1050       FREE((*old)->annotations);
1051       FREE((*old)->annotpointers);
1052       FREE((*old)->labels);
1053       FREE((*old)->labelpointers);
1054       FREE((*old)->labelorder);
1055       /* close((*old)->fd); -- closed in read_annotations */
1056 
1057     } else if ((*old)->access == ALLOCATED_PRIVATE) {
1058       /* Nothing to close.  IIT must have been created by Univ_IIT_new. */
1059 
1060     } else if ((*old)->access == ALLOCATED_SHARED) {
1061       /* Nothing to close.  IIT must have been created by Univ_IIT_new. */
1062 
1063     } else {
1064       abort();
1065     }
1066 
1067     FREE((*old)->typestrings);
1068     FREE((*old)->typepointers);
1069 
1070     FREE((*old)->intervals);
1071 
1072     /* Note: we are depending on Mem_free() to check that these are non-NULL */
1073     FREE((*old)->nodes);
1074     FREE((*old)->omegas);
1075     FREE((*old)->sigmas);
1076 
1077     FREE(*old);
1078 
1079   }
1080 
1081   return;
1082 }
1083 
1084 
1085 
1086 static void
move_relative(FILE * fp,off_t offset)1087 move_relative (FILE *fp, off_t offset) {
1088 
1089 #ifdef HAVE_FSEEKO
1090   if (fseeko(fp,offset,SEEK_CUR) < 0) {
1091     fprintf(stderr,"Error in move_relative, seek\n");
1092     abort();
1093   }
1094 #else
1095   if (fseek(fp,(long) offset,SEEK_CUR) < 0) {
1096     fprintf(stderr,"Error in move_relative, seek\n");
1097     abort();
1098   }
1099 #endif
1100 
1101   return;
1102 }
1103 
1104 
1105 static size_t
read_tree_univ(size_t offset,size_t filesize,FILE * fp,char * filename,T new)1106 read_tree_univ (size_t offset, size_t filesize, FILE *fp, char *filename, T new) {
1107   size_t items_read;
1108   int i;
1109   UINT4 uint4;
1110 
1111   if ((offset += sizeof(int)*(new->total_nintervals+1)) > filesize) {
1112     fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after sigmas %lld, filesize %lld).  Did you generate it using iit_store?\n",
1113 	    filename,(long long int) offset,(long long int) filesize);
1114     exit(9);
1115   } else {
1116     new->sigmas = (int *) CALLOC(new->total_nintervals+1,sizeof(int));
1117     if ((items_read = FREAD_INTS(new->sigmas,new->total_nintervals+1,fp)) != (unsigned int) new->total_nintervals + 1) {
1118       fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1119       exit(9);
1120     }
1121   }
1122 
1123   if ((offset += sizeof(int)*(new->total_nintervals+1)) > filesize) {
1124     fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after omegas %lld, filesize %lld).  Did you generate it using iit_store?\n",
1125 	    filename,(long long int) offset,(long long int) filesize);
1126     exit(9);
1127   } else {
1128     new->omegas = (int *) CALLOC(new->total_nintervals+1,sizeof(int));
1129     if ((items_read = FREAD_INTS(new->omegas,new->total_nintervals+1,fp)) != (unsigned int) new->total_nintervals + 1) {
1130       fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1131       exit(9);
1132     }
1133   }
1134 
1135   debug(printf("nnodes: %d\n",new->nnodes));
1136   if (new->nnodes == 0) {
1137     new->nodes = (struct Univ_FNode_T *) NULL;
1138   } else {
1139     new->nodes = (struct Univ_FNode_T *) CALLOC(new->nnodes,sizeof(struct Univ_FNode_T));
1140 #ifdef WORDS_BIGENDIAN
1141     if (new->coord_values_8p == true) {
1142 #ifdef HAVE_64_BIT
1143       for (i = 0; i < new->nnodes; i++) {
1144 	Bigendian_fread_uint8(&(new->nodes[i].value),fp);
1145 	Bigendian_fread_int(&(new->nodes[i].a),fp);
1146 	Bigendian_fread_int(&(new->nodes[i].b),fp);
1147 	Bigendian_fread_int(&(new->nodes[i].leftindex),fp);
1148 	Bigendian_fread_int(&(new->nodes[i].rightindex),fp);
1149       }
1150       offset += (sizeof(UINT8)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
1151 #else
1152       fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit.  Cannot continue.\n");
1153       exit(9);
1154 #endif
1155     } else {
1156       for (i = 0; i < new->nnodes; i++) {
1157 	Bigendian_fread_uint(&uint4,fp);
1158 	new->nodes[i].value = (UINT8) uint4;
1159 	Bigendian_fread_int(&(new->nodes[i].a),fp);
1160 	Bigendian_fread_int(&(new->nodes[i].b),fp);
1161 	Bigendian_fread_int(&(new->nodes[i].leftindex),fp);
1162 	Bigendian_fread_int(&(new->nodes[i].rightindex),fp);
1163       }
1164       offset += (sizeof(UINT4)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
1165     }
1166 #else
1167     if (new->coord_values_8p == true) {
1168 #ifdef HAVE_64_BIT
1169 #if 1
1170       offset += sizeof(struct Univ_FNode_T)*fread(new->nodes,sizeof(struct Univ_FNode_T),new->nnodes,fp);
1171 #else
1172       for (i = 0; i < new->nnodes; i++) {
1173 	FREAD_UINT8(&(new->nodes[i].value),fp);
1174 	FREAD_INT(&(new->nodes[i].a),fp);
1175 	FREAD_INT(&(new->nodes[i].b),fp);
1176 	FREAD_INT(&(new->nodes[i].leftindex),fp);
1177 	FREAD_INT(&(new->nodes[i].rightindex),fp);
1178 	printf("i %d, node value %llu\n",i,(unsigned long long) new->nodes[i].value);
1179       }
1180       offset += (sizeof(UINT8)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
1181 #endif
1182 #else
1183       fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit.  Cannot continue.\n");
1184       exit(9);
1185 #endif
1186     } else {
1187       for (i = 0; i < new->nnodes; i++) {
1188 	FREAD_UINT(&uint4,fp);
1189 	new->nodes[i].value = (UINT8) uint4;
1190 	FREAD_INT(&(new->nodes[i].a),fp);
1191 	FREAD_INT(&(new->nodes[i].b),fp);
1192 	FREAD_INT(&(new->nodes[i].leftindex),fp);
1193 	FREAD_INT(&(new->nodes[i].rightindex),fp);
1194       }
1195       offset += (sizeof(UINT4)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
1196     }
1197 #endif
1198     if (offset > filesize) {
1199       fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after nodes %lld, filesize %lld).  Did you generate it using iit_store?\n",
1200 	      filename,(long long int) offset,(long long int) filesize);
1201       exit(9);
1202     }
1203   }
1204   debug(printf("\n"));
1205 
1206   return offset;
1207 }
1208 
1209 
1210 static size_t
read_intervals_univ(size_t offset,size_t filesize,FILE * fp,char * filename,T new)1211 read_intervals_univ (size_t offset, size_t filesize, FILE *fp, char *filename, T new) {
1212   int i;
1213   UINT4 uint4;
1214 
1215 #ifdef WORDS_BIGENDIAN
1216   if (new->coord_values_8p == true) {
1217 #ifdef HAVE_64_BIT
1218     for (i = 0; i < new->total_nintervals; i++) {
1219       Bigendian_fread_uint8(&(new->intervals[i].low),fp);
1220       Bigendian_fread_uint8(&(new->intervals[i].high),fp);
1221       Bigendian_fread_int(&(new->intervals[i].type),fp);
1222     }
1223 #else
1224     fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit.  Cannot continue.\n");
1225     exit(9);
1226 #endif
1227   } else {
1228     for (i = 0; i < new->total_nintervals; i++) {
1229       Bigendian_fread_uint(&uint4,fp);
1230       new->intervals[i].low = (Univcoord_T) uint4;
1231       Bigendian_fread_uint(&uint4,fp);
1232       new->intervals[i].high = (Univcoord_T) uint4;
1233       Bigendian_fread_int(&(new->intervals[i].type),fp);
1234     }
1235     offset += (sizeof(UINT4)+sizeof(UINT4)+sizeof(int))*new->total_nintervals;
1236   }
1237 #else
1238   if (new->coord_values_8p == true) {
1239 #ifdef HAVE_64_BIT
1240     for (i = 0; i < new->total_nintervals; i++) {
1241       FREAD_UINT8(&(new->intervals[i].low),fp);
1242       FREAD_UINT8(&(new->intervals[i].high),fp);
1243       FREAD_INT(&(new->intervals[i].type),fp);
1244     }
1245     offset += (sizeof(UINT8)+sizeof(UINT8)+sizeof(int))*new->total_nintervals;
1246 #else
1247     fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit.  Cannot continue.\n");
1248     exit(9);
1249 #endif
1250   } else {
1251     for (i = 0; i < new->total_nintervals; i++) {
1252       FREAD_UINT(&uint4,fp);
1253       new->intervals[i].low = (Univcoord_T) uint4;
1254       FREAD_UINT(&uint4,fp);
1255       new->intervals[i].high = (Univcoord_T) uint4;
1256       FREAD_INT(&(new->intervals[i].type),fp);
1257     }
1258     offset += (sizeof(UINT4)+sizeof(UINT4)+sizeof(int))*new->total_nintervals;
1259   }
1260 #endif
1261   if (offset > filesize) {
1262     fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after intervals %lld, filesize %lld).  Did you generate it using iit_store?\n",
1263 	    filename,(long long int) offset,(long long int) filesize);
1264     exit(9);
1265   }
1266 
1267   return offset;
1268 }
1269 
1270 
1271 static void
read_words(size_t offset,size_t filesize,FILE * fp,T new)1272 read_words (size_t offset, size_t filesize, FILE *fp, T new) {
1273   size_t stringlen;
1274   UINT4 length;
1275 #ifdef DEBUG
1276   int i;
1277 #endif
1278 
1279   new->typepointers = (UINT4 *) CALLOC(new->ntypes+1,sizeof(UINT4));
1280   offset += sizeof(int)*FREAD_UINTS(new->typepointers,new->ntypes+1,fp);
1281   debug(
1282 	printf("typepointers:");
1283 	for (i = 0; i < new->ntypes+1; i++) {
1284 	  printf(" %u",new->typepointers[i]);
1285 	}
1286 	printf("\n");
1287 	);
1288 
1289   stringlen = new->typepointers[new->ntypes];
1290   if (stringlen == 0) {
1291     new->typestrings = (char *) NULL;
1292   } else {
1293     new->typestrings = (char *) CALLOC(stringlen,sizeof(char));
1294     offset += sizeof(char)*FREAD_CHARS(new->typestrings,stringlen,fp);
1295   }
1296   debug(
1297 	printf("typestrings:\n");
1298 	for (s = 0; s < stringlen; s++) {
1299 	  printf("%c",new->typestrings[s]);
1300 	}
1301 	printf("\n");
1302 	);
1303 
1304   debug1(printf("Starting read of labelorder offset/length\n"));
1305   new->labelorder_offset = offset;
1306   new->labelorder_length = (size_t) (new->total_nintervals*sizeof(int));
1307   /* fprintf(stderr,"Doing a move_relative for labelorder_length %zu\n",new->labelorder_length); */
1308   move_relative(fp,new->labelorder_length);
1309   offset += new->labelorder_length;
1310 
1311   debug1(printf("Starting read of labelpointer offset/length\n"));
1312   new->labelpointers_offset = offset;
1313   new->labelpointers_length = (size_t) ((new->total_nintervals+1)*sizeof(UINT4));
1314   /* fprintf(stderr,"Doing a move_relative for labelpointer %zu\n",new->total_nintervals * sizeof(UINT4)); */
1315   move_relative(fp,new->total_nintervals * sizeof(UINT4));
1316   FREAD_UINT(&length,fp);
1317   new->label_length = (size_t) length;
1318   offset += new->labelpointers_length;
1319 
1320   debug1(printf("Starting read of label offset/length\n"));
1321   new->label_offset = offset;
1322   /* new->label_length computed above */
1323   /* fprintf(stderr,"Doing a move_relative for label_length %zu\n",new->label_length); */
1324   move_relative(fp,new->label_length);
1325   offset += new->label_length;
1326 
1327   debug1(printf("Starting read of annotpointers offset/length\n"));
1328   new->annotpointers_offset = offset;
1329   new->annotpointers_length = (size_t) ((new->total_nintervals+1)*sizeof(UINT4));
1330   offset += new->annotpointers_length;
1331 
1332   new->annot_offset = offset;
1333 
1334 #ifdef BAD_32BIT
1335   /* This fails if length > 4 GB */
1336   move_relative(fp,new->total_nintervals * sizeof(unsigned int));
1337   FREAD_UINT(&length,fp);
1338   new->annot_length = (size_t) length;
1339   fprintf(stderr,"Incorrect length: %u\n",length);
1340 #else
1341   new->annot_length = filesize - new->annot_offset;
1342   /* fprintf(stderr,"annot_length: %zu\n",new->annot_length); */
1343 #endif
1344 
1345 #if 0
1346   /* To do this check, we need to get stringlen for annotation similarly to that for labels */
1347   last_offset = offset + sizeof(char)*stringlen;
1348   if (last_offset != filesize) {
1349     fprintf(stderr,"Problem with last_offset (%lld) not equal to filesize = (%lld)\n",
1350 	    (long long int) last_offset,(long long int) filesize);
1351     exit(9);
1352   }
1353 #endif
1354 
1355   return;
1356 }
1357 
1358 static void
read_words_debug(size_t offset,size_t filesize,FILE * fp,T new)1359 read_words_debug (size_t offset, size_t filesize, FILE *fp, T new) {
1360   size_t stringlen, s;
1361   UINT4 length;
1362   int i;
1363 #if 0
1364   size_t last_offset;
1365 #endif
1366 
1367   new->typepointers = (unsigned int *) CALLOC(new->ntypes+1,sizeof(unsigned int));
1368   offset += sizeof(int)*FREAD_UINTS(new->typepointers,new->ntypes+1,fp);
1369   printf("typepointers:");
1370   for (i = 0; i < new->ntypes+1; i++) {
1371     printf(" %u",new->typepointers[i]);
1372   }
1373   printf("\n");
1374 
1375   stringlen = new->typepointers[new->ntypes];
1376   if (stringlen == 0) {
1377     new->typestrings = (char *) NULL;
1378   } else {
1379     new->typestrings = (char *) CALLOC(stringlen,sizeof(char));
1380     offset += sizeof(char)*FREAD_CHARS(new->typestrings,stringlen,fp);
1381   }
1382   printf("typestrings:\n");
1383   for (s = 0; s < stringlen; s++) {
1384     printf("%c",new->typestrings[s]);
1385   }
1386   printf("\n");
1387 
1388   debug1(printf("Starting read of labelorder offset/length\n"));
1389   new->labelorder_offset = offset;
1390   new->labelorder_length = (size_t) (new->total_nintervals*sizeof(int));
1391   move_relative(fp,new->labelorder_length);
1392   offset += new->labelorder_length;
1393 
1394   debug1(printf("Starting read of labelpointers offset/length\n"));
1395   new->labelpointers_offset = offset;
1396   new->labelpointers_length = (size_t) ((new->total_nintervals+1)*sizeof(UINT4));
1397   move_relative(fp,new->total_nintervals * sizeof(UINT4));
1398   FREAD_UINT(&length,fp);
1399   new->label_length = (size_t) length;
1400   offset += new->labelpointers_length;
1401 
1402   fprintf(stderr,"label_length: %zu\n",new->label_length);
1403   debug1(printf("Starting read of label offset/length\n"));
1404   new->label_offset = offset;
1405   /* new->label_length computed above */
1406   move_relative(fp,new->label_length);
1407   offset += new->label_length;
1408 
1409   debug1(printf("Starting read of annotpointers offset/length\n"));
1410   new->annotpointers_offset = offset;
1411   new->annotpointers_length = (size_t) ((new->total_nintervals+1)*sizeof(UINT4));
1412   offset += new->annotpointers_length;
1413 
1414   new->annot_offset = offset;
1415 
1416 #ifdef BAD_32BIT
1417   /* This fails if length > 4 GB */
1418   move_relative(fp,new->total_nintervals * sizeof(unsigned int));
1419   FREAD_UINT(&length,fp);
1420   new->annot_length = (size_t) length;
1421   fprintf(stderr,"Incorrect length: %u\n",length);
1422 #else
1423   new->annot_length = filesize - new->annot_offset;
1424   fprintf(stderr,"annot_length: %zu\n",new->annot_length);
1425 #endif
1426 
1427 #if 0
1428   /* To do this check, we need to get stringlen for annotation similarly to that for labels */
1429   last_offset = offset + sizeof(char)*stringlen;
1430   if (last_offset != filesize) {
1431     fprintf(stderr,"Problem with last_offset (%lld) not equal to filesize = (%lld)\n",
1432 	    (long long int) last_offset,(long long int) filesize);
1433     exit(9);
1434   }
1435 #endif
1436 
1437   return;
1438 }
1439 
1440 /* This function only assigns pointers.  Subsequent accesses to
1441    memory, other than char *, still need to be read correctly
1442    by bigendian machines */
1443 /* Previously allowed read/write access, but we can assume read-only access */
1444 #ifdef HAVE_MMAP
1445 static bool
mmap_annotations(char * filename,T new,bool readonlyp)1446 mmap_annotations (char *filename, T new, bool readonlyp) {
1447   int remainder;
1448 
1449   assert(readonlyp == true);
1450 
1451   if ((new->fd = open(filename,O_RDONLY,0764)) < 0) {
1452       fprintf(stderr,"Error: can't open file %s with open for reading\n",filename);
1453       exit(9);
1454   }
1455 
1456   new->labelorder_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->labelorder_offset,new->labelorder_length,
1457 						       /*randomp*/true);
1458   debug(fprintf(stderr,"labelorder_mmap is %p\n",new->labelorder_mmap));
1459   new->labelorder = (int *) &(new->labelorder_mmap[remainder]);
1460   new->labelorder_length += (size_t) remainder;
1461 
1462   new->labelpointers_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->labelpointers_offset,new->labelpointers_length,
1463 							  /*randomp*/true);
1464   debug(fprintf(stderr,"labelpointers_mmap is %p\n",new->labelpointers_mmap));
1465   new->labelpointers = (UINT4 *) &(new->labelpointers_mmap[remainder]);
1466   new->labelpointers_length += (size_t) remainder;
1467 
1468   new->label_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->label_offset,new->label_length,
1469 						  /*randomp*/true);
1470   debug(fprintf(stderr,"labels_mmap is %p\n",new->label_mmap));
1471   new->labels = (char *) &(new->label_mmap[remainder]);
1472   new->label_length += (size_t) remainder;
1473 
1474   new->annotpointers_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->annotpointers_offset,new->annotpointers_length,
1475 							/*randomp*/true);
1476   debug(fprintf(stderr,"annotpointers_mmap is %p\n",new->annotpointers_mmap));
1477   new->annotpointers = (UINT4 *) &(new->annotpointers_mmap[remainder]);
1478   new->annotpointers_length += (size_t) remainder;
1479 
1480   new->annot_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->annot_offset,new->annot_length,
1481 						/*randomp*/true);
1482   debug(fprintf(stderr,"annots_mmap is %p\n",new->annot_mmap));
1483   new->annotations = (char *) &(new->annot_mmap[remainder]);
1484   new->annot_length += (size_t) remainder;
1485 
1486 
1487   if (new->labelorder == NULL || new->labelpointers == NULL || new->labels == NULL) {
1488     fprintf(stderr,"Memory mapping failed in reading IIT file %s.  Using slow file IO instead.\n",filename);
1489     return false;
1490   }
1491 
1492   if (new->annotpointers == NULL || new->annotations == NULL) {
1493     fprintf(stderr,"Memory mapping failed in reading IIT file %s.  Using slow file IO instead.\n",filename);
1494     return false;
1495   }
1496 
1497   return true;
1498 }
1499 #endif
1500 
1501 
1502 /* Used if access is FILEIO.  Subsequent accesses by bigendian
1503    machines to anything but (char *) will still need to convert. */
1504 static void
read_annotations(T new)1505 read_annotations (T new) {
1506 
1507   file_move_absolute(new->fd,new->labelorder_offset,sizeof(int),/*n*/0);
1508   new->labelorder = (int *) CALLOC(new->total_nintervals,sizeof(int));
1509   read(new->fd,new->labelorder,new->total_nintervals*sizeof(int));
1510 
1511   file_move_absolute(new->fd,new->labelpointers_offset,sizeof(UINT4),/*n*/0);
1512   new->labelpointers = (UINT4 *) CALLOC(new->total_nintervals+1,sizeof(UINT4));
1513   read(new->fd,new->labelpointers,(new->total_nintervals+1)*sizeof(UINT4));
1514 
1515   file_move_absolute(new->fd,new->label_offset,sizeof(char),/*n*/0);
1516   new->labels = (char *) CALLOC(new->label_length,sizeof(char));
1517   read(new->fd,new->labels,new->label_length*sizeof(char));
1518 
1519   file_move_absolute(new->fd,new->annotpointers_offset,sizeof(UINT4),/*n*/0);
1520   new->annotpointers = (UINT4 *) CALLOC(new->total_nintervals+1,sizeof(UINT4));
1521   read(new->fd,new->annotpointers,(new->total_nintervals+1)*sizeof(UINT4));
1522 
1523   file_move_absolute(new->fd,new->annot_offset,sizeof(char),/*n*/0);
1524   new->annotations = (char *) CALLOC(new->annot_length,sizeof(char));
1525   read(new->fd,new->annotations,new->annot_length*sizeof(char));
1526 
1527   return;
1528 }
1529 
1530 
1531 
1532 T
Univ_IIT_read(char * filename,bool readonlyp,bool add_iit_p)1533 Univ_IIT_read (char *filename, bool readonlyp, bool add_iit_p) {
1534   T new;
1535   FILE *fp;
1536   char *newfile = NULL;
1537   size_t offset = 0, filesize;
1538 
1539   /* printf("Reading IIT file %s\n",filename); */
1540   if (add_iit_p == true) {
1541     newfile = (char *) CALLOC(strlen(filename)+strlen(".iit")+1,sizeof(char));
1542     sprintf(newfile,"%s.iit",filename);
1543     if ((fp = FOPEN_READ_BINARY(newfile)) != NULL) {
1544       filename = newfile;
1545     } else if ((fp = FOPEN_READ_BINARY(filename)) == NULL) {
1546       /* fprintf(stderr,"Cannot open IIT file %s or %s\n",filename,newfile); */
1547       FREE(newfile);
1548       return NULL;
1549     }
1550   } else if ((fp = FOPEN_READ_BINARY(filename)) == NULL) {
1551     /* fprintf(stderr,"Cannot open IIT file %s\n",filename); */
1552     return NULL;
1553   }
1554 
1555   new = (T) MALLOC(sizeof(*new));
1556 
1557   filesize = Access_filesize(filename);
1558 
1559   if (FREAD_INT(&new->total_nintervals,fp) < 1) {
1560     fprintf(stderr,"IIT file %s appears to be empty\n",filename);
1561     return NULL;
1562   } else if ((offset += sizeof(int)) > filesize) {
1563     fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after first byte %lld, filesize %lld).  Did you generate it using iit_store?\n",
1564 	    filename,(long long int) offset,(long long int) filesize);
1565     return NULL;
1566   }
1567 
1568   if (new->total_nintervals < 0) {
1569 #ifdef LARGE_GENOMES
1570     new->coord_values_8p = true;
1571     new->total_nintervals = -new->total_nintervals;
1572 #elif defined(UTILITYP)
1573     new->coord_values_8p = true;
1574     new->total_nintervals = -new->total_nintervals;
1575 #else
1576     fprintf(stderr,"This is a large genome of more than 2^32 (4 billion) bp.\n");
1577 #ifdef GSNAP
1578     fprintf(stderr,"You should run gsnapl instead.\n");
1579 #else
1580     fprintf(stderr,"You should run gmapl instead.\n");
1581 #endif
1582     exit(9);
1583 #endif
1584 
1585   } else if (new->total_nintervals > 0) {
1586     new->coord_values_8p = false;
1587 
1588   } else {
1589     abort();
1590   }
1591 
1592   debug(printf("version: 1\n"));
1593   debug(printf("total_nintervals: %d\n",new->total_nintervals));
1594   debug(printf("coord values 8p: %d\n",new->coord_values_8p));
1595 
1596 
1597   if (FREAD_INT(&new->ntypes,fp) < 1) {
1598     fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1599     return NULL;
1600   } else if (new->ntypes < 0) {
1601     fprintf(stderr,"IIT file %s appears to have a negative number of types\n",filename);
1602     return NULL;
1603   } else if ((offset += sizeof(int)) > filesize) {
1604     fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after ntypes %lld, filesize %lld).  Did you generate it using iit_store?\n",
1605 	    filename,(long long int) offset,(long long int) filesize);
1606     return NULL;
1607   }
1608   debug(printf("ntypes: %d\n",new->ntypes));
1609 
1610   if (FREAD_INT(&new->nnodes,fp) < 1) {
1611     fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1612     return NULL;
1613   } else if (new->nnodes < 0) {
1614     fprintf(stderr,"IIT file %s appears to have a negative number of nodes\n",filename);
1615     return NULL;
1616   } else if ((offset += sizeof(int)) > filesize) {
1617     fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after nnodes %lld, filesize %lld).  Did you generate it using iit_store?\n",
1618 	    filename,(long long int) offset,(long long int) filesize);
1619     return NULL;
1620   }
1621 
1622   /* new->divsort = NO_SORT; */
1623   offset = read_tree_univ(offset,filesize,fp,filename,new);
1624 
1625   new->intervals = (struct Univinterval_T *) CALLOC(new->total_nintervals,sizeof(struct Univinterval_T));
1626   offset = read_intervals_univ(offset,filesize,fp,filename,new);
1627 
1628   read_words(offset,filesize,fp,new);
1629   fclose(fp);
1630 
1631 #ifndef HAVE_MMAP
1632   debug1(printf("No mmap available.  Reading annotations\n"));
1633   new->access = FILEIO;
1634   new->fd = Access_fileio(filename);
1635   read_annotations(new);
1636   close(new->fd);
1637   /* pthread_mutex_init(&new->read_mutex,NULL); */
1638 #else
1639   debug1(printf("mmap available.  Setting up pointers to annotations\n"));
1640   new->access = MMAPPED;
1641   if (mmap_annotations(filename,new,readonlyp) == false) {
1642     debug1(printf("  Failed.  Reading annotations\n"));
1643     new->access = FILEIO;
1644     new->fd = Access_fileio(filename);
1645     read_annotations(new);
1646     close(new->fd);
1647     /* pthread_mutex_init(&new->read_mutex,NULL); */
1648   }
1649 #endif
1650 
1651   if (newfile != NULL) {
1652     FREE(newfile);
1653   }
1654 
1655   return new;
1656 }
1657 
1658 
1659 void
Univ_IIT_debug(char * filename)1660 Univ_IIT_debug (char *filename) {
1661   T new;
1662   FILE *fp;
1663   char *newfile = NULL;
1664   size_t offset = 0, filesize;
1665   bool add_iit_p = false;
1666 
1667   if (add_iit_p == true) {
1668     newfile = (char *) CALLOC(strlen(filename)+strlen(".iit")+1,sizeof(char));
1669     sprintf(newfile,"%s.iit",filename);
1670     if ((fp = FOPEN_READ_BINARY(newfile)) != NULL) {
1671       filename = newfile;
1672     } else if ((fp = FOPEN_READ_BINARY(filename)) == NULL) {
1673       /* fprintf(stderr,"Cannot open IIT file %s or %s\n",filename,newfile); */
1674       FREE(newfile);
1675       return;
1676     }
1677   } else if ((fp = FOPEN_READ_BINARY(filename)) == NULL) {
1678     /* fprintf(stderr,"Cannot open IIT file %s\n",filename); */
1679     return;
1680   }
1681 
1682   new = (T) MALLOC(sizeof(*new));
1683 
1684   filesize = Access_filesize(filename);
1685 
1686   if (FREAD_INT(&new->total_nintervals,fp) < 1) {
1687     fprintf(stderr,"IIT file %s appears to be empty\n",filename);
1688     return;
1689   } else if ((offset += sizeof(int)) > filesize) {
1690     fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after first byte %lld, filesize %lld).  Did you generate it using iit_store?\n",
1691 	    filename,(long long int) offset,(long long int) filesize);
1692     return;
1693   }
1694 
1695   if (new->total_nintervals < 0) {
1696 #ifdef HAVE_64_BIT
1697     new->coord_values_8p = true;
1698     new->total_nintervals = -new->total_nintervals;
1699 #else
1700     fprintf(stderr,"IIT file has 64-bit coordinates, but this machine is only 32-bit, so it cannot process it.\n");
1701     exit(9);
1702 #endif
1703 
1704   } else if (new->total_nintervals > 0) {
1705     new->coord_values_8p = false;
1706 
1707   } else {
1708     abort();
1709   }
1710 
1711   printf("version: 1\n");
1712   printf("total_nintervals: %d\n",new->total_nintervals);
1713   printf("coord values 8p: %d\n",new->coord_values_8p);
1714 
1715 
1716   if (FREAD_INT(&new->ntypes,fp) < 1) {
1717     fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1718     return;
1719   } else if (new->ntypes < 0) {
1720     fprintf(stderr,"IIT file %s appears to have a negative number of types\n",filename);
1721     return;
1722   } else if ((offset += sizeof(int)) > filesize) {
1723     fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after ntypes %lld, filesize %lld).  Did you generate it using iit_store?\n",
1724 	    filename,(long long int) offset,(long long int) filesize);
1725     return;
1726   }
1727   printf("ntypes: %d\n",new->ntypes);
1728 
1729   if (FREAD_INT(&new->nnodes,fp) < 1) {
1730     fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1731     return;
1732   } else if (new->nnodes < 0) {
1733     fprintf(stderr,"IIT file %s appears to have a negative number of nodes\n",filename);
1734     return;
1735   } else if ((offset += sizeof(int)) > filesize) {
1736     fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after nnodes %lld, filesize %lld).  Did you generate it using iit_store?\n",
1737 	    filename,(long long int) offset,(long long int) filesize);
1738     return;
1739   }
1740 
1741   /* new->divsort = NO_SORT; */
1742   offset = read_tree_univ(offset,filesize,fp,filename,new);
1743 
1744   new->intervals = (struct Univinterval_T *) CALLOC(new->total_nintervals,sizeof(struct Univinterval_T));
1745   offset = read_intervals_univ(offset,filesize,fp,filename,new);
1746 
1747   read_words_debug(offset,filesize,fp,new);
1748   fclose(fp);
1749 
1750 #ifndef HAVE_MMAP
1751   debug1(printf("No mmap available.  Reading annotations\n"));
1752   new->access = FILEIO;
1753   new->fd = Access_fileio(filename);
1754   read_annotations(new);
1755   close(new->fd);
1756   /* pthread_mutex_init(&new->read_mutex,NULL); */
1757 #else
1758   debug1(printf("mmap available.  Setting up pointers to annotations\n"));
1759   new->access = MMAPPED;
1760   if (mmap_annotations(filename,new,/*readonlyp*/true) == false) {
1761     debug1(printf("  Failed.  Reading annotations\n"));
1762     new->access = FILEIO;
1763     new->fd = Access_fileio(filename);
1764     read_annotations(new);
1765     close(new->fd);
1766     /* pthread_mutex_init(&new->read_mutex,NULL); */
1767   }
1768 #endif
1769 
1770   if (newfile != NULL) {
1771     FREE(newfile);
1772   }
1773 
1774   Univ_IIT_free(&new);
1775 
1776   return;
1777 }
1778 
1779 
1780 /************************************************************************/
1781 
1782 static void
fnode_query_aux(int * min,int * max,T this,int nodeindex,Univcoord_T x)1783 fnode_query_aux (int *min, int *max, T this, int nodeindex, Univcoord_T x) {
1784   int lambda;
1785   Univ_FNode_T node;
1786 
1787   if (nodeindex == -1) {
1788     return;
1789   }
1790 
1791   node = &(this->nodes[nodeindex]);
1792   debug(printf("Entered fnode_query_aux with nodeindex %d: a %d, b %d, leftindex %d, rightindex %d, value %llu\n",
1793 	       nodeindex,node->a,node->b,node->leftindex,node->rightindex,(unsigned long long) node->value));
1794 
1795   if (x == node->value) {
1796     debug(printf("%lluD:\n",(unsigned long long) node->value));
1797     if (node->a < *min) {
1798       *min = node->a;
1799     }
1800     if (node->b > *max) {
1801       *max = node->b;
1802     }
1803     return;
1804   } else if (x < node->value) {
1805     debug(printf("x %llu < node->value %llu\n",(unsigned long long) x,(unsigned long long) node->value));
1806     fnode_query_aux(&(*min),&(*max),this,node->leftindex,x);
1807     debug(printf("%lluL:\n",(unsigned long long) node->value));
1808     if (node->a < *min) {
1809       *min = node->a;
1810     }
1811     for (lambda = node->a; lambda <= node->b; lambda++) {
1812       debug(printf("Looking at lambda %d, segment %d\n",
1813 		   lambda,this->sigmas[lambda]));
1814       if (Univinterval_is_contained(x,this->intervals,this->sigmas[lambda]) == true) {
1815 	if (lambda > *max) {
1816 	  *max = lambda;
1817 	}
1818       } else {
1819 	return;
1820       }
1821     }
1822     return;
1823   } else {
1824     /* (node->value < x) */
1825     debug(printf("x %llu > node->value %llu\n",(unsigned long long) x,(unsigned long long) node->value));
1826     fnode_query_aux(&(*min),&(*max),this,node->rightindex,x);
1827     debug(printf("%lluR:\n",(unsigned long long) node->value));
1828     if (node->b > *max) {
1829       *max = node->b;
1830     }
1831     for (lambda = node->b; lambda >= node->a; lambda--) {
1832       debug(printf("Looking at lambda %d, segment %d\n",
1833 		   lambda,this->omegas[lambda]));
1834       if (Univinterval_is_contained(x,this->intervals,this->omegas[lambda]) == true) {
1835 	if (lambda < *min) {
1836 	  *min = lambda;
1837 	}
1838       } else {
1839 	return;
1840       }
1841     }
1842     return;
1843   }
1844 }
1845 
1846 /************************************************************************/
1847 
1848 int *
Univ_IIT_find(int * nmatches,T this,char * label)1849 Univ_IIT_find (int *nmatches, T this, char *label) {
1850   int *matches = NULL, j;
1851   int low, middle, high, recno;
1852   bool foundp = false;
1853   int cmp;
1854 
1855   low = 0;
1856   high = this->total_nintervals;
1857   *nmatches = 0;
1858 
1859   while (!foundp && low < high) {
1860     middle = low + (high - low)/2;
1861 
1862 #ifdef WORDS_BIGENDIAN
1863     cmp = strcmp(label,&(this->labels[Bigendian_convert_uint(this->labelpointers[Bigendian_convert_int(this->labelorder[middle])])]));
1864 #else
1865     cmp = strcmp(label,&(this->labels[this->labelpointers[this->labelorder[middle]]]));
1866 #endif
1867 
1868     if (cmp < 0) {
1869       high = middle;
1870     } else if (cmp > 0) {
1871       low = middle + 1;
1872     } else {
1873       foundp = true;
1874     }
1875   }
1876 
1877   if (foundp == true) {
1878     low = middle;
1879 #ifdef WORDS_BIGENDIAN
1880     while (low-1 >= 0 &&
1881 	   !strcmp(label,&(this->labels[Bigendian_convert_uint(this->labelpointers[Bigendian_convert_int(this->labelorder[low-1])])]))) {
1882       low--;
1883     }
1884 #else
1885     while (low-1 >= 0 &&
1886 	   !strcmp(label,&(this->labels[this->labelpointers[this->labelorder[low-1]]]))) {
1887       low--;
1888     }
1889 #endif
1890 
1891     high = middle;
1892 #ifdef WORDS_BIGENDIAN
1893     while (high+1 < this->total_nintervals &&
1894 	   !strcmp(label,&(this->labels[Bigendian_convert_uint(this->labelpointers[Bigendian_convert_int(this->labelorder[high+1])])]))) {
1895       high++;
1896     }
1897 #else
1898     while (high+1 < this->total_nintervals &&
1899 	   !strcmp(label,&(this->labels[this->labelpointers[this->labelorder[high+1]]]))) {
1900       high++;
1901     }
1902 #endif
1903 
1904 
1905     *nmatches = high - low + 1;
1906     if (*nmatches > 0) {
1907       matches = (int *) CALLOC(*nmatches,sizeof(int));
1908       j = 0;
1909       for (recno = low; recno <= high; recno++) {
1910 #ifdef WORDS_BIGENDIAN
1911 #ifdef DEBUG
1912 	printf("Pushing %d:%d\n",recno,Bigendian_convert_int(this->labelorder[recno]));
1913 #endif
1914 	matches[j++] = Bigendian_convert_int(this->labelorder[recno])+1;
1915 
1916 #else
1917 #ifdef DEBUG
1918 	printf("Pushing %d:%d\n",recno,this->labelorder[recno]);
1919 #endif
1920 	matches[j++] = this->labelorder[recno]+1;
1921 #endif
1922       }
1923     }
1924   }
1925 
1926   return matches;
1927 }
1928 
1929 /* Slow.  Used before binary search method above. */
1930 int
Univ_IIT_find_linear(T this,char * label)1931 Univ_IIT_find_linear (T this, char *label) {
1932   int i;
1933   char *p;
1934 
1935   for (i = 0; i < this->total_nintervals; i++) {
1936 #ifdef WORDS_BIGENDIAN
1937     p = &(this->labels[Bigendian_convert_uint(this->labelpointers[i])]);
1938 #else
1939     p = &(this->labels[this->labelpointers[i]]);
1940 #endif
1941     while (isspace((int) *p)) {
1942       p++;
1943     }
1944     if (!strcmp(label,p)) {
1945       return i + 1;
1946     }
1947   }
1948 
1949   return -1;
1950 }
1951 
1952 /* Returns 1-based index */
1953 int
Univ_IIT_find_one(T this,char * label)1954 Univ_IIT_find_one (T this, char *label) {
1955   int index;
1956   int *matches, nmatches;
1957 
1958   matches = Univ_IIT_find(&nmatches,this,label);
1959   if (nmatches == 0) {
1960     /*
1961     fprintf(stderr,"Expected one match for %s, but got 0\n",
1962 	    label);
1963     */
1964     index = -1;
1965   } else {
1966     if (nmatches > 1) {
1967       fprintf(stderr,"Expected one match for %s, but got %d\n",
1968 	      label,nmatches);
1969     }
1970     index = matches[0];
1971     FREE(matches);
1972   }
1973 
1974   return index;
1975 }
1976 
1977 
1978 /************************************************************************/
1979 
1980 
1981 static const Except_T iit_error = { "IIT problem" };
1982 
1983 static int
int_compare(const void * a,const void * b)1984 int_compare (const void *a, const void *b) {
1985   int x = * (int *) a;
1986   int y = * (int *) b;
1987 
1988   if (x < y) {
1989     return -1;
1990   } else if (y < x) {
1991     return +1;
1992   } else {
1993     return 0;
1994   }
1995 }
1996 
1997 int *
Univ_IIT_get(int * nmatches,T this,Univcoord_T x,Univcoord_T y)1998 Univ_IIT_get (int *nmatches, T this, Univcoord_T x, Univcoord_T y) {
1999   /* int *sorted; */
2000   int *matches = NULL, *uniq, neval, nuniq, i;
2001   int lambda, prev;
2002   int min1, max1 = 0, min2, max2 = 0;
2003   int nintervals;
2004 
2005   if ((nintervals = this->total_nintervals) == 0) {
2006     *nmatches = 0;
2007     return (int *) NULL;
2008   } else {
2009     min1 = min2 = nintervals + 1;
2010   }
2011 
2012   debug(printf("Entering Univ_IIT_get with query %u %u\n",x,y));
2013   fnode_query_aux(&min1,&max1,this,0,x);
2014   fnode_query_aux(&min2,&max2,this,0,y);
2015   debug(printf("min1=%d max1=%d  min2=%d max2=%d\n",min1,max1,min2,max2));
2016 
2017   *nmatches = 0;
2018   if (max2 >= min1) {
2019     neval = (max2 - min1 + 1) + (max2 - min1 + 1);
2020     matches = (int *) CALLOC(neval,sizeof(int));
2021     uniq = (int *) CALLOC(neval,sizeof(int));
2022 
2023     i = 0;
2024     for (lambda = min1; lambda <= max2; lambda++) {
2025       matches[i++] = this->sigmas[lambda];
2026       matches[i++] = this->omegas[lambda];
2027     }
2028 
2029     /* Eliminate duplicates */
2030     qsort(matches,neval,sizeof(int),int_compare);
2031     nuniq = 0;
2032     prev = 0;
2033     debug(printf("unique segments in lambda %d to %d:",min1,max2));
2034     for (i = 0; i < neval; i++) {
2035       if (matches[i] != prev) {
2036 	debug(printf(" %d",matches[i]));
2037 	uniq[nuniq++] = matches[i];
2038 	prev = matches[i];
2039       }
2040     }
2041     debug(printf("\n"));
2042 
2043     for (i = 0; i < nuniq; i++) {
2044       if (Univinterval_overlap_p(x,y,this->intervals,uniq[i]) == true) {
2045 	matches[(*nmatches)++] = uniq[i];
2046 	debug(printf("Pushing overlapping segment %d (%u..%u)\n",uniq[i],
2047 		     Univinterval_low(&(this->intervals[uniq[i]-1])),
2048 		     Univinterval_high(&(this->intervals[uniq[i]-1]))));
2049       } else {
2050 	debug(printf("Not pushing non-overlapping segment %d (%u..%u)\n",uniq[i],
2051 		     Univinterval_low(&(this->intervals[uniq[i]-1])),
2052 		     Univinterval_high(&(this->intervals[uniq[i]-1]))));
2053       }
2054     }
2055 
2056     FREE(uniq);
2057   }
2058 
2059   return matches;
2060 
2061 #if 0
2062   /* For some reason, sort_matches_by_position is annotated as being for version 3 and later */
2063   if (sortp == false) {
2064     return matches;
2065   } else if (this->version <= 2) {
2066     sorted = sort_matches_by_type(this,matches,*nmatches,/*alphabetizep*/true);
2067     FREE(matches);
2068     return sorted;
2069   } else {
2070     sorted = sort_matches_by_position(this,matches,*nmatches);
2071     FREE(matches);
2072     return sorted;
2073   }
2074 #endif
2075 }
2076 
2077 
2078 /* Guaranteed to return one result, even if coordinate is out of bounds */
2079 int
Univ_IIT_get_one(T this,Univcoord_T x,Univcoord_T y)2080 Univ_IIT_get_one (T this, Univcoord_T x, Univcoord_T y) {
2081   int lambda;
2082   int min1, max1 = 0, min2, max2 = 0;
2083   bool stopp;
2084   Univinterval_T interval;
2085 
2086   min1 = min2 = this->total_nintervals + 1;
2087 
2088   debug(printf("Entering Univ_IIT_get_one with query %llu %llu\n",(unsigned long long) x,(unsigned long long) y));
2089   fnode_query_aux(&min1,&max1,this,0,x);
2090   fnode_query_aux(&min2,&max2,this,0,y);
2091   debug(printf("min1=%d max1=%d  min2=%d max2=%d\n",min1,max1,min2,max2));
2092 
2093   if (max2 >= min1) {
2094     for (lambda = min1; lambda <= max2; lambda++) {
2095       if (Univinterval_overlap_p(x,y,this->intervals,this->sigmas[lambda]) == true) {
2096 	return this->sigmas[lambda];
2097       }
2098     }
2099     for (lambda = min1; lambda <= max2; lambda++) {
2100       if (Univinterval_overlap_p(x,y,this->intervals,this->omegas[lambda]) == true) {
2101 	return this->omegas[lambda];
2102       }
2103     }
2104   }
2105 
2106   /* fprintf(stderr,"Expected one match for %u--%u, but got none\n",x,y); */
2107   /* If we miss (e.g., for circular chromosome), then report the chromosome below */
2108   /* Look at betas or omegas for left flank */
2109   lambda = min1 - 1;
2110   stopp = false;
2111   while (lambda >= 1 && stopp == false) {
2112     interval = &(this->intervals[this->omegas[lambda]-1]);
2113     if (Univinterval_high(interval) >= x) {
2114       lambda--;
2115     } else {
2116       return this->omegas[lambda];
2117     }
2118   }
2119 
2120   return this->omegas[/*lambda*/1];
2121 }
2122 
2123 
2124 /* Assumes we have a running value for *chrhigh and that successive calls have increasing values for left */
2125 Univcoord_T
Univ_IIT_update_chrnum(int * chrnum,Univcoord_T * chroffset,Univcoord_T chrhigh,Chrpos_T * chrlength,T this,Univcoord_T low,Univcoord_T high,int circular_typeint)2126 Univ_IIT_update_chrnum (int *chrnum, Univcoord_T *chroffset, Univcoord_T chrhigh, Chrpos_T *chrlength,
2127 			T this, Univcoord_T low, Univcoord_T high, int circular_typeint) {
2128   int start_chrnum, end_chrnum, test_chrnum;
2129   Univcoord_T test_chroffset, test_chrhigh;
2130   Chrpos_T most_inbounds, inbounds, test_chrlength;
2131 
2132 #ifdef FAST_CHR_UPDATE
2133   /* Need to call Univ_IIT_intervals_setup at start of program */
2134   Univcoord_T goal;
2135   int nchromosomes_local = nchromosomes;
2136   Univcoord_T *chrhighs_local = chrhighs;
2137   int j, testj, endj;
2138 #endif
2139 
2140   if (low > chrhigh) {
2141     debug4(printf("\nUpdating chrhigh because low %u > chrhigh %u\n",low,chrhigh));
2142 
2143 #ifndef FAST_CHR_UPDATE
2144     *chrnum = Univ_IIT_get_one(this,low,low);
2145     Univ_IIT_interval_bounds(&(*chroffset),&chrhigh,&(*chrlength),this,*chrnum,circular_typeint);
2146 
2147 #else
2148     /* Code from segment-search.c */
2149     j = 1;
2150     goal = low + 1;
2151     while (j < nchromosomes_local && chrhighs_local[j] < goal) {
2152       j <<= 1;			/* gallop by 2 */
2153     }
2154     if (j >= nchromosomes_local) {
2155       j = binary_search(j >> 1,nchromosomes_local,chrhighs_local,goal);
2156     } else {
2157       j = binary_search(j >> 1,j,chrhighs_local,goal);
2158     }
2159     chrnum += j;
2160 
2161     chrhigh = chrhighs[chrnum-1];
2162     chroffset = chroffsets[chrnum-1];
2163     chrlength = chrlengths[chrnum-1];
2164     chrhighs_local += j;
2165     nchromosomes_local -= j;
2166     debug4(printf("Got chrnum %d, chroffset %u, chrhigh %u\n",chrnum,chroffset,chrhigh));
2167 #endif
2168 
2169     debug4(printf("Updating chrnum to be %d\n",*chrnum));
2170   }
2171 
2172   if (high > chrhigh) {
2173     /* Straddles two or more chromosomes */
2174     /* Test first chromosome */
2175     start_chrnum = *chrnum;
2176     most_inbounds = chrhigh - low;
2177     debug4(printf("Straddle.  First chromosome inbounds: %u.  ",most_inbounds));
2178 
2179 
2180     /* Test middle chromosomes */
2181 #ifndef FAST_CHR_UPDATE
2182     end_chrnum = Univ_IIT_get_one(this,high-1U,high-1U);
2183 
2184     for (test_chrnum = start_chrnum + 1; test_chrnum < end_chrnum; test_chrnum++) {
2185       Univ_IIT_interval_bounds(&test_chroffset,&test_chrhigh,&test_chrlength,this,test_chrnum,circular_typeint);
2186       debug4(printf("Next inbounds: %u.  ",test_chrhigh - test_chroffset));
2187       if ((inbounds = test_chrhigh - test_chroffset) > most_inbounds) {
2188 	*chrnum = test_chrnum; *chroffset = test_chroffset; chrhigh = test_chrhigh, *chrlength = test_chrlength;
2189 	most_inbounds = inbounds;
2190       }
2191     }
2192 
2193 #else
2194     /* Code from segment-search.c */
2195     endj = 0;
2196     while (endj+1 < nchromosomes_local && high > chrhighs_local[endj+1]) {
2197       debug1(printf("For high %u, advancing to chrhigh %u\n",high,chrhighs_local[endj+1]));
2198       endj++;
2199     }
2200     if (endj+1 < nchromosomes_local) {
2201       endj++;
2202     }
2203     debug4(printf("endj is %d\n",endj));
2204 
2205     for (testj = 1; testj < endj; testj++) {
2206       debug1(printf("Next inbounds: %u.  ",chrhighs_local[testj] - chroffsets[(chrnum+testj)-1]));
2207       if ((inbounds = chrhighs_local[testj] - chroffsets[(chrnum+testj)-1]) > most_inbounds) {
2208 	j = testj;
2209 	most_inbounds = inbounds;
2210       }
2211     }
2212 #endif
2213 
2214 
2215     /* Test last chromosome */
2216 #ifndef FAST_CHR_UPDATE
2217     Univ_IIT_interval_bounds(&test_chroffset,&test_chrhigh,&test_chrlength,this,end_chrnum,circular_typeint);
2218     debug4(printf("Last inbounds: %u\n",high - test_chroffset));
2219     if ((/*inbounds = */high - test_chroffset) > most_inbounds) {
2220       *chrnum = end_chrnum; *chroffset = test_chroffset; chrhigh = test_chrhigh, *chrlength = test_chrlength;
2221       /* most_inbounds = inbounds; */
2222     }
2223 
2224 #else
2225     /* Code from segment-search.c */
2226     if ((/*inbounds = */high - chroffsets[(chrnum+endj)-1]) > most_inbounds) {
2227       j = endj;
2228     }
2229 
2230     chrnum += j;
2231     chrhigh = chrhighs[chrnum-1];
2232     chroffset = chroffsets[chrnum-1];
2233     chrlength = chrlengths[chrnum-1];
2234     chrhighs_local += j;
2235     nchromosomes_local -= j;
2236 #endif
2237   }
2238 
2239   debug4(printf("Returning chrnum %d\n",*chrnum));
2240   return chrhigh;
2241 }
2242 
2243 
2244 int
Univ_IIT_get_chrnum(Univcoord_T * chroffset,Univcoord_T * chrhigh,Chrpos_T * chrlength,T this,Univcoord_T low,Univcoord_T high,int circular_typeint)2245 Univ_IIT_get_chrnum (Univcoord_T *chroffset, Univcoord_T *chrhigh, Chrpos_T *chrlength,
2246 		     T this, Univcoord_T low, Univcoord_T high, int circular_typeint) {
2247   int chrnum, start_chrnum, end_chrnum, test_chrnum;
2248   Univcoord_T test_chroffset, test_chrhigh;
2249   Chrpos_T most_inbounds, inbounds, test_chrlength;
2250 
2251   assert(low < high);
2252 
2253   chrnum = Univ_IIT_get_one(this,low,low);
2254   Univ_IIT_interval_bounds(&(*chroffset),&(*chrhigh),&(*chrlength),this,chrnum,circular_typeint);
2255   debug4(printf("Setting chrnum to be %d\n",chrnum));
2256 
2257   if (high > *chrhigh) {
2258     /* Straddles two or more chromosomes */
2259     /* Test first chromosome */
2260     start_chrnum = chrnum;
2261     most_inbounds = (*chrhigh) - low;
2262     debug4(printf("Straddle.  First chromosome inbounds: %u.  ",most_inbounds));
2263 
2264 
2265     /* Test middle chromosomes */
2266     end_chrnum = Univ_IIT_get_one(this,high-1U,high-1U);
2267     for (test_chrnum = start_chrnum + 1; test_chrnum < end_chrnum; test_chrnum++) {
2268       Univ_IIT_interval_bounds(&test_chroffset,&test_chrhigh,&test_chrlength,this,test_chrnum,circular_typeint);
2269       debug4(printf("Next inbounds: %u.  ",test_chrhigh - test_chroffset));
2270       if ((inbounds = test_chrhigh - test_chroffset) > most_inbounds) {
2271 	chrnum = test_chrnum; *chroffset = test_chroffset; *chrhigh = test_chrhigh, *chrlength = test_chrlength;
2272 	most_inbounds = inbounds;
2273       }
2274     }
2275 
2276 
2277     /* Test last chromosome */
2278     Univ_IIT_interval_bounds(&test_chroffset,&test_chrhigh,&test_chrlength,this,end_chrnum,circular_typeint);
2279     debug4(printf("Last inbounds: %u\n",high - test_chroffset));
2280     if ((inbounds = high - test_chroffset) > most_inbounds) {
2281       chrnum = end_chrnum; *chroffset = test_chroffset; *chrhigh = test_chrhigh, *chrlength = test_chrlength;
2282       /* most_inbounds = inbounds; */
2283     }
2284   }
2285 
2286   debug4(printf("Returning chrnum %d\n",chrnum));
2287   return chrnum;
2288 }
2289 
2290 
2291 int
Univ_IIT_get_trnum(Trcoord_T * troffset,Trcoord_T * trhigh,Trcoord_T * trlength,T this,Trcoord_T low,Trcoord_T high)2292 Univ_IIT_get_trnum (Trcoord_T *troffset, Trcoord_T *trhigh, Trcoord_T *trlength,
2293 		    T this, Trcoord_T low, Trcoord_T high) {
2294   int trnum, start_trnum, end_trnum, test_trnum;
2295   Trcoord_T test_troffset, test_trhigh;
2296   Trcoord_T most_inbounds, inbounds, test_trlength;
2297 
2298   assert(low < high);
2299 
2300   trnum = Univ_IIT_get_one(this,low,low);
2301   Univ_IIT_interval_bounds_small(&(*troffset),&(*trhigh),&(*trlength),this,trnum);
2302   debug4(printf("Setting trnum to be %d\n",trnum));
2303 
2304   if (high > *trhigh) {
2305     /* Straddles two or more transcripts */
2306     /* Test first transcript */
2307     start_trnum = trnum;
2308     most_inbounds = (*trhigh) - low;
2309     debug4(printf("Straddle.  First transcript inbounds: %u.  ",most_inbounds));
2310 
2311 
2312     /* Test middle transcripts */
2313     end_trnum = Univ_IIT_get_one(this,high-1U,high-1U);
2314     for (test_trnum = start_trnum + 1; test_trnum < end_trnum; test_trnum++) {
2315       Univ_IIT_interval_bounds_small(&test_troffset,&test_trhigh,&test_trlength,this,test_trnum);
2316       debug4(printf("Next inbounds: %u.  ",test_trhigh - test_troffset));
2317       if ((inbounds = test_trhigh - test_troffset) > most_inbounds) {
2318 	trnum = test_trnum; *troffset = test_troffset; *trhigh = test_trhigh, *trlength = test_trlength;
2319 	most_inbounds = inbounds;
2320       }
2321     }
2322 
2323     /* Test last transcript */
2324     Univ_IIT_interval_bounds_small(&test_troffset,&test_trhigh,&test_trlength,this,end_trnum);
2325     debug4(printf("Last inbounds: %u\n",high - test_troffset));
2326     if ((inbounds = high - test_troffset) > most_inbounds) {
2327       trnum = end_trnum; *troffset = test_troffset; *trhigh = test_trhigh, *trlength = test_trlength;
2328       /* most_inbounds = inbounds; */
2329     }
2330   }
2331 
2332   debug4(printf("Returning trnum %d\n",trnum));
2333   return trnum;
2334 }
2335 
2336 
2337 /* Generally called where intervals don't overlap, like chromosomes,
2338    and where x == y. */
2339 /*
2340 int
2341 Univ_IIT_get_one_safe (T this, Univcoord_T x, Univcoord_T y) {
2342   int index;
2343   int *matches, nmatches;
2344 
2345   matches = Univ_IIT_get(&nmatches,this,x,y,sortp);
2346   if (nmatches != 1) {
2347     fprintf(stderr,"Expected one match for %u--%u, but got %d\n",
2348 	    x,y,nmatches);
2349     abort();
2350   }
2351   index = matches[0];
2352   FREE(matches);
2353   return index;
2354 }
2355 */
2356 
2357 
2358 /* Note: Procedure call from get-genome.c needed to subtract 1 from
2359    position and then add 1 to chrpos */
2360 char *
Univ_IIT_string_from_position(Chrpos_T * chrpos,Univcoord_T position,T chromosome_iit)2361 Univ_IIT_string_from_position (Chrpos_T *chrpos, Univcoord_T position, T chromosome_iit) {
2362   char *string, *chrstring;
2363   int index;
2364   bool allocp;
2365 
2366   index = Univ_IIT_get_one(chromosome_iit,position,position);
2367   *chrpos = position - Univinterval_low(Univ_IIT_interval(chromosome_iit,index));
2368   chrstring = Univ_IIT_label(chromosome_iit,index,&allocp);
2369   if (allocp == true) {
2370     return chrstring;
2371   } else {
2372     string = (char *) CALLOC(strlen(chrstring)+1,sizeof(char));
2373     strcpy(string,chrstring);
2374     return string;
2375   }
2376 }
2377 
2378 
2379