1 static char rcsid[] = "$Id: iit-read-univ.c 222811 2020-06-03 22:02:06Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "iit-read-univ.h"
7 #include "parserange.h"
8
9 #ifdef WORDS_BIGENDIAN
10 #include "bigendian.h"
11 #else
12 #include "littleendian.h"
13 #endif
14
15 #include <stdlib.h> /* For qsort */
16 #include <string.h> /* For memset */
17 #include <strings.h>
18 #include <ctype.h> /* For isspace */
19 #ifdef HAVE_UNISTD_H
20 #include <unistd.h> /* For mmap on Linux */
21 #endif
22 #ifdef HAVE_SYS_TYPES_H
23 #include <sys/types.h> /* For open, fstat, and mmap */
24 #endif
25 /* Not sure why this was included
26 #include <sys/param.h>
27 */
28 #ifdef HAVE_FCNTL_H
29 #include <fcntl.h> /* For open */
30 #endif
31 #ifdef HAVE_SYS_STAT_H
32 #include <sys/stat.h> /* For open and fstat */
33 #endif
34 #include <sys/mman.h> /* For mmap and madvise */
35 #include <math.h> /* For qsort */
36 #include <errno.h> /* For perror */
37 #include "assert.h"
38 #include "except.h"
39 #include "mem.h"
40 #include "access.h"
41 #include "fopen.h"
42 #include "uintlist.h"
43 #include "intlist.h"
44
45
46 /* Note: if sizeof(int) or sizeof(unsigned int) are not 4, then the below code is faulty */
47
48
49 /* Integer interval tree. */
50
51 /*
52 * n intervals;
53 * specified by their indices e[1..n]
54 * and endpoint-access function:
55 * low (e[i])
56 * high (e[i])
57 * is_contained (x, e[i])
58 * eg:
59 * interval e[i] ... "[" low (e[i]) "," high (e[i]) ")"
60 * is_contained (x, e[i]) ... ( (low (e[i]) <= x
61 * and (x < high (e[i]))
62 */
63
64 /*--------------------------------------------------------------------------*/
65
66 #ifdef DEBUG
67 #define debug(x) x
68 #else
69 #define debug(x)
70 #endif
71
72 /* Timing */
73 #ifdef DEBUG1
74 #define debug1(x) x
75 #else
76 #define debug1(x)
77 #endif
78
79 /* Flanking */
80 #ifdef DEBUG2
81 #define debug2(x) x
82 #else
83 #define debug2(x)
84 #endif
85
86 /* Binary search */
87 #ifdef DEBUG3
88 #define debug3(x) x
89 #else
90 #define debug3(x)
91 #endif
92
93 /* Straddle */
94 #ifdef DEBUG4
95 #define debug4(x) x
96 #else
97 #define debug4(x)
98 #endif
99
100
101 /* Cannot use version in iitdef.h, because that uses Chrpos_T for the
102 value, and utility functions need to have Univcoord_T (or UINT8 if
103 available). */
104 typedef struct Univ_FNode_T *Univ_FNode_T;
105 struct Univ_FNode_T {
106 Univ_IIT_coord_T value;
107 int a;
108 int b;
109 int leftindex;
110 int rightindex;
111 };
112
113 #define T Univ_IIT_T
114 struct T {
115 bool coord_values_8p;
116
117 int fd;
118 Access_T access; /* access type */
119
120 #ifdef HAVE_PTHREAD
121 pthread_mutex_t read_mutex;
122 #endif
123
124 int ntypes; /* Always >= 1 */
125
126 int total_nintervals;
127 int nnodes;
128
129 int *sigmas; /* Ordering for IIT */
130 int *omegas; /* Ordering for IIT */
131
132 struct Univ_FNode_T *nodes;
133 struct Univinterval_T *intervals;
134
135 UINT4 *typepointers;
136 char *typestrings;
137
138 size_t labelorder_offset;
139 size_t labelorder_length; /* mmap length (mmap uses size_t, not off_t) */
140 char *labelorder_mmap;
141
142 size_t labelpointers_offset;
143 size_t labelpointers_length; /* mmap length (mmap uses size_t, not off_t) */
144 char *labelpointers_mmap;
145
146 size_t label_offset;
147 size_t label_length; /* mmap length (mmap uses size_t, not off_t) */
148 char *label_mmap;
149
150 size_t annotpointers_offset;
151 size_t annotpointers_length; /* mmap length (mmap uses size_t, not off_t) */
152 char *annotpointers_mmap;
153
154 size_t annot_offset;
155 size_t annot_length; /* mmap length (mmap uses size_t, not off_t) */
156 char *annot_mmap;
157
158 int *labelorder;
159 UINT4 *labelpointers;
160 char *labels;
161
162 UINT4 *annotpointers;
163 char *annotations;
164
165 void **datapointers;
166 };
167
168
169
170 static void
file_move_absolute(int fd,size_t offset,size_t objsize,Univcoord_T n)171 file_move_absolute (int fd, size_t offset, size_t objsize, Univcoord_T n) {
172 off_t position = offset + n*objsize;
173
174 if (lseek(fd,position,SEEK_SET) < 0) {
175 perror("Error in gmap, file_move_label");
176 exit(9);
177 }
178 return;
179 }
180
181
182 bool
Univ_IIT_coord_values_8p(T this)183 Univ_IIT_coord_values_8p (T this) {
184 return this->coord_values_8p;
185 }
186
187 int
Univ_IIT_total_nintervals(T this)188 Univ_IIT_total_nintervals (T this) {
189 return this->total_nintervals;
190 }
191
192 int
Univ_IIT_ntypes(T this)193 Univ_IIT_ntypes (T this) {
194 return this->ntypes;
195 }
196
197 Univcoord_T
Univ_IIT_length(T this,int index)198 Univ_IIT_length (T this, int index) {
199 Univinterval_T interval;
200
201 interval = &(this->intervals[index-1]);
202 return Univinterval_length(interval);
203 }
204
205
206 /* Assumes intervals are stored using universal coordinates */
207 Univcoord_T
Univ_IIT_genomelength(T chromosome_iit,bool with_circular_alias_p)208 Univ_IIT_genomelength (T chromosome_iit, bool with_circular_alias_p) {
209 Univcoord_T max = 0U, this_max;
210 Univinterval_T interval;
211 int i;
212 int circular_typeint;
213
214 circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
215
216 for (i = 0; i < chromosome_iit->total_nintervals; i++) {
217 interval = &(chromosome_iit->intervals[i]);
218 if (with_circular_alias_p == true && Univinterval_type(interval) == circular_typeint) {
219 this_max = Univinterval_high(interval) + Univinterval_length(interval);
220 } else {
221 this_max = Univinterval_high(interval);
222 }
223 if (this_max > max) {
224 max = this_max;
225 }
226 }
227
228 /* Convert from zero-based coordinate */
229 return max+1U;
230 }
231
232
233 bool *
Univ_IIT_circularp(bool * any_circular_p,T chromosome_iit)234 Univ_IIT_circularp (bool *any_circular_p, T chromosome_iit) {
235 bool *circularp;
236 Univinterval_T interval;
237 int chrnum, nchromosomes;
238 int circular_typeint;
239
240 nchromosomes = chromosome_iit->total_nintervals;
241 circularp = (bool *) CALLOC(nchromosomes+1,sizeof(bool));
242
243 *any_circular_p = false;
244 circularp[0] = false; /* chrnum of 0 indicates translocation */
245 if ((circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular")) >= 0) {
246 for (chrnum = 0; chrnum < nchromosomes; chrnum++) {
247 interval = &(chromosome_iit->intervals[chrnum]);
248 if (Univinterval_type(interval) == circular_typeint) {
249 *any_circular_p = true;
250 circularp[chrnum+1] = true;
251 }
252 }
253 }
254
255 return circularp;
256 }
257
258
259 bool *
Univ_IIT_altlocp(Univcoord_T ** alias_starts,Univcoord_T ** alias_ends,T chromosome_iit,T altscaffold_iit)260 Univ_IIT_altlocp (Univcoord_T **alias_starts, Univcoord_T **alias_ends, T chromosome_iit, T altscaffold_iit) {
261 bool *altlocp, alloc_header_p;
262 char *chr, *primary_chr, *restofheader, *p;
263 int chri, nchromosomes, alti, primary_chrnum;
264 bool revcomp;
265 Univcoord_T genomicstart, chroffset;
266 Chrpos_T genomiclength, chrstart, chrend, chrlength;
267 UINT4 start;
268 int nprimary = 0, naltloc = 0;
269 int circular_typeint;
270 Univinterval_T interval;
271
272
273 nchromosomes = chromosome_iit->total_nintervals;
274 altlocp = (bool *) CALLOC(nchromosomes+1,sizeof(bool));
275 *alias_starts = (Univcoord_T *) CALLOC(nchromosomes+1,sizeof(Univcoord_T));
276 *alias_ends = (Univcoord_T *) CALLOC(nchromosomes+1,sizeof(Univcoord_T));
277
278 altlocp[0] = false; /* chrnum of 0 indicates translocation */
279 circular_typeint = Univ_IIT_typeint(chromosome_iit,"circular");
280
281 /* Can go through all chromosomes in chromosome_iit, since it contains both primary and alternate loci */
282 for (chri = 0; chri < nchromosomes; chri++) {
283 #ifdef WORDS_BIGENDIAN
284 /* chromosome_iit should be version 1 */
285 start = Bigendian_convert_uint(chromosome_iit->labelpointers[chri]);
286 #else
287 start = chromosome_iit->labelpointers[chri];
288 #endif
289 chr = &(chromosome_iit->labels[start]);
290
291 if ((alti = Univ_IIT_find_one(altscaffold_iit,chr)) >= 0) {
292 /* Don't need to assign result, since annot is never allocated */
293 Univ_IIT_annotation(&restofheader,altscaffold_iit,alti/*+1*/,&alloc_header_p);
294 p = restofheader;
295 while (isspace((int) *p)) {
296 p++;
297 }
298
299 if (naltloc > 10) {
300 /* Don't print */
301 } else if (naltloc == 10) {
302 fprintf(stderr,"More than 10 alternate loci. Will stop printing.\n");
303 } else {
304 fprintf(stderr,"Alternate locus %s maps to primary region %s\n",chr,p);
305 }
306
307 Parserange_universal_iit(&primary_chr,&revcomp,&genomicstart,&genomiclength,
308 &chrstart,&chrend,&chroffset,&chrlength,
309 p,chromosome_iit,/*contig_iit*/NULL);
310 /* fprintf(stderr," => %s %u..%u\n",primary_chr,genomicstart,genomicstart+genomiclength); */
311
312 if (circular_typeint < 0) {
313 altlocp[chri+1] = true;
314 (*alias_starts)[chri+1] = genomicstart;
315 (*alias_ends)[chri+1] = genomicstart + genomiclength;
316 naltloc++;
317
318 } else {
319 primary_chrnum = Univ_IIT_find_one(chromosome_iit,primary_chr);
320 interval = &(chromosome_iit->intervals[primary_chrnum-1]);
321 if (Univinterval_type(interval) != circular_typeint) {
322 altlocp[chri+1] = true;
323 (*alias_starts)[chri+1] = genomicstart;
324 (*alias_ends)[chri+1] = genomicstart + genomiclength;
325 naltloc++;
326
327 } else {
328 fprintf(stderr,"Alternate locus %s maps to region %s, which is circular and not allowed. Ignoring.\n",chr,p);
329 nprimary++;
330 }
331 }
332
333 if (alloc_header_p) {
334 FREE(restofheader);
335 }
336
337 FREE(primary_chr);
338
339 } else {
340 /* printf("Chromosome %s is not an alt locus\n",chr); */
341 nprimary++;
342 }
343 }
344
345 fprintf(stderr,"Found %d alternate loci, and %d primary contigs (not mapping to another contig)\n",
346 naltloc,nprimary);
347
348 return altlocp;
349 }
350
351
352 Univinterval_T
Univ_IIT_interval(T this,int index)353 Univ_IIT_interval (T this, int index) {
354 assert(index <= this->total_nintervals);
355 return &(this->intervals[index-1]); /* Convert to 0-based */
356 }
357
358 Univcoord_T
Univ_IIT_interval_low(T this,int index)359 Univ_IIT_interval_low (T this, int index) {
360 Univinterval_T interval;
361
362 assert(index <= this->total_nintervals);
363 interval = &(this->intervals[index-1]);
364 return Univinterval_low(interval);
365 }
366
367 Univcoord_T
Univ_IIT_interval_high(T this,int index)368 Univ_IIT_interval_high (T this, int index) {
369 Univinterval_T interval;
370
371 assert(index <= this->total_nintervals);
372 interval = &(this->intervals[index-1]);
373 return Univinterval_high(interval);
374 }
375
376 Univcoord_T
Univ_IIT_interval_length(T this,int index)377 Univ_IIT_interval_length (T this, int index) {
378 Univinterval_T interval;
379
380 assert(index <= this->total_nintervals);
381 interval = &(this->intervals[index-1]);
382 return Univinterval_length(interval);
383 }
384
385 int
Univ_IIT_interval_type(T this,int index)386 Univ_IIT_interval_type (T this, int index) {
387 Univinterval_T interval;
388
389 assert(index <= this->total_nintervals);
390 interval = &(this->intervals[index-1]);
391 return Univinterval_type(interval);
392 }
393
394
395 Univcoord_T
Univ_IIT_next_chrbound(T this,int index,int circular_typeint)396 Univ_IIT_next_chrbound (T this, int index, int circular_typeint) {
397 Univinterval_T interval;
398
399 assert(index <= this->total_nintervals);
400 interval = &(this->intervals[index-1]);
401 if (Univinterval_type(interval) == circular_typeint) {
402 return Univinterval_high(interval) + Univinterval_length(interval);
403 } else {
404 return Univinterval_high(interval);
405 }
406 }
407
408 /* chrhigh is one past the highest position in the chromosome */
409 void
Univ_IIT_interval_bounds(Univcoord_T * low,Univcoord_T * high,Chrpos_T * length,T this,int index,int circular_typeint)410 Univ_IIT_interval_bounds (Univcoord_T *low, Univcoord_T *high, Chrpos_T *length, T this,
411 int index, int circular_typeint) {
412 Univinterval_T interval;
413
414 assert(index > 0);
415 assert(index <= this->total_nintervals);
416
417 interval = &(this->intervals[index-1]);
418 *low = Univinterval_low(interval);
419 *length = Univinterval_length(interval);
420 if (Univinterval_type(interval) == circular_typeint) {
421 *high = Univinterval_high(interval) + 1 + (*length);
422 } else {
423 *high = Univinterval_high(interval) + 1;
424 }
425 return;
426 }
427
428
429 static void
Univ_IIT_interval_bounds_small(Trcoord_T * low,Trcoord_T * high,Chrpos_T * length,T this,int index)430 Univ_IIT_interval_bounds_small (Trcoord_T *low, Trcoord_T *high, Chrpos_T *length, T this,
431 int index) {
432 Univinterval_T interval;
433
434 assert(index > 0);
435 assert(index <= this->total_nintervals);
436
437 interval = &(this->intervals[index-1]);
438 *low = Univinterval_low(interval);
439 *length = Univinterval_length(interval);
440 *high = Univinterval_high(interval) + 1;
441 return;
442 }
443
444
445 void
Univ_IIT_intervals_setup(Univcoord_T ** chroffsets,Univcoord_T ** chrhighs,Chrpos_T ** chrlengths,T this,int nchromosomes,int circular_typeint)446 Univ_IIT_intervals_setup (Univcoord_T **chroffsets, Univcoord_T **chrhighs, Chrpos_T **chrlengths,
447 T this, int nchromosomes, int circular_typeint) {
448 Univinterval_T interval;
449 Univcoord_T *chroffsets_ptr, *chrhighs_ptr;
450 Chrpos_T *chrlengths_ptr, length;
451 int index;
452
453 chroffsets_ptr = *chroffsets = (Univcoord_T *) CALLOC(nchromosomes,sizeof(Univcoord_T));
454 chrhighs_ptr = *chrhighs = (Univcoord_T *) CALLOC(nchromosomes,sizeof(Univcoord_T));
455 chrlengths_ptr = *chrlengths = (Chrpos_T *) CALLOC(nchromosomes,sizeof(Chrpos_T));
456
457 for (index = 1; index <= nchromosomes; index++) {
458 interval = &(this->intervals[index-1]);
459 *chroffsets_ptr++ = Univinterval_low(interval);
460 length = *chrlengths_ptr++ = Univinterval_length(interval);
461 if (Univinterval_type(interval) == circular_typeint) {
462 *chrhighs_ptr++ = Univinterval_high(interval) + 1 + length;
463 } else {
464 *chrhighs_ptr++ = Univinterval_high(interval) + 1;
465 }
466 }
467
468 return;
469 }
470
471
472 /* divint_crosstable maps from chromosome_iit chrnums to iit divints */
473 int *
Univ_IIT_divint_crosstable(T chromosome_iit,IIT_T iit)474 Univ_IIT_divint_crosstable (T chromosome_iit, IIT_T iit) {
475 int *crosstable;
476 int chrnum, nchromosomes;
477 char *chr;
478 UINT4 start;
479
480 nchromosomes = chromosome_iit->total_nintervals;
481 crosstable = (int *) CALLOC(nchromosomes+1,sizeof(int));
482
483 for (chrnum = 0; chrnum < nchromosomes; chrnum++) {
484 #ifdef WORDS_BIGENDIAN
485 /* chromosome_iit should be version 1 */
486 start = Bigendian_convert_uint(chromosome_iit->labelpointers[chrnum]);
487 #else
488 start = chromosome_iit->labelpointers[chrnum];
489 #endif
490 chr = &(chromosome_iit->labels[start]);
491
492 /* upon lookup, chrnum from Univ_IIT_get_one(chromosome_iit)
493 is 1-based, so we need to store in chrnum+1 */
494 crosstable[chrnum+1] = IIT_divint(iit,chr);
495 #if 0
496 if (crosstable[chrnum+1] < 0) {
497 fprintf(stderr,"Note: No splicesites are provided in chr %s\n",chr);
498 } else {
499 fprintf(stderr,"chrnum %d maps to splicesite divint %d\n",chrnum,crosstable[chrnum]);
500 }
501 #endif
502 }
503
504 #if 0
505 for (chrnum = 0; chrnum < nchromosomes; chrnum++) {
506 printf("Genomic chrnum %d => IIT chrnum %d\n",chrnum+1,crosstable[chrnum+1]);
507 }
508 #endif
509
510 return crosstable;
511 }
512
513
514 /* chrnum_crosstable maps from iit divints to chromosome_iit chrnums */
515 int *
Univ_IIT_chrnum_crosstable(T chromosome_iit,IIT_T iit)516 Univ_IIT_chrnum_crosstable (T chromosome_iit, IIT_T iit) {
517 int *crosstable;
518 int divint, ndivs;
519 char *chr;
520
521 ndivs = IIT_ndivs(iit);
522 crosstable = (int *) CALLOC(ndivs+1,sizeof(int));
523
524 for (divint = 1; divint < ndivs; divint++) {
525 chr = IIT_divstring(iit,divint);
526
527 /* upon lookup, chrnum is 1-based, so we need to store in chrnum+1 */
528 crosstable[divint] = Univ_IIT_find_one(chromosome_iit,chr);
529 }
530
531 #if 0
532 for (divint = 0; divint < ndivs; divint++) {
533 printf("IIT divint %d => IIT chrnum %d\n",divint+1,crosstable[divint+1]);
534 }
535 #endif
536
537 return crosstable;
538 }
539
540
541
542 /* The iit file has a '\0' after each string, so functions know where
543 it ends */
544 char *
Univ_IIT_typestring(T this,int type)545 Univ_IIT_typestring (T this, int type) {
546 UINT4 start;
547
548 start = this->typepointers[type];
549 return &(this->typestrings[start]);
550 }
551
552 int
Univ_IIT_typeint(T this,char * typestring)553 Univ_IIT_typeint (T this, char *typestring) {
554 int i = 0;
555 UINT4 start;
556
557 while (i < this->ntypes) {
558 start = this->typepointers[i];
559 if (!strcmp(typestring,&(this->typestrings[start]))) {
560 return i;
561 }
562 i++;
563 }
564
565 return -1;
566 }
567
568 char *
Univ_IIT_label(T this,int index,bool * allocp)569 Univ_IIT_label (T this, int index, bool *allocp) {
570 int recno;
571 UINT4 start;
572
573 recno = index - 1; /* Convert to 0-based */
574
575 #ifdef WORDS_BIGENDIAN
576 start = Bigendian_convert_uint(this->labelpointers[recno]);
577 #else
578 start = this->labelpointers[recno];
579 #endif
580 *allocp = false;
581 return &(this->labels[start]);
582 }
583
584
585 static char EMPTY_STRING[1] = {'\0'};
586
587 /* The iit file has a '\0' after each string, so functions know where
588 it ends */
589 /* Note: annotation itself is never allocated */
590 char *
Univ_IIT_annotation(char ** restofheader,T this,int index,bool * alloc_header_p)591 Univ_IIT_annotation (char **restofheader, T this, int index, bool *alloc_header_p) {
592 int recno;
593 UINT4 start;
594 char *annotation, *p;
595 int len;
596
597 recno = index - 1; /* Convert to 0-based */
598 #ifdef WORDS_BIGENDIAN
599 start = Bigendian_convert_uint(this->annotpointers[recno]);
600 #else
601 start = this->annotpointers[recno];
602 #endif
603
604 annotation = &(this->annotations[start]);
605 if (annotation[0] == '\0') {
606 *restofheader = annotation; /* Both are empty strings */
607
608 *alloc_header_p = false;
609 return annotation;
610
611 } else if (annotation[0] == '\n') {
612 *restofheader = EMPTY_STRING;
613
614 *alloc_header_p = false;
615 return &(annotation[1]);
616
617 } else {
618 p = annotation;
619 while (*p != '\0' && *p != '\n') p++;
620 len = (p - annotation)/sizeof(char);
621 *restofheader = (char *) MALLOC((1+len+1)*sizeof(char));
622 *restofheader[0] = ' ';
623 strncpy(&((*restofheader)[1]),annotation,len);
624 (*restofheader)[1+len] = '\0';
625
626 if (*p == '\n') p++;
627
628 *alloc_header_p = true;
629 return p;
630 }
631 }
632
633
634 void
Univ_IIT_dump_typestrings(FILE * fp,T this)635 Univ_IIT_dump_typestrings (FILE *fp, T this) {
636 int type;
637 UINT4 start;
638
639 for (type = 0; type < this->ntypes; type++) {
640 start = this->typepointers[type];
641 fprintf(fp,"%d\t%s\n",type,&(this->typestrings[start]));
642 }
643 return;
644 }
645
646 void
Univ_IIT_dump(T this)647 Univ_IIT_dump (T this) {
648 int index = 0, i;
649 Univinterval_T interval;
650 Univcoord_T startpos, endpos;
651 char *label, *annotation, *restofheader;
652 bool allocp;
653
654 for (i = 0; i < this->total_nintervals; i++) {
655 interval = &(this->intervals[i]);
656 label = Univ_IIT_label(this,index+1,&allocp);
657 printf(">%s",label);
658 if (allocp == true) {
659 FREE(label);
660 }
661 startpos = Univinterval_low(interval);
662 endpos = startpos + Univinterval_length(interval) - 1U;
663
664 printf(" %llu..%llu",(unsigned long long) startpos,(unsigned long long) endpos);
665
666 if (Univinterval_type(interval) > 0) {
667 printf(" %s",Univ_IIT_typestring(this,Univinterval_type(interval)));
668 }
669
670 annotation = Univ_IIT_annotation(&restofheader,this,index+1,&allocp);
671 printf("%s\n",restofheader);
672 printf("%s",annotation);
673 if (allocp) {
674 FREE(restofheader);
675 }
676
677 index++;
678 }
679
680 return;
681 }
682
683 void
Univ_IIT_dump_table(T this,bool zerobasedp)684 Univ_IIT_dump_table (T this, bool zerobasedp) {
685 int index = 0, i;
686 Univinterval_T interval;
687 Univcoord_T startpos, endpos;
688 char *label;
689 bool allocp;
690
691 for (i = 0; i < this->total_nintervals; i++) {
692 interval = &(this->intervals[i]);
693 label = Univ_IIT_label(this,index+1,&allocp);
694 printf("%s\t",label);
695 if (allocp == true) {
696 FREE(label);
697 }
698 startpos = Univinterval_low(interval);
699 endpos = startpos + Univinterval_length(interval) - 1U;
700
701 if (zerobasedp) {
702 printf("%llu..%llu\t",(unsigned long long) startpos,(unsigned long long) endpos);
703 } else {
704 printf("%llu..%llu\t",(unsigned long long) startpos+1,(unsigned long long) endpos+1);
705 }
706
707 printf("%u",Univinterval_length(interval));
708 if (Univinterval_type(interval) > 0) {
709 printf("\t%s",Univ_IIT_typestring(this,Univinterval_type(interval)));
710 }
711 printf("\n");
712
713 index++;
714 }
715
716 return;
717 }
718
719 void
Univ_IIT_dump_fai(T this)720 Univ_IIT_dump_fai (T this) {
721 int index = 0, i;
722 Univinterval_T interval;
723 char *label;
724 bool allocp;
725
726 for (i = 0; i < this->total_nintervals; i++) {
727 interval = &(this->intervals[i]);
728 label = Univ_IIT_label(this,index+1,&allocp);
729 printf("%s %u\n",label,Univinterval_length(interval));
730 if (allocp == true) {
731 FREE(label);
732 }
733 index++;
734 }
735
736 return;
737 }
738
739
740 #ifdef USE_MPI
741 /* For chromosome.iit file, which is stored in version 1 */
742 void
Univ_IIT_dump_sam(MPI_File fp,T this,char * sam_read_group_id,char * sam_read_group_name,char * sam_read_group_library,char * sam_read_group_platform)743 Univ_IIT_dump_sam (MPI_File fp, T this, char *sam_read_group_id, char *sam_read_group_name,
744 char *sam_read_group_library, char *sam_read_group_platform) {
745 int index = 0, i;
746 Univinterval_T interval;
747 Chrpos_T interval_length;
748 char *label, buffer[20];
749 bool allocp;
750 int circular_typeint;
751
752 if (this == NULL) {
753 return;
754 } else {
755 circular_typeint = Univ_IIT_typeint(this,"circular");
756 }
757
758 for (i = 0; i < this->total_nintervals; i++) {
759 interval = &(this->intervals[i]);
760 label = Univ_IIT_label(this,index+1,&allocp);
761 MPI_File_write_shared(fp,"@SQ\tSN:",strlen("@SQ\tSN:"),MPI_CHAR,MPI_STATUS_IGNORE);
762 MPI_File_write_shared(fp,label,strlen(label),MPI_CHAR,MPI_STATUS_IGNORE);
763 if (allocp == true) {
764 FREE(label);
765 }
766 /* startpos = Univinterval_low(interval); */
767 /* endpos = startpos + Univinterval_length(interval) - 1U; */
768
769 interval_length = Univinterval_length(interval);
770 sprintf(buffer,"%u",interval_length);
771 MPI_File_write_shared(fp,"\tLN:%s",strlen("\tLN:")+strlen(buffer),MPI_CHAR,MPI_STATUS_IGNORE);
772 if (Univinterval_type(interval) == circular_typeint) {
773 MPI_File_write_shared(fp,"\ttp:circular",strlen("\ttp:circular"),MPI_CHAR,MPI_STATUS_IGNORE);
774 }
775 MPI_File_write_shared(fp,"\n",1,MPI_CHAR,MPI_STATUS_IGNORE);
776
777 index++;
778 }
779
780 if (sam_read_group_id != NULL) {
781 MPI_File_write_shared(fp,"@RG\tID:",strlen("@RG\tID:"),MPI_CHAR,MPI_STATUS_IGNORE);
782 MPI_File_write_shared(fp,sam_read_group_id,strlen(sam_read_group_id),MPI_CHAR,MPI_STATUS_IGNORE);
783
784 if (sam_read_group_platform != NULL) {
785 MPI_File_write_shared(fp,"\tPL:",strlen("\tPL:"),MPI_CHAR,MPI_STATUS_IGNORE);
786 MPI_File_write_shared(fp,sam_read_group_platform,strlen(sam_read_group_platform),MPI_CHAR,MPI_STATUS_IGNORE);
787 }
788 if (sam_read_group_library != NULL) {
789 MPI_File_write_shared(fp,"\tLB:",strlen("\tLB:"),MPI_CHAR,MPI_STATUS_IGNORE);
790 MPI_File_write_shared(fp,sam_read_group_library,strlen(sam_read_group_library),MPI_CHAR,MPI_STATUS_IGNORE);
791 }
792 MPI_File_write_shared(fp,"\tSM:",strlen("\tSM:"),MPI_CHAR,MPI_STATUS_IGNORE);
793 MPI_File_write_shared(fp,sam_read_group_name,strlen(sam_read_group_name),MPI_CHAR,MPI_STATUS_IGNORE);
794 MPI_File_write_shared(fp,"\n",1,MPI_CHAR,MPI_STATUS_IGNORE);
795 }
796
797 return;
798 }
799
800
801 int
Univ_IIT_reserve_sam(T this,char * sam_read_group_id,char * sam_read_group_name,char * sam_read_group_library,char * sam_read_group_platform)802 Univ_IIT_reserve_sam (T this, char *sam_read_group_id, char *sam_read_group_name,
803 char *sam_read_group_library, char *sam_read_group_platform) {
804 int nchars = 0;
805 int index = 0, i;
806 Univinterval_T interval;
807 Chrpos_T interval_length;
808 char *label, buffer[20];
809 bool allocp;
810 int circular_typeint;
811
812 if (this == NULL) {
813 return 0;
814 } else {
815 circular_typeint = Univ_IIT_typeint(this,"circular");
816 }
817
818 for (i = 0; i < this->total_nintervals; i++) {
819 interval = &(this->intervals[i]);
820 label = Univ_IIT_label(this,index+1,&allocp);
821 nchars += strlen("@SQ\tSN:");
822 nchars += strlen(label);
823 if (allocp == true) {
824 FREE(label);
825 }
826 /* startpos = Univinterval_low(interval); */
827 /* endpos = startpos + Univinterval_length(interval) - 1U; */
828
829 interval_length = Univinterval_length(interval);
830 sprintf(buffer,"%u",interval_length);
831 nchars += strlen("\tLN:")+strlen(buffer);
832 if (Univinterval_type(interval) == circular_typeint) {
833 nchars += strlen("\ttp:circular");
834 }
835 nchars += strlen("\n");
836
837 index++;
838 }
839
840 if (sam_read_group_id != NULL) {
841 nchars += strlen("@RG\tID:");
842 nchars += strlen(sam_read_group_id);
843
844 if (sam_read_group_platform != NULL) {
845 nchars += strlen("\tPL:");
846 nchars += strlen(sam_read_group_platform);
847 }
848 if (sam_read_group_library != NULL) {
849 nchars += strlen("\tLB:");
850 nchars += strlen(sam_read_group_library);
851 }
852 nchars += strlen("\tSM:");
853 nchars += strlen(sam_read_group_name);
854 nchars += strlen("\n");
855 }
856
857 return nchars;
858 }
859
860
861 #else
862 /* For chromosome.iit file, which is stored in version 1 */
863 void
Univ_IIT_dump_sam(FILE * fp,T this,char * sam_read_group_id,char * sam_read_group_name,char * sam_read_group_library,char * sam_read_group_platform)864 Univ_IIT_dump_sam (FILE *fp, T this, char *sam_read_group_id, char *sam_read_group_name,
865 char *sam_read_group_library, char *sam_read_group_platform) {
866 int index = 0, i;
867 Univinterval_T interval;
868 char *label;
869 bool allocp;
870 int circular_typeint;
871
872 if (this == NULL) {
873 return;
874 } else {
875 circular_typeint = Univ_IIT_typeint(this,"circular");
876 }
877
878 for (i = 0; i < this->total_nintervals; i++) {
879 interval = &(this->intervals[i]);
880 label = Univ_IIT_label(this,index+1,&allocp);
881 fprintf(fp,"@SQ\tSN:%s",label);
882 if (allocp == true) {
883 FREE(label);
884 }
885 /* startpos = Univinterval_low(interval); */
886 /* endpos = startpos + Univinterval_length(interval) - 1U; */
887
888 fprintf(fp,"\tLN:%u",Univinterval_length(interval));
889 if (Univinterval_type(interval) == circular_typeint) {
890 fprintf(fp,"\ttp:circular");
891 }
892 fprintf(fp,"\n");
893
894 index++;
895 }
896
897 if (sam_read_group_id != NULL) {
898 fprintf(fp,"@RG\tID:%s",sam_read_group_id);
899 if (sam_read_group_platform != NULL) {
900 fprintf(fp,"\tPL:%s",sam_read_group_platform);
901 }
902 if (sam_read_group_library != NULL) {
903 fprintf(fp,"\tLB:%s",sam_read_group_library);
904 }
905 fprintf(fp,"\tSM:%s",sam_read_group_name);
906 fprintf(fp,"\n");
907 }
908
909 return;
910 }
911 #endif
912
913
914
915 Chrpos_T *
Univ_IIT_chrlengths(T this)916 Univ_IIT_chrlengths (T this) {
917 Chrpos_T *chrlengths;
918 int i;
919 Univinterval_T interval;
920
921 chrlengths = (Chrpos_T *) MALLOC(this->total_nintervals * sizeof(Chrpos_T));
922 for (i = 0; i < this->total_nintervals; i++) {
923 interval = &(this->intervals[i]);
924 chrlengths[i] = Univinterval_length(interval);
925 }
926
927 return chrlengths;
928 }
929
930
931 void
Univ_IIT_dump_labels(FILE * fp,T this)932 Univ_IIT_dump_labels (FILE *fp, T this) {
933 int i;
934 UINT4 start;
935 char *label;
936
937 for (i = 0; i < this->total_nintervals; i++) {
938 #ifdef WORDS_BIGENDIAN
939 start = Bigendian_convert_uint(this->labelpointers[i]);
940 #else
941 start = this->labelpointers[i];
942 #endif
943 label = &(this->labels[start]);
944 fprintf(fp,"%s ",label);
945 }
946 fprintf(fp,"\n");
947 return;
948 }
949
950
951
952 /* The iit file has a '\0' after each string, so functions know where
953 it ends */
954 char
Univ_IIT_annotation_firstchar(T this,int index)955 Univ_IIT_annotation_firstchar (T this, int index) {
956 int recno;
957 UINT4 start;
958
959 recno = index - 1; /* Convert to 0-based */
960
961 #ifdef WORDS_BIGENDIAN
962 start = Bigendian_convert_uint(this->annotpointers[recno]);
963 #else
964 start = this->annotpointers[recno];
965 #endif
966
967 return this->annotations[start];
968 }
969
970
971
972
973 /* For contig.iit file, which is stored in version 1 */
974 void
Univ_IIT_dump_contigs(T this,T chromosome_iit,bool directionalp)975 Univ_IIT_dump_contigs (T this, T chromosome_iit, bool directionalp) {
976 int index = 0, i, chromosome_index;
977 Univinterval_T interval;
978 Univcoord_T startpos, endpos, chroffset;
979 Chrpos_T chrstart, chrend;
980 char *label, firstchar, *chrstring;
981 bool allocp;
982
983 for (i = 0; i < this->total_nintervals; i++) {
984 interval = &(this->intervals[i]);
985 label = Univ_IIT_label(this,index+1,&allocp);
986 printf("%s\t",label);
987 if (allocp == true) {
988 FREE(label);
989 }
990 startpos = Univinterval_low(interval);
991 endpos = startpos + Univinterval_length(interval) - 1U;
992
993 chromosome_index = Univ_IIT_get_one(chromosome_iit,startpos,startpos);
994 chroffset = Univinterval_low(Univ_IIT_interval(chromosome_iit,chromosome_index));
995 chrstart = startpos - chroffset;
996 chrend = endpos - chroffset;
997
998 chrstring = Univ_IIT_label(chromosome_iit,chromosome_index,&allocp);
999
1000 if (directionalp == false) {
1001 printf("%llu..%llu\t",(unsigned long long) startpos+1U,(unsigned long long) endpos+1U);
1002 printf("%s:%u..%u\t",chrstring,chrstart+1U,chrend+1U);
1003
1004 } else {
1005 firstchar = Univ_IIT_annotation_firstchar(this,index+1);
1006 if (firstchar == '-') {
1007 printf("%llu..%llu\t",(unsigned long long) endpos+1U,(unsigned long long) startpos+1U);
1008 printf("%s:%u..%u\t",chrstring,chrend+1U,chrstart+1U);
1009 } else {
1010 printf("%llu..%llu\t",(unsigned long long) startpos+1U,(unsigned long long) endpos+1U);
1011 printf("%s:%u..%u\t",chrstring,chrstart+1U,chrend+1U);
1012 }
1013 }
1014 if (allocp == true) {
1015 FREE(chrstring);
1016 }
1017
1018 printf("%u",Univinterval_length(interval));
1019 if (Univinterval_type(interval) > 0) {
1020 printf("\t%s",Univ_IIT_typestring(this,Univinterval_type(interval)));
1021 }
1022 printf("\n");
1023
1024 index++;
1025 }
1026
1027 return;
1028 }
1029
1030
1031 /************************************************************************
1032 * For file format, see iit-write-univ.c
1033 ************************************************************************/
1034
1035 void
Univ_IIT_free(T * old)1036 Univ_IIT_free (T *old) {
1037
1038 if (*old != NULL) {
1039 if ((*old)->access == MMAPPED) {
1040 #ifdef HAVE_MMAP
1041 munmap((void *) (*old)->annot_mmap,(*old)->annot_length);
1042 munmap((void *) (*old)->annotpointers_mmap,(*old)->annotpointers_length);
1043 munmap((void *) (*old)->label_mmap,(*old)->label_length);
1044 munmap((void *) (*old)->labelpointers_mmap,(*old)->labelpointers_length);
1045 munmap((void *) (*old)->labelorder_mmap,(*old)->labelorder_length);
1046 #endif
1047 close((*old)->fd);
1048
1049 } else if ((*old)->access == FILEIO) {
1050 FREE((*old)->annotations);
1051 FREE((*old)->annotpointers);
1052 FREE((*old)->labels);
1053 FREE((*old)->labelpointers);
1054 FREE((*old)->labelorder);
1055 /* close((*old)->fd); -- closed in read_annotations */
1056
1057 } else if ((*old)->access == ALLOCATED_PRIVATE) {
1058 /* Nothing to close. IIT must have been created by Univ_IIT_new. */
1059
1060 } else if ((*old)->access == ALLOCATED_SHARED) {
1061 /* Nothing to close. IIT must have been created by Univ_IIT_new. */
1062
1063 } else {
1064 abort();
1065 }
1066
1067 FREE((*old)->typestrings);
1068 FREE((*old)->typepointers);
1069
1070 FREE((*old)->intervals);
1071
1072 /* Note: we are depending on Mem_free() to check that these are non-NULL */
1073 FREE((*old)->nodes);
1074 FREE((*old)->omegas);
1075 FREE((*old)->sigmas);
1076
1077 FREE(*old);
1078
1079 }
1080
1081 return;
1082 }
1083
1084
1085
1086 static void
move_relative(FILE * fp,off_t offset)1087 move_relative (FILE *fp, off_t offset) {
1088
1089 #ifdef HAVE_FSEEKO
1090 if (fseeko(fp,offset,SEEK_CUR) < 0) {
1091 fprintf(stderr,"Error in move_relative, seek\n");
1092 abort();
1093 }
1094 #else
1095 if (fseek(fp,(long) offset,SEEK_CUR) < 0) {
1096 fprintf(stderr,"Error in move_relative, seek\n");
1097 abort();
1098 }
1099 #endif
1100
1101 return;
1102 }
1103
1104
1105 static size_t
read_tree_univ(size_t offset,size_t filesize,FILE * fp,char * filename,T new)1106 read_tree_univ (size_t offset, size_t filesize, FILE *fp, char *filename, T new) {
1107 size_t items_read;
1108 int i;
1109 UINT4 uint4;
1110
1111 if ((offset += sizeof(int)*(new->total_nintervals+1)) > filesize) {
1112 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after sigmas %lld, filesize %lld). Did you generate it using iit_store?\n",
1113 filename,(long long int) offset,(long long int) filesize);
1114 exit(9);
1115 } else {
1116 new->sigmas = (int *) CALLOC(new->total_nintervals+1,sizeof(int));
1117 if ((items_read = FREAD_INTS(new->sigmas,new->total_nintervals+1,fp)) != (unsigned int) new->total_nintervals + 1) {
1118 fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1119 exit(9);
1120 }
1121 }
1122
1123 if ((offset += sizeof(int)*(new->total_nintervals+1)) > filesize) {
1124 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after omegas %lld, filesize %lld). Did you generate it using iit_store?\n",
1125 filename,(long long int) offset,(long long int) filesize);
1126 exit(9);
1127 } else {
1128 new->omegas = (int *) CALLOC(new->total_nintervals+1,sizeof(int));
1129 if ((items_read = FREAD_INTS(new->omegas,new->total_nintervals+1,fp)) != (unsigned int) new->total_nintervals + 1) {
1130 fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1131 exit(9);
1132 }
1133 }
1134
1135 debug(printf("nnodes: %d\n",new->nnodes));
1136 if (new->nnodes == 0) {
1137 new->nodes = (struct Univ_FNode_T *) NULL;
1138 } else {
1139 new->nodes = (struct Univ_FNode_T *) CALLOC(new->nnodes,sizeof(struct Univ_FNode_T));
1140 #ifdef WORDS_BIGENDIAN
1141 if (new->coord_values_8p == true) {
1142 #ifdef HAVE_64_BIT
1143 for (i = 0; i < new->nnodes; i++) {
1144 Bigendian_fread_uint8(&(new->nodes[i].value),fp);
1145 Bigendian_fread_int(&(new->nodes[i].a),fp);
1146 Bigendian_fread_int(&(new->nodes[i].b),fp);
1147 Bigendian_fread_int(&(new->nodes[i].leftindex),fp);
1148 Bigendian_fread_int(&(new->nodes[i].rightindex),fp);
1149 }
1150 offset += (sizeof(UINT8)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
1151 #else
1152 fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit. Cannot continue.\n");
1153 exit(9);
1154 #endif
1155 } else {
1156 for (i = 0; i < new->nnodes; i++) {
1157 Bigendian_fread_uint(&uint4,fp);
1158 new->nodes[i].value = (UINT8) uint4;
1159 Bigendian_fread_int(&(new->nodes[i].a),fp);
1160 Bigendian_fread_int(&(new->nodes[i].b),fp);
1161 Bigendian_fread_int(&(new->nodes[i].leftindex),fp);
1162 Bigendian_fread_int(&(new->nodes[i].rightindex),fp);
1163 }
1164 offset += (sizeof(UINT4)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
1165 }
1166 #else
1167 if (new->coord_values_8p == true) {
1168 #ifdef HAVE_64_BIT
1169 #if 1
1170 offset += sizeof(struct Univ_FNode_T)*fread(new->nodes,sizeof(struct Univ_FNode_T),new->nnodes,fp);
1171 #else
1172 for (i = 0; i < new->nnodes; i++) {
1173 FREAD_UINT8(&(new->nodes[i].value),fp);
1174 FREAD_INT(&(new->nodes[i].a),fp);
1175 FREAD_INT(&(new->nodes[i].b),fp);
1176 FREAD_INT(&(new->nodes[i].leftindex),fp);
1177 FREAD_INT(&(new->nodes[i].rightindex),fp);
1178 printf("i %d, node value %llu\n",i,(unsigned long long) new->nodes[i].value);
1179 }
1180 offset += (sizeof(UINT8)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
1181 #endif
1182 #else
1183 fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit. Cannot continue.\n");
1184 exit(9);
1185 #endif
1186 } else {
1187 for (i = 0; i < new->nnodes; i++) {
1188 FREAD_UINT(&uint4,fp);
1189 new->nodes[i].value = (UINT8) uint4;
1190 FREAD_INT(&(new->nodes[i].a),fp);
1191 FREAD_INT(&(new->nodes[i].b),fp);
1192 FREAD_INT(&(new->nodes[i].leftindex),fp);
1193 FREAD_INT(&(new->nodes[i].rightindex),fp);
1194 }
1195 offset += (sizeof(UINT4)+sizeof(int)+sizeof(int)+sizeof(int)+sizeof(int))*new->nnodes;
1196 }
1197 #endif
1198 if (offset > filesize) {
1199 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after nodes %lld, filesize %lld). Did you generate it using iit_store?\n",
1200 filename,(long long int) offset,(long long int) filesize);
1201 exit(9);
1202 }
1203 }
1204 debug(printf("\n"));
1205
1206 return offset;
1207 }
1208
1209
1210 static size_t
read_intervals_univ(size_t offset,size_t filesize,FILE * fp,char * filename,T new)1211 read_intervals_univ (size_t offset, size_t filesize, FILE *fp, char *filename, T new) {
1212 int i;
1213 UINT4 uint4;
1214
1215 #ifdef WORDS_BIGENDIAN
1216 if (new->coord_values_8p == true) {
1217 #ifdef HAVE_64_BIT
1218 for (i = 0; i < new->total_nintervals; i++) {
1219 Bigendian_fread_uint8(&(new->intervals[i].low),fp);
1220 Bigendian_fread_uint8(&(new->intervals[i].high),fp);
1221 Bigendian_fread_int(&(new->intervals[i].type),fp);
1222 }
1223 #else
1224 fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit. Cannot continue.\n");
1225 exit(9);
1226 #endif
1227 } else {
1228 for (i = 0; i < new->total_nintervals; i++) {
1229 Bigendian_fread_uint(&uint4,fp);
1230 new->intervals[i].low = (Univcoord_T) uint4;
1231 Bigendian_fread_uint(&uint4,fp);
1232 new->intervals[i].high = (Univcoord_T) uint4;
1233 Bigendian_fread_int(&(new->intervals[i].type),fp);
1234 }
1235 offset += (sizeof(UINT4)+sizeof(UINT4)+sizeof(int))*new->total_nintervals;
1236 }
1237 #else
1238 if (new->coord_values_8p == true) {
1239 #ifdef HAVE_64_BIT
1240 for (i = 0; i < new->total_nintervals; i++) {
1241 FREAD_UINT8(&(new->intervals[i].low),fp);
1242 FREAD_UINT8(&(new->intervals[i].high),fp);
1243 FREAD_INT(&(new->intervals[i].type),fp);
1244 }
1245 offset += (sizeof(UINT8)+sizeof(UINT8)+sizeof(int))*new->total_nintervals;
1246 #else
1247 fprintf(stderr,"IIT file contains 64-bit coordinates, but this computer is only 32-bit. Cannot continue.\n");
1248 exit(9);
1249 #endif
1250 } else {
1251 for (i = 0; i < new->total_nintervals; i++) {
1252 FREAD_UINT(&uint4,fp);
1253 new->intervals[i].low = (Univcoord_T) uint4;
1254 FREAD_UINT(&uint4,fp);
1255 new->intervals[i].high = (Univcoord_T) uint4;
1256 FREAD_INT(&(new->intervals[i].type),fp);
1257 }
1258 offset += (sizeof(UINT4)+sizeof(UINT4)+sizeof(int))*new->total_nintervals;
1259 }
1260 #endif
1261 if (offset > filesize) {
1262 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after intervals %lld, filesize %lld). Did you generate it using iit_store?\n",
1263 filename,(long long int) offset,(long long int) filesize);
1264 exit(9);
1265 }
1266
1267 return offset;
1268 }
1269
1270
1271 static void
read_words(size_t offset,size_t filesize,FILE * fp,T new)1272 read_words (size_t offset, size_t filesize, FILE *fp, T new) {
1273 size_t stringlen;
1274 UINT4 length;
1275 #ifdef DEBUG
1276 int i;
1277 #endif
1278
1279 new->typepointers = (UINT4 *) CALLOC(new->ntypes+1,sizeof(UINT4));
1280 offset += sizeof(int)*FREAD_UINTS(new->typepointers,new->ntypes+1,fp);
1281 debug(
1282 printf("typepointers:");
1283 for (i = 0; i < new->ntypes+1; i++) {
1284 printf(" %u",new->typepointers[i]);
1285 }
1286 printf("\n");
1287 );
1288
1289 stringlen = new->typepointers[new->ntypes];
1290 if (stringlen == 0) {
1291 new->typestrings = (char *) NULL;
1292 } else {
1293 new->typestrings = (char *) CALLOC(stringlen,sizeof(char));
1294 offset += sizeof(char)*FREAD_CHARS(new->typestrings,stringlen,fp);
1295 }
1296 debug(
1297 printf("typestrings:\n");
1298 for (s = 0; s < stringlen; s++) {
1299 printf("%c",new->typestrings[s]);
1300 }
1301 printf("\n");
1302 );
1303
1304 debug1(printf("Starting read of labelorder offset/length\n"));
1305 new->labelorder_offset = offset;
1306 new->labelorder_length = (size_t) (new->total_nintervals*sizeof(int));
1307 /* fprintf(stderr,"Doing a move_relative for labelorder_length %zu\n",new->labelorder_length); */
1308 move_relative(fp,new->labelorder_length);
1309 offset += new->labelorder_length;
1310
1311 debug1(printf("Starting read of labelpointer offset/length\n"));
1312 new->labelpointers_offset = offset;
1313 new->labelpointers_length = (size_t) ((new->total_nintervals+1)*sizeof(UINT4));
1314 /* fprintf(stderr,"Doing a move_relative for labelpointer %zu\n",new->total_nintervals * sizeof(UINT4)); */
1315 move_relative(fp,new->total_nintervals * sizeof(UINT4));
1316 FREAD_UINT(&length,fp);
1317 new->label_length = (size_t) length;
1318 offset += new->labelpointers_length;
1319
1320 debug1(printf("Starting read of label offset/length\n"));
1321 new->label_offset = offset;
1322 /* new->label_length computed above */
1323 /* fprintf(stderr,"Doing a move_relative for label_length %zu\n",new->label_length); */
1324 move_relative(fp,new->label_length);
1325 offset += new->label_length;
1326
1327 debug1(printf("Starting read of annotpointers offset/length\n"));
1328 new->annotpointers_offset = offset;
1329 new->annotpointers_length = (size_t) ((new->total_nintervals+1)*sizeof(UINT4));
1330 offset += new->annotpointers_length;
1331
1332 new->annot_offset = offset;
1333
1334 #ifdef BAD_32BIT
1335 /* This fails if length > 4 GB */
1336 move_relative(fp,new->total_nintervals * sizeof(unsigned int));
1337 FREAD_UINT(&length,fp);
1338 new->annot_length = (size_t) length;
1339 fprintf(stderr,"Incorrect length: %u\n",length);
1340 #else
1341 new->annot_length = filesize - new->annot_offset;
1342 /* fprintf(stderr,"annot_length: %zu\n",new->annot_length); */
1343 #endif
1344
1345 #if 0
1346 /* To do this check, we need to get stringlen for annotation similarly to that for labels */
1347 last_offset = offset + sizeof(char)*stringlen;
1348 if (last_offset != filesize) {
1349 fprintf(stderr,"Problem with last_offset (%lld) not equal to filesize = (%lld)\n",
1350 (long long int) last_offset,(long long int) filesize);
1351 exit(9);
1352 }
1353 #endif
1354
1355 return;
1356 }
1357
1358 static void
read_words_debug(size_t offset,size_t filesize,FILE * fp,T new)1359 read_words_debug (size_t offset, size_t filesize, FILE *fp, T new) {
1360 size_t stringlen, s;
1361 UINT4 length;
1362 int i;
1363 #if 0
1364 size_t last_offset;
1365 #endif
1366
1367 new->typepointers = (unsigned int *) CALLOC(new->ntypes+1,sizeof(unsigned int));
1368 offset += sizeof(int)*FREAD_UINTS(new->typepointers,new->ntypes+1,fp);
1369 printf("typepointers:");
1370 for (i = 0; i < new->ntypes+1; i++) {
1371 printf(" %u",new->typepointers[i]);
1372 }
1373 printf("\n");
1374
1375 stringlen = new->typepointers[new->ntypes];
1376 if (stringlen == 0) {
1377 new->typestrings = (char *) NULL;
1378 } else {
1379 new->typestrings = (char *) CALLOC(stringlen,sizeof(char));
1380 offset += sizeof(char)*FREAD_CHARS(new->typestrings,stringlen,fp);
1381 }
1382 printf("typestrings:\n");
1383 for (s = 0; s < stringlen; s++) {
1384 printf("%c",new->typestrings[s]);
1385 }
1386 printf("\n");
1387
1388 debug1(printf("Starting read of labelorder offset/length\n"));
1389 new->labelorder_offset = offset;
1390 new->labelorder_length = (size_t) (new->total_nintervals*sizeof(int));
1391 move_relative(fp,new->labelorder_length);
1392 offset += new->labelorder_length;
1393
1394 debug1(printf("Starting read of labelpointers offset/length\n"));
1395 new->labelpointers_offset = offset;
1396 new->labelpointers_length = (size_t) ((new->total_nintervals+1)*sizeof(UINT4));
1397 move_relative(fp,new->total_nintervals * sizeof(UINT4));
1398 FREAD_UINT(&length,fp);
1399 new->label_length = (size_t) length;
1400 offset += new->labelpointers_length;
1401
1402 fprintf(stderr,"label_length: %zu\n",new->label_length);
1403 debug1(printf("Starting read of label offset/length\n"));
1404 new->label_offset = offset;
1405 /* new->label_length computed above */
1406 move_relative(fp,new->label_length);
1407 offset += new->label_length;
1408
1409 debug1(printf("Starting read of annotpointers offset/length\n"));
1410 new->annotpointers_offset = offset;
1411 new->annotpointers_length = (size_t) ((new->total_nintervals+1)*sizeof(UINT4));
1412 offset += new->annotpointers_length;
1413
1414 new->annot_offset = offset;
1415
1416 #ifdef BAD_32BIT
1417 /* This fails if length > 4 GB */
1418 move_relative(fp,new->total_nintervals * sizeof(unsigned int));
1419 FREAD_UINT(&length,fp);
1420 new->annot_length = (size_t) length;
1421 fprintf(stderr,"Incorrect length: %u\n",length);
1422 #else
1423 new->annot_length = filesize - new->annot_offset;
1424 fprintf(stderr,"annot_length: %zu\n",new->annot_length);
1425 #endif
1426
1427 #if 0
1428 /* To do this check, we need to get stringlen for annotation similarly to that for labels */
1429 last_offset = offset + sizeof(char)*stringlen;
1430 if (last_offset != filesize) {
1431 fprintf(stderr,"Problem with last_offset (%lld) not equal to filesize = (%lld)\n",
1432 (long long int) last_offset,(long long int) filesize);
1433 exit(9);
1434 }
1435 #endif
1436
1437 return;
1438 }
1439
1440 /* This function only assigns pointers. Subsequent accesses to
1441 memory, other than char *, still need to be read correctly
1442 by bigendian machines */
1443 /* Previously allowed read/write access, but we can assume read-only access */
1444 #ifdef HAVE_MMAP
1445 static bool
mmap_annotations(char * filename,T new,bool readonlyp)1446 mmap_annotations (char *filename, T new, bool readonlyp) {
1447 int remainder;
1448
1449 assert(readonlyp == true);
1450
1451 if ((new->fd = open(filename,O_RDONLY,0764)) < 0) {
1452 fprintf(stderr,"Error: can't open file %s with open for reading\n",filename);
1453 exit(9);
1454 }
1455
1456 new->labelorder_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->labelorder_offset,new->labelorder_length,
1457 /*randomp*/true);
1458 debug(fprintf(stderr,"labelorder_mmap is %p\n",new->labelorder_mmap));
1459 new->labelorder = (int *) &(new->labelorder_mmap[remainder]);
1460 new->labelorder_length += (size_t) remainder;
1461
1462 new->labelpointers_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->labelpointers_offset,new->labelpointers_length,
1463 /*randomp*/true);
1464 debug(fprintf(stderr,"labelpointers_mmap is %p\n",new->labelpointers_mmap));
1465 new->labelpointers = (UINT4 *) &(new->labelpointers_mmap[remainder]);
1466 new->labelpointers_length += (size_t) remainder;
1467
1468 new->label_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->label_offset,new->label_length,
1469 /*randomp*/true);
1470 debug(fprintf(stderr,"labels_mmap is %p\n",new->label_mmap));
1471 new->labels = (char *) &(new->label_mmap[remainder]);
1472 new->label_length += (size_t) remainder;
1473
1474 new->annotpointers_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->annotpointers_offset,new->annotpointers_length,
1475 /*randomp*/true);
1476 debug(fprintf(stderr,"annotpointers_mmap is %p\n",new->annotpointers_mmap));
1477 new->annotpointers = (UINT4 *) &(new->annotpointers_mmap[remainder]);
1478 new->annotpointers_length += (size_t) remainder;
1479
1480 new->annot_mmap = (char *) Access_mmap_offset(&remainder,new->fd,new->annot_offset,new->annot_length,
1481 /*randomp*/true);
1482 debug(fprintf(stderr,"annots_mmap is %p\n",new->annot_mmap));
1483 new->annotations = (char *) &(new->annot_mmap[remainder]);
1484 new->annot_length += (size_t) remainder;
1485
1486
1487 if (new->labelorder == NULL || new->labelpointers == NULL || new->labels == NULL) {
1488 fprintf(stderr,"Memory mapping failed in reading IIT file %s. Using slow file IO instead.\n",filename);
1489 return false;
1490 }
1491
1492 if (new->annotpointers == NULL || new->annotations == NULL) {
1493 fprintf(stderr,"Memory mapping failed in reading IIT file %s. Using slow file IO instead.\n",filename);
1494 return false;
1495 }
1496
1497 return true;
1498 }
1499 #endif
1500
1501
1502 /* Used if access is FILEIO. Subsequent accesses by bigendian
1503 machines to anything but (char *) will still need to convert. */
1504 static void
read_annotations(T new)1505 read_annotations (T new) {
1506
1507 file_move_absolute(new->fd,new->labelorder_offset,sizeof(int),/*n*/0);
1508 new->labelorder = (int *) CALLOC(new->total_nintervals,sizeof(int));
1509 read(new->fd,new->labelorder,new->total_nintervals*sizeof(int));
1510
1511 file_move_absolute(new->fd,new->labelpointers_offset,sizeof(UINT4),/*n*/0);
1512 new->labelpointers = (UINT4 *) CALLOC(new->total_nintervals+1,sizeof(UINT4));
1513 read(new->fd,new->labelpointers,(new->total_nintervals+1)*sizeof(UINT4));
1514
1515 file_move_absolute(new->fd,new->label_offset,sizeof(char),/*n*/0);
1516 new->labels = (char *) CALLOC(new->label_length,sizeof(char));
1517 read(new->fd,new->labels,new->label_length*sizeof(char));
1518
1519 file_move_absolute(new->fd,new->annotpointers_offset,sizeof(UINT4),/*n*/0);
1520 new->annotpointers = (UINT4 *) CALLOC(new->total_nintervals+1,sizeof(UINT4));
1521 read(new->fd,new->annotpointers,(new->total_nintervals+1)*sizeof(UINT4));
1522
1523 file_move_absolute(new->fd,new->annot_offset,sizeof(char),/*n*/0);
1524 new->annotations = (char *) CALLOC(new->annot_length,sizeof(char));
1525 read(new->fd,new->annotations,new->annot_length*sizeof(char));
1526
1527 return;
1528 }
1529
1530
1531
1532 T
Univ_IIT_read(char * filename,bool readonlyp,bool add_iit_p)1533 Univ_IIT_read (char *filename, bool readonlyp, bool add_iit_p) {
1534 T new;
1535 FILE *fp;
1536 char *newfile = NULL;
1537 size_t offset = 0, filesize;
1538
1539 /* printf("Reading IIT file %s\n",filename); */
1540 if (add_iit_p == true) {
1541 newfile = (char *) CALLOC(strlen(filename)+strlen(".iit")+1,sizeof(char));
1542 sprintf(newfile,"%s.iit",filename);
1543 if ((fp = FOPEN_READ_BINARY(newfile)) != NULL) {
1544 filename = newfile;
1545 } else if ((fp = FOPEN_READ_BINARY(filename)) == NULL) {
1546 /* fprintf(stderr,"Cannot open IIT file %s or %s\n",filename,newfile); */
1547 FREE(newfile);
1548 return NULL;
1549 }
1550 } else if ((fp = FOPEN_READ_BINARY(filename)) == NULL) {
1551 /* fprintf(stderr,"Cannot open IIT file %s\n",filename); */
1552 return NULL;
1553 }
1554
1555 new = (T) MALLOC(sizeof(*new));
1556
1557 filesize = Access_filesize(filename);
1558
1559 if (FREAD_INT(&new->total_nintervals,fp) < 1) {
1560 fprintf(stderr,"IIT file %s appears to be empty\n",filename);
1561 return NULL;
1562 } else if ((offset += sizeof(int)) > filesize) {
1563 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after first byte %lld, filesize %lld). Did you generate it using iit_store?\n",
1564 filename,(long long int) offset,(long long int) filesize);
1565 return NULL;
1566 }
1567
1568 if (new->total_nintervals < 0) {
1569 #ifdef LARGE_GENOMES
1570 new->coord_values_8p = true;
1571 new->total_nintervals = -new->total_nintervals;
1572 #elif defined(UTILITYP)
1573 new->coord_values_8p = true;
1574 new->total_nintervals = -new->total_nintervals;
1575 #else
1576 fprintf(stderr,"This is a large genome of more than 2^32 (4 billion) bp.\n");
1577 #ifdef GSNAP
1578 fprintf(stderr,"You should run gsnapl instead.\n");
1579 #else
1580 fprintf(stderr,"You should run gmapl instead.\n");
1581 #endif
1582 exit(9);
1583 #endif
1584
1585 } else if (new->total_nintervals > 0) {
1586 new->coord_values_8p = false;
1587
1588 } else {
1589 abort();
1590 }
1591
1592 debug(printf("version: 1\n"));
1593 debug(printf("total_nintervals: %d\n",new->total_nintervals));
1594 debug(printf("coord values 8p: %d\n",new->coord_values_8p));
1595
1596
1597 if (FREAD_INT(&new->ntypes,fp) < 1) {
1598 fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1599 return NULL;
1600 } else if (new->ntypes < 0) {
1601 fprintf(stderr,"IIT file %s appears to have a negative number of types\n",filename);
1602 return NULL;
1603 } else if ((offset += sizeof(int)) > filesize) {
1604 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after ntypes %lld, filesize %lld). Did you generate it using iit_store?\n",
1605 filename,(long long int) offset,(long long int) filesize);
1606 return NULL;
1607 }
1608 debug(printf("ntypes: %d\n",new->ntypes));
1609
1610 if (FREAD_INT(&new->nnodes,fp) < 1) {
1611 fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1612 return NULL;
1613 } else if (new->nnodes < 0) {
1614 fprintf(stderr,"IIT file %s appears to have a negative number of nodes\n",filename);
1615 return NULL;
1616 } else if ((offset += sizeof(int)) > filesize) {
1617 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after nnodes %lld, filesize %lld). Did you generate it using iit_store?\n",
1618 filename,(long long int) offset,(long long int) filesize);
1619 return NULL;
1620 }
1621
1622 /* new->divsort = NO_SORT; */
1623 offset = read_tree_univ(offset,filesize,fp,filename,new);
1624
1625 new->intervals = (struct Univinterval_T *) CALLOC(new->total_nintervals,sizeof(struct Univinterval_T));
1626 offset = read_intervals_univ(offset,filesize,fp,filename,new);
1627
1628 read_words(offset,filesize,fp,new);
1629 fclose(fp);
1630
1631 #ifndef HAVE_MMAP
1632 debug1(printf("No mmap available. Reading annotations\n"));
1633 new->access = FILEIO;
1634 new->fd = Access_fileio(filename);
1635 read_annotations(new);
1636 close(new->fd);
1637 /* pthread_mutex_init(&new->read_mutex,NULL); */
1638 #else
1639 debug1(printf("mmap available. Setting up pointers to annotations\n"));
1640 new->access = MMAPPED;
1641 if (mmap_annotations(filename,new,readonlyp) == false) {
1642 debug1(printf(" Failed. Reading annotations\n"));
1643 new->access = FILEIO;
1644 new->fd = Access_fileio(filename);
1645 read_annotations(new);
1646 close(new->fd);
1647 /* pthread_mutex_init(&new->read_mutex,NULL); */
1648 }
1649 #endif
1650
1651 if (newfile != NULL) {
1652 FREE(newfile);
1653 }
1654
1655 return new;
1656 }
1657
1658
1659 void
Univ_IIT_debug(char * filename)1660 Univ_IIT_debug (char *filename) {
1661 T new;
1662 FILE *fp;
1663 char *newfile = NULL;
1664 size_t offset = 0, filesize;
1665 bool add_iit_p = false;
1666
1667 if (add_iit_p == true) {
1668 newfile = (char *) CALLOC(strlen(filename)+strlen(".iit")+1,sizeof(char));
1669 sprintf(newfile,"%s.iit",filename);
1670 if ((fp = FOPEN_READ_BINARY(newfile)) != NULL) {
1671 filename = newfile;
1672 } else if ((fp = FOPEN_READ_BINARY(filename)) == NULL) {
1673 /* fprintf(stderr,"Cannot open IIT file %s or %s\n",filename,newfile); */
1674 FREE(newfile);
1675 return;
1676 }
1677 } else if ((fp = FOPEN_READ_BINARY(filename)) == NULL) {
1678 /* fprintf(stderr,"Cannot open IIT file %s\n",filename); */
1679 return;
1680 }
1681
1682 new = (T) MALLOC(sizeof(*new));
1683
1684 filesize = Access_filesize(filename);
1685
1686 if (FREAD_INT(&new->total_nintervals,fp) < 1) {
1687 fprintf(stderr,"IIT file %s appears to be empty\n",filename);
1688 return;
1689 } else if ((offset += sizeof(int)) > filesize) {
1690 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after first byte %lld, filesize %lld). Did you generate it using iit_store?\n",
1691 filename,(long long int) offset,(long long int) filesize);
1692 return;
1693 }
1694
1695 if (new->total_nintervals < 0) {
1696 #ifdef HAVE_64_BIT
1697 new->coord_values_8p = true;
1698 new->total_nintervals = -new->total_nintervals;
1699 #else
1700 fprintf(stderr,"IIT file has 64-bit coordinates, but this machine is only 32-bit, so it cannot process it.\n");
1701 exit(9);
1702 #endif
1703
1704 } else if (new->total_nintervals > 0) {
1705 new->coord_values_8p = false;
1706
1707 } else {
1708 abort();
1709 }
1710
1711 printf("version: 1\n");
1712 printf("total_nintervals: %d\n",new->total_nintervals);
1713 printf("coord values 8p: %d\n",new->coord_values_8p);
1714
1715
1716 if (FREAD_INT(&new->ntypes,fp) < 1) {
1717 fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1718 return;
1719 } else if (new->ntypes < 0) {
1720 fprintf(stderr,"IIT file %s appears to have a negative number of types\n",filename);
1721 return;
1722 } else if ((offset += sizeof(int)) > filesize) {
1723 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after ntypes %lld, filesize %lld). Did you generate it using iit_store?\n",
1724 filename,(long long int) offset,(long long int) filesize);
1725 return;
1726 }
1727 printf("ntypes: %d\n",new->ntypes);
1728
1729 if (FREAD_INT(&new->nnodes,fp) < 1) {
1730 fprintf(stderr,"IIT file %s appears to be truncated\n",filename);
1731 return;
1732 } else if (new->nnodes < 0) {
1733 fprintf(stderr,"IIT file %s appears to have a negative number of nodes\n",filename);
1734 return;
1735 } else if ((offset += sizeof(int)) > filesize) {
1736 fprintf(stderr,"IIT file %s has an invalid binary format -- offset is too large (offset after nnodes %lld, filesize %lld). Did you generate it using iit_store?\n",
1737 filename,(long long int) offset,(long long int) filesize);
1738 return;
1739 }
1740
1741 /* new->divsort = NO_SORT; */
1742 offset = read_tree_univ(offset,filesize,fp,filename,new);
1743
1744 new->intervals = (struct Univinterval_T *) CALLOC(new->total_nintervals,sizeof(struct Univinterval_T));
1745 offset = read_intervals_univ(offset,filesize,fp,filename,new);
1746
1747 read_words_debug(offset,filesize,fp,new);
1748 fclose(fp);
1749
1750 #ifndef HAVE_MMAP
1751 debug1(printf("No mmap available. Reading annotations\n"));
1752 new->access = FILEIO;
1753 new->fd = Access_fileio(filename);
1754 read_annotations(new);
1755 close(new->fd);
1756 /* pthread_mutex_init(&new->read_mutex,NULL); */
1757 #else
1758 debug1(printf("mmap available. Setting up pointers to annotations\n"));
1759 new->access = MMAPPED;
1760 if (mmap_annotations(filename,new,/*readonlyp*/true) == false) {
1761 debug1(printf(" Failed. Reading annotations\n"));
1762 new->access = FILEIO;
1763 new->fd = Access_fileio(filename);
1764 read_annotations(new);
1765 close(new->fd);
1766 /* pthread_mutex_init(&new->read_mutex,NULL); */
1767 }
1768 #endif
1769
1770 if (newfile != NULL) {
1771 FREE(newfile);
1772 }
1773
1774 Univ_IIT_free(&new);
1775
1776 return;
1777 }
1778
1779
1780 /************************************************************************/
1781
1782 static void
fnode_query_aux(int * min,int * max,T this,int nodeindex,Univcoord_T x)1783 fnode_query_aux (int *min, int *max, T this, int nodeindex, Univcoord_T x) {
1784 int lambda;
1785 Univ_FNode_T node;
1786
1787 if (nodeindex == -1) {
1788 return;
1789 }
1790
1791 node = &(this->nodes[nodeindex]);
1792 debug(printf("Entered fnode_query_aux with nodeindex %d: a %d, b %d, leftindex %d, rightindex %d, value %llu\n",
1793 nodeindex,node->a,node->b,node->leftindex,node->rightindex,(unsigned long long) node->value));
1794
1795 if (x == node->value) {
1796 debug(printf("%lluD:\n",(unsigned long long) node->value));
1797 if (node->a < *min) {
1798 *min = node->a;
1799 }
1800 if (node->b > *max) {
1801 *max = node->b;
1802 }
1803 return;
1804 } else if (x < node->value) {
1805 debug(printf("x %llu < node->value %llu\n",(unsigned long long) x,(unsigned long long) node->value));
1806 fnode_query_aux(&(*min),&(*max),this,node->leftindex,x);
1807 debug(printf("%lluL:\n",(unsigned long long) node->value));
1808 if (node->a < *min) {
1809 *min = node->a;
1810 }
1811 for (lambda = node->a; lambda <= node->b; lambda++) {
1812 debug(printf("Looking at lambda %d, segment %d\n",
1813 lambda,this->sigmas[lambda]));
1814 if (Univinterval_is_contained(x,this->intervals,this->sigmas[lambda]) == true) {
1815 if (lambda > *max) {
1816 *max = lambda;
1817 }
1818 } else {
1819 return;
1820 }
1821 }
1822 return;
1823 } else {
1824 /* (node->value < x) */
1825 debug(printf("x %llu > node->value %llu\n",(unsigned long long) x,(unsigned long long) node->value));
1826 fnode_query_aux(&(*min),&(*max),this,node->rightindex,x);
1827 debug(printf("%lluR:\n",(unsigned long long) node->value));
1828 if (node->b > *max) {
1829 *max = node->b;
1830 }
1831 for (lambda = node->b; lambda >= node->a; lambda--) {
1832 debug(printf("Looking at lambda %d, segment %d\n",
1833 lambda,this->omegas[lambda]));
1834 if (Univinterval_is_contained(x,this->intervals,this->omegas[lambda]) == true) {
1835 if (lambda < *min) {
1836 *min = lambda;
1837 }
1838 } else {
1839 return;
1840 }
1841 }
1842 return;
1843 }
1844 }
1845
1846 /************************************************************************/
1847
1848 int *
Univ_IIT_find(int * nmatches,T this,char * label)1849 Univ_IIT_find (int *nmatches, T this, char *label) {
1850 int *matches = NULL, j;
1851 int low, middle, high, recno;
1852 bool foundp = false;
1853 int cmp;
1854
1855 low = 0;
1856 high = this->total_nintervals;
1857 *nmatches = 0;
1858
1859 while (!foundp && low < high) {
1860 middle = low + (high - low)/2;
1861
1862 #ifdef WORDS_BIGENDIAN
1863 cmp = strcmp(label,&(this->labels[Bigendian_convert_uint(this->labelpointers[Bigendian_convert_int(this->labelorder[middle])])]));
1864 #else
1865 cmp = strcmp(label,&(this->labels[this->labelpointers[this->labelorder[middle]]]));
1866 #endif
1867
1868 if (cmp < 0) {
1869 high = middle;
1870 } else if (cmp > 0) {
1871 low = middle + 1;
1872 } else {
1873 foundp = true;
1874 }
1875 }
1876
1877 if (foundp == true) {
1878 low = middle;
1879 #ifdef WORDS_BIGENDIAN
1880 while (low-1 >= 0 &&
1881 !strcmp(label,&(this->labels[Bigendian_convert_uint(this->labelpointers[Bigendian_convert_int(this->labelorder[low-1])])]))) {
1882 low--;
1883 }
1884 #else
1885 while (low-1 >= 0 &&
1886 !strcmp(label,&(this->labels[this->labelpointers[this->labelorder[low-1]]]))) {
1887 low--;
1888 }
1889 #endif
1890
1891 high = middle;
1892 #ifdef WORDS_BIGENDIAN
1893 while (high+1 < this->total_nintervals &&
1894 !strcmp(label,&(this->labels[Bigendian_convert_uint(this->labelpointers[Bigendian_convert_int(this->labelorder[high+1])])]))) {
1895 high++;
1896 }
1897 #else
1898 while (high+1 < this->total_nintervals &&
1899 !strcmp(label,&(this->labels[this->labelpointers[this->labelorder[high+1]]]))) {
1900 high++;
1901 }
1902 #endif
1903
1904
1905 *nmatches = high - low + 1;
1906 if (*nmatches > 0) {
1907 matches = (int *) CALLOC(*nmatches,sizeof(int));
1908 j = 0;
1909 for (recno = low; recno <= high; recno++) {
1910 #ifdef WORDS_BIGENDIAN
1911 #ifdef DEBUG
1912 printf("Pushing %d:%d\n",recno,Bigendian_convert_int(this->labelorder[recno]));
1913 #endif
1914 matches[j++] = Bigendian_convert_int(this->labelorder[recno])+1;
1915
1916 #else
1917 #ifdef DEBUG
1918 printf("Pushing %d:%d\n",recno,this->labelorder[recno]);
1919 #endif
1920 matches[j++] = this->labelorder[recno]+1;
1921 #endif
1922 }
1923 }
1924 }
1925
1926 return matches;
1927 }
1928
1929 /* Slow. Used before binary search method above. */
1930 int
Univ_IIT_find_linear(T this,char * label)1931 Univ_IIT_find_linear (T this, char *label) {
1932 int i;
1933 char *p;
1934
1935 for (i = 0; i < this->total_nintervals; i++) {
1936 #ifdef WORDS_BIGENDIAN
1937 p = &(this->labels[Bigendian_convert_uint(this->labelpointers[i])]);
1938 #else
1939 p = &(this->labels[this->labelpointers[i]]);
1940 #endif
1941 while (isspace((int) *p)) {
1942 p++;
1943 }
1944 if (!strcmp(label,p)) {
1945 return i + 1;
1946 }
1947 }
1948
1949 return -1;
1950 }
1951
1952 /* Returns 1-based index */
1953 int
Univ_IIT_find_one(T this,char * label)1954 Univ_IIT_find_one (T this, char *label) {
1955 int index;
1956 int *matches, nmatches;
1957
1958 matches = Univ_IIT_find(&nmatches,this,label);
1959 if (nmatches == 0) {
1960 /*
1961 fprintf(stderr,"Expected one match for %s, but got 0\n",
1962 label);
1963 */
1964 index = -1;
1965 } else {
1966 if (nmatches > 1) {
1967 fprintf(stderr,"Expected one match for %s, but got %d\n",
1968 label,nmatches);
1969 }
1970 index = matches[0];
1971 FREE(matches);
1972 }
1973
1974 return index;
1975 }
1976
1977
1978 /************************************************************************/
1979
1980
1981 static const Except_T iit_error = { "IIT problem" };
1982
1983 static int
int_compare(const void * a,const void * b)1984 int_compare (const void *a, const void *b) {
1985 int x = * (int *) a;
1986 int y = * (int *) b;
1987
1988 if (x < y) {
1989 return -1;
1990 } else if (y < x) {
1991 return +1;
1992 } else {
1993 return 0;
1994 }
1995 }
1996
1997 int *
Univ_IIT_get(int * nmatches,T this,Univcoord_T x,Univcoord_T y)1998 Univ_IIT_get (int *nmatches, T this, Univcoord_T x, Univcoord_T y) {
1999 /* int *sorted; */
2000 int *matches = NULL, *uniq, neval, nuniq, i;
2001 int lambda, prev;
2002 int min1, max1 = 0, min2, max2 = 0;
2003 int nintervals;
2004
2005 if ((nintervals = this->total_nintervals) == 0) {
2006 *nmatches = 0;
2007 return (int *) NULL;
2008 } else {
2009 min1 = min2 = nintervals + 1;
2010 }
2011
2012 debug(printf("Entering Univ_IIT_get with query %u %u\n",x,y));
2013 fnode_query_aux(&min1,&max1,this,0,x);
2014 fnode_query_aux(&min2,&max2,this,0,y);
2015 debug(printf("min1=%d max1=%d min2=%d max2=%d\n",min1,max1,min2,max2));
2016
2017 *nmatches = 0;
2018 if (max2 >= min1) {
2019 neval = (max2 - min1 + 1) + (max2 - min1 + 1);
2020 matches = (int *) CALLOC(neval,sizeof(int));
2021 uniq = (int *) CALLOC(neval,sizeof(int));
2022
2023 i = 0;
2024 for (lambda = min1; lambda <= max2; lambda++) {
2025 matches[i++] = this->sigmas[lambda];
2026 matches[i++] = this->omegas[lambda];
2027 }
2028
2029 /* Eliminate duplicates */
2030 qsort(matches,neval,sizeof(int),int_compare);
2031 nuniq = 0;
2032 prev = 0;
2033 debug(printf("unique segments in lambda %d to %d:",min1,max2));
2034 for (i = 0; i < neval; i++) {
2035 if (matches[i] != prev) {
2036 debug(printf(" %d",matches[i]));
2037 uniq[nuniq++] = matches[i];
2038 prev = matches[i];
2039 }
2040 }
2041 debug(printf("\n"));
2042
2043 for (i = 0; i < nuniq; i++) {
2044 if (Univinterval_overlap_p(x,y,this->intervals,uniq[i]) == true) {
2045 matches[(*nmatches)++] = uniq[i];
2046 debug(printf("Pushing overlapping segment %d (%u..%u)\n",uniq[i],
2047 Univinterval_low(&(this->intervals[uniq[i]-1])),
2048 Univinterval_high(&(this->intervals[uniq[i]-1]))));
2049 } else {
2050 debug(printf("Not pushing non-overlapping segment %d (%u..%u)\n",uniq[i],
2051 Univinterval_low(&(this->intervals[uniq[i]-1])),
2052 Univinterval_high(&(this->intervals[uniq[i]-1]))));
2053 }
2054 }
2055
2056 FREE(uniq);
2057 }
2058
2059 return matches;
2060
2061 #if 0
2062 /* For some reason, sort_matches_by_position is annotated as being for version 3 and later */
2063 if (sortp == false) {
2064 return matches;
2065 } else if (this->version <= 2) {
2066 sorted = sort_matches_by_type(this,matches,*nmatches,/*alphabetizep*/true);
2067 FREE(matches);
2068 return sorted;
2069 } else {
2070 sorted = sort_matches_by_position(this,matches,*nmatches);
2071 FREE(matches);
2072 return sorted;
2073 }
2074 #endif
2075 }
2076
2077
2078 /* Guaranteed to return one result, even if coordinate is out of bounds */
2079 int
Univ_IIT_get_one(T this,Univcoord_T x,Univcoord_T y)2080 Univ_IIT_get_one (T this, Univcoord_T x, Univcoord_T y) {
2081 int lambda;
2082 int min1, max1 = 0, min2, max2 = 0;
2083 bool stopp;
2084 Univinterval_T interval;
2085
2086 min1 = min2 = this->total_nintervals + 1;
2087
2088 debug(printf("Entering Univ_IIT_get_one with query %llu %llu\n",(unsigned long long) x,(unsigned long long) y));
2089 fnode_query_aux(&min1,&max1,this,0,x);
2090 fnode_query_aux(&min2,&max2,this,0,y);
2091 debug(printf("min1=%d max1=%d min2=%d max2=%d\n",min1,max1,min2,max2));
2092
2093 if (max2 >= min1) {
2094 for (lambda = min1; lambda <= max2; lambda++) {
2095 if (Univinterval_overlap_p(x,y,this->intervals,this->sigmas[lambda]) == true) {
2096 return this->sigmas[lambda];
2097 }
2098 }
2099 for (lambda = min1; lambda <= max2; lambda++) {
2100 if (Univinterval_overlap_p(x,y,this->intervals,this->omegas[lambda]) == true) {
2101 return this->omegas[lambda];
2102 }
2103 }
2104 }
2105
2106 /* fprintf(stderr,"Expected one match for %u--%u, but got none\n",x,y); */
2107 /* If we miss (e.g., for circular chromosome), then report the chromosome below */
2108 /* Look at betas or omegas for left flank */
2109 lambda = min1 - 1;
2110 stopp = false;
2111 while (lambda >= 1 && stopp == false) {
2112 interval = &(this->intervals[this->omegas[lambda]-1]);
2113 if (Univinterval_high(interval) >= x) {
2114 lambda--;
2115 } else {
2116 return this->omegas[lambda];
2117 }
2118 }
2119
2120 return this->omegas[/*lambda*/1];
2121 }
2122
2123
2124 /* Assumes we have a running value for *chrhigh and that successive calls have increasing values for left */
2125 Univcoord_T
Univ_IIT_update_chrnum(int * chrnum,Univcoord_T * chroffset,Univcoord_T chrhigh,Chrpos_T * chrlength,T this,Univcoord_T low,Univcoord_T high,int circular_typeint)2126 Univ_IIT_update_chrnum (int *chrnum, Univcoord_T *chroffset, Univcoord_T chrhigh, Chrpos_T *chrlength,
2127 T this, Univcoord_T low, Univcoord_T high, int circular_typeint) {
2128 int start_chrnum, end_chrnum, test_chrnum;
2129 Univcoord_T test_chroffset, test_chrhigh;
2130 Chrpos_T most_inbounds, inbounds, test_chrlength;
2131
2132 #ifdef FAST_CHR_UPDATE
2133 /* Need to call Univ_IIT_intervals_setup at start of program */
2134 Univcoord_T goal;
2135 int nchromosomes_local = nchromosomes;
2136 Univcoord_T *chrhighs_local = chrhighs;
2137 int j, testj, endj;
2138 #endif
2139
2140 if (low > chrhigh) {
2141 debug4(printf("\nUpdating chrhigh because low %u > chrhigh %u\n",low,chrhigh));
2142
2143 #ifndef FAST_CHR_UPDATE
2144 *chrnum = Univ_IIT_get_one(this,low,low);
2145 Univ_IIT_interval_bounds(&(*chroffset),&chrhigh,&(*chrlength),this,*chrnum,circular_typeint);
2146
2147 #else
2148 /* Code from segment-search.c */
2149 j = 1;
2150 goal = low + 1;
2151 while (j < nchromosomes_local && chrhighs_local[j] < goal) {
2152 j <<= 1; /* gallop by 2 */
2153 }
2154 if (j >= nchromosomes_local) {
2155 j = binary_search(j >> 1,nchromosomes_local,chrhighs_local,goal);
2156 } else {
2157 j = binary_search(j >> 1,j,chrhighs_local,goal);
2158 }
2159 chrnum += j;
2160
2161 chrhigh = chrhighs[chrnum-1];
2162 chroffset = chroffsets[chrnum-1];
2163 chrlength = chrlengths[chrnum-1];
2164 chrhighs_local += j;
2165 nchromosomes_local -= j;
2166 debug4(printf("Got chrnum %d, chroffset %u, chrhigh %u\n",chrnum,chroffset,chrhigh));
2167 #endif
2168
2169 debug4(printf("Updating chrnum to be %d\n",*chrnum));
2170 }
2171
2172 if (high > chrhigh) {
2173 /* Straddles two or more chromosomes */
2174 /* Test first chromosome */
2175 start_chrnum = *chrnum;
2176 most_inbounds = chrhigh - low;
2177 debug4(printf("Straddle. First chromosome inbounds: %u. ",most_inbounds));
2178
2179
2180 /* Test middle chromosomes */
2181 #ifndef FAST_CHR_UPDATE
2182 end_chrnum = Univ_IIT_get_one(this,high-1U,high-1U);
2183
2184 for (test_chrnum = start_chrnum + 1; test_chrnum < end_chrnum; test_chrnum++) {
2185 Univ_IIT_interval_bounds(&test_chroffset,&test_chrhigh,&test_chrlength,this,test_chrnum,circular_typeint);
2186 debug4(printf("Next inbounds: %u. ",test_chrhigh - test_chroffset));
2187 if ((inbounds = test_chrhigh - test_chroffset) > most_inbounds) {
2188 *chrnum = test_chrnum; *chroffset = test_chroffset; chrhigh = test_chrhigh, *chrlength = test_chrlength;
2189 most_inbounds = inbounds;
2190 }
2191 }
2192
2193 #else
2194 /* Code from segment-search.c */
2195 endj = 0;
2196 while (endj+1 < nchromosomes_local && high > chrhighs_local[endj+1]) {
2197 debug1(printf("For high %u, advancing to chrhigh %u\n",high,chrhighs_local[endj+1]));
2198 endj++;
2199 }
2200 if (endj+1 < nchromosomes_local) {
2201 endj++;
2202 }
2203 debug4(printf("endj is %d\n",endj));
2204
2205 for (testj = 1; testj < endj; testj++) {
2206 debug1(printf("Next inbounds: %u. ",chrhighs_local[testj] - chroffsets[(chrnum+testj)-1]));
2207 if ((inbounds = chrhighs_local[testj] - chroffsets[(chrnum+testj)-1]) > most_inbounds) {
2208 j = testj;
2209 most_inbounds = inbounds;
2210 }
2211 }
2212 #endif
2213
2214
2215 /* Test last chromosome */
2216 #ifndef FAST_CHR_UPDATE
2217 Univ_IIT_interval_bounds(&test_chroffset,&test_chrhigh,&test_chrlength,this,end_chrnum,circular_typeint);
2218 debug4(printf("Last inbounds: %u\n",high - test_chroffset));
2219 if ((/*inbounds = */high - test_chroffset) > most_inbounds) {
2220 *chrnum = end_chrnum; *chroffset = test_chroffset; chrhigh = test_chrhigh, *chrlength = test_chrlength;
2221 /* most_inbounds = inbounds; */
2222 }
2223
2224 #else
2225 /* Code from segment-search.c */
2226 if ((/*inbounds = */high - chroffsets[(chrnum+endj)-1]) > most_inbounds) {
2227 j = endj;
2228 }
2229
2230 chrnum += j;
2231 chrhigh = chrhighs[chrnum-1];
2232 chroffset = chroffsets[chrnum-1];
2233 chrlength = chrlengths[chrnum-1];
2234 chrhighs_local += j;
2235 nchromosomes_local -= j;
2236 #endif
2237 }
2238
2239 debug4(printf("Returning chrnum %d\n",*chrnum));
2240 return chrhigh;
2241 }
2242
2243
2244 int
Univ_IIT_get_chrnum(Univcoord_T * chroffset,Univcoord_T * chrhigh,Chrpos_T * chrlength,T this,Univcoord_T low,Univcoord_T high,int circular_typeint)2245 Univ_IIT_get_chrnum (Univcoord_T *chroffset, Univcoord_T *chrhigh, Chrpos_T *chrlength,
2246 T this, Univcoord_T low, Univcoord_T high, int circular_typeint) {
2247 int chrnum, start_chrnum, end_chrnum, test_chrnum;
2248 Univcoord_T test_chroffset, test_chrhigh;
2249 Chrpos_T most_inbounds, inbounds, test_chrlength;
2250
2251 assert(low < high);
2252
2253 chrnum = Univ_IIT_get_one(this,low,low);
2254 Univ_IIT_interval_bounds(&(*chroffset),&(*chrhigh),&(*chrlength),this,chrnum,circular_typeint);
2255 debug4(printf("Setting chrnum to be %d\n",chrnum));
2256
2257 if (high > *chrhigh) {
2258 /* Straddles two or more chromosomes */
2259 /* Test first chromosome */
2260 start_chrnum = chrnum;
2261 most_inbounds = (*chrhigh) - low;
2262 debug4(printf("Straddle. First chromosome inbounds: %u. ",most_inbounds));
2263
2264
2265 /* Test middle chromosomes */
2266 end_chrnum = Univ_IIT_get_one(this,high-1U,high-1U);
2267 for (test_chrnum = start_chrnum + 1; test_chrnum < end_chrnum; test_chrnum++) {
2268 Univ_IIT_interval_bounds(&test_chroffset,&test_chrhigh,&test_chrlength,this,test_chrnum,circular_typeint);
2269 debug4(printf("Next inbounds: %u. ",test_chrhigh - test_chroffset));
2270 if ((inbounds = test_chrhigh - test_chroffset) > most_inbounds) {
2271 chrnum = test_chrnum; *chroffset = test_chroffset; *chrhigh = test_chrhigh, *chrlength = test_chrlength;
2272 most_inbounds = inbounds;
2273 }
2274 }
2275
2276
2277 /* Test last chromosome */
2278 Univ_IIT_interval_bounds(&test_chroffset,&test_chrhigh,&test_chrlength,this,end_chrnum,circular_typeint);
2279 debug4(printf("Last inbounds: %u\n",high - test_chroffset));
2280 if ((inbounds = high - test_chroffset) > most_inbounds) {
2281 chrnum = end_chrnum; *chroffset = test_chroffset; *chrhigh = test_chrhigh, *chrlength = test_chrlength;
2282 /* most_inbounds = inbounds; */
2283 }
2284 }
2285
2286 debug4(printf("Returning chrnum %d\n",chrnum));
2287 return chrnum;
2288 }
2289
2290
2291 int
Univ_IIT_get_trnum(Trcoord_T * troffset,Trcoord_T * trhigh,Trcoord_T * trlength,T this,Trcoord_T low,Trcoord_T high)2292 Univ_IIT_get_trnum (Trcoord_T *troffset, Trcoord_T *trhigh, Trcoord_T *trlength,
2293 T this, Trcoord_T low, Trcoord_T high) {
2294 int trnum, start_trnum, end_trnum, test_trnum;
2295 Trcoord_T test_troffset, test_trhigh;
2296 Trcoord_T most_inbounds, inbounds, test_trlength;
2297
2298 assert(low < high);
2299
2300 trnum = Univ_IIT_get_one(this,low,low);
2301 Univ_IIT_interval_bounds_small(&(*troffset),&(*trhigh),&(*trlength),this,trnum);
2302 debug4(printf("Setting trnum to be %d\n",trnum));
2303
2304 if (high > *trhigh) {
2305 /* Straddles two or more transcripts */
2306 /* Test first transcript */
2307 start_trnum = trnum;
2308 most_inbounds = (*trhigh) - low;
2309 debug4(printf("Straddle. First transcript inbounds: %u. ",most_inbounds));
2310
2311
2312 /* Test middle transcripts */
2313 end_trnum = Univ_IIT_get_one(this,high-1U,high-1U);
2314 for (test_trnum = start_trnum + 1; test_trnum < end_trnum; test_trnum++) {
2315 Univ_IIT_interval_bounds_small(&test_troffset,&test_trhigh,&test_trlength,this,test_trnum);
2316 debug4(printf("Next inbounds: %u. ",test_trhigh - test_troffset));
2317 if ((inbounds = test_trhigh - test_troffset) > most_inbounds) {
2318 trnum = test_trnum; *troffset = test_troffset; *trhigh = test_trhigh, *trlength = test_trlength;
2319 most_inbounds = inbounds;
2320 }
2321 }
2322
2323 /* Test last transcript */
2324 Univ_IIT_interval_bounds_small(&test_troffset,&test_trhigh,&test_trlength,this,end_trnum);
2325 debug4(printf("Last inbounds: %u\n",high - test_troffset));
2326 if ((inbounds = high - test_troffset) > most_inbounds) {
2327 trnum = end_trnum; *troffset = test_troffset; *trhigh = test_trhigh, *trlength = test_trlength;
2328 /* most_inbounds = inbounds; */
2329 }
2330 }
2331
2332 debug4(printf("Returning trnum %d\n",trnum));
2333 return trnum;
2334 }
2335
2336
2337 /* Generally called where intervals don't overlap, like chromosomes,
2338 and where x == y. */
2339 /*
2340 int
2341 Univ_IIT_get_one_safe (T this, Univcoord_T x, Univcoord_T y) {
2342 int index;
2343 int *matches, nmatches;
2344
2345 matches = Univ_IIT_get(&nmatches,this,x,y,sortp);
2346 if (nmatches != 1) {
2347 fprintf(stderr,"Expected one match for %u--%u, but got %d\n",
2348 x,y,nmatches);
2349 abort();
2350 }
2351 index = matches[0];
2352 FREE(matches);
2353 return index;
2354 }
2355 */
2356
2357
2358 /* Note: Procedure call from get-genome.c needed to subtract 1 from
2359 position and then add 1 to chrpos */
2360 char *
Univ_IIT_string_from_position(Chrpos_T * chrpos,Univcoord_T position,T chromosome_iit)2361 Univ_IIT_string_from_position (Chrpos_T *chrpos, Univcoord_T position, T chromosome_iit) {
2362 char *string, *chrstring;
2363 int index;
2364 bool allocp;
2365
2366 index = Univ_IIT_get_one(chromosome_iit,position,position);
2367 *chrpos = position - Univinterval_low(Univ_IIT_interval(chromosome_iit,index));
2368 chrstring = Univ_IIT_label(chromosome_iit,index,&allocp);
2369 if (allocp == true) {
2370 return chrstring;
2371 } else {
2372 string = (char *) CALLOC(strlen(chrstring)+1,sizeof(char));
2373 strcpy(string,chrstring);
2374 return string;
2375 }
2376 }
2377
2378
2379