1 #include <limits.h>
2 #include <float.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <math.h>
6 #include "bigWig.h"
7 #include "bwCommon.h"
8
9 /// @cond SKIP
10 struct val_t {
11 uint32_t tid;
12 uint32_t start;
13 uint32_t nBases;
14 float min, max, sum, sumsq;
15 double scalar;
16 struct val_t *next;
17 };
18 /// @endcond
19
20 //Create a chromList_t and attach it to a bigWigFile_t *. Returns NULL on error
21 //Note that chroms and lengths are duplicated, so you MUST free the input
bwCreateChromList(char ** chroms,uint32_t * lengths,int64_t n)22 chromList_t *bwCreateChromList(char **chroms, uint32_t *lengths, int64_t n) {
23 int64_t i = 0;
24 chromList_t *cl = calloc(1, sizeof(chromList_t));
25 if(!cl) return NULL;
26
27 cl->nKeys = n;
28 cl->chrom = malloc(sizeof(char*)*n);
29 cl->len = malloc(sizeof(uint32_t)*n);
30 if(!cl->chrom) goto error;
31 if(!cl->len) goto error;
32
33 for(i=0; i<n; i++) {
34 cl->len[i] = lengths[i];
35 cl->chrom[i] = strdup(chroms[i]);
36 if(!cl->chrom[i]) goto error;
37 }
38
39 return cl;
40
41 error:
42 if(i) {
43 int64_t j;
44 for(j=0; j<i; j++) free(cl->chrom[j]);
45 }
46 if(cl) {
47 if(cl->chrom) free(cl->chrom);
48 if(cl->len) free(cl->len);
49 free(cl);
50 }
51 return NULL;
52 }
53
54 //If maxZooms == 0, then 0 is used (i.e., there are no zoom levels). If maxZooms < 0 or > 65535 then 10 is used.
55 //TODO allow changing bufSize and blockSize
bwCreateHdr(bigWigFile_t * fp,int32_t maxZooms)56 int bwCreateHdr(bigWigFile_t *fp, int32_t maxZooms) {
57 if(!fp->isWrite) return 1;
58 bigWigHdr_t *hdr = calloc(1, sizeof(bigWigHdr_t));
59 if(!hdr) return 2;
60
61 hdr->version = 4;
62 if(maxZooms < 0 || maxZooms > 65535) {
63 hdr->nLevels = 10;
64 } else {
65 hdr->nLevels = maxZooms;
66 }
67
68 hdr->bufSize = 32768; //When the file is finalized this is reset if fp->writeBuffer->compressPsz is 0!
69 hdr->minVal = DBL_MAX;
70 hdr->maxVal = DBL_MIN;
71 fp->hdr = hdr;
72 fp->writeBuffer->blockSize = 64;
73
74 //Allocate the writeBuffer buffers
75 fp->writeBuffer->compressPsz = compressBound(hdr->bufSize);
76 fp->writeBuffer->compressP = malloc(fp->writeBuffer->compressPsz);
77 if(!fp->writeBuffer->compressP) return 3;
78 fp->writeBuffer->p = calloc(1,hdr->bufSize);
79 if(!fp->writeBuffer->p) return 4;
80
81 return 0;
82 }
83
84 //return 0 on success
writeAtPos(void * ptr,size_t sz,size_t nmemb,size_t pos,FILE * fp)85 static int writeAtPos(void *ptr, size_t sz, size_t nmemb, size_t pos, FILE *fp) {
86 size_t curpos = ftell(fp);
87 if(fseek(fp, pos, SEEK_SET)) return 1;
88 if(fwrite(ptr, sz, nmemb, fp) != nmemb) return 2;
89 if(fseek(fp, curpos, SEEK_SET)) return 3;
90 return 0;
91 }
92
93 //We lose keySize bytes on error
writeChromList(FILE * fp,chromList_t * cl)94 static int writeChromList(FILE *fp, chromList_t *cl) {
95 uint16_t k;
96 uint32_t j, magic = CIRTREE_MAGIC;
97 uint32_t nperblock = (cl->nKeys > 0x7FFF) ? 0x7FFF : cl->nKeys; //Items per leaf/non-leaf, there are no unsigned ints in java :(
98 uint32_t nblocks, keySize = 0, valSize = 8; //In theory valSize could be optimized, in practice that'd be annoying
99 uint64_t i, nonLeafEnd, leafSize, nextLeaf;
100 uint8_t eight;
101 int64_t i64;
102 char *chrom;
103 size_t l;
104
105 if(cl->nKeys > 1073676289) {
106 fprintf(stderr, "[writeChromList] Error: Currently only 1,073,676,289 contigs are supported. If you really need more then please post a request on github.\n");
107 return 1;
108 }
109 nblocks = cl->nKeys/nperblock;
110 nblocks += ((cl->nKeys % nperblock) > 0)?1:0;
111
112 for(i64=0; i64<cl->nKeys; i64++) {
113 l = strlen(cl->chrom[i64]);
114 if(l>keySize) keySize = l;
115 }
116 l--; //We don't null terminate strings, because schiess mich tot
117 chrom = calloc(keySize, sizeof(char));
118
119 //Write the root node of a largely pointless tree
120 if(fwrite(&magic, sizeof(uint32_t), 1, fp) != 1) return 1;
121 if(fwrite(&nperblock, sizeof(uint32_t), 1, fp) != 1) return 2;
122 if(fwrite(&keySize, sizeof(uint32_t), 1, fp) != 1) return 3;
123 if(fwrite(&valSize, sizeof(uint32_t), 1, fp) != 1) return 4;
124 if(fwrite(&(cl->nKeys), sizeof(uint64_t), 1, fp) != 1) return 5;
125
126 //Padding?
127 i=0;
128 if(fwrite(&i, sizeof(uint64_t), 1, fp) != 1) return 6;
129
130 //Do we need a non-leaf node?
131 if(nblocks > 1) {
132 eight = 0;
133 if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 7;
134 if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 8; //padding
135 if(fwrite(&nblocks, sizeof(uint16_t), 1, fp) != 1) return 8;
136 nonLeafEnd = ftell(fp) + nperblock * (keySize + 8);
137 leafSize = nperblock * (keySize + 8) + 4;
138 for(i=0; i<nblocks; i++) { //Why yes, this is pointless
139 chrom = strncpy(chrom, cl->chrom[i * nperblock], keySize);
140 nextLeaf = nonLeafEnd + i * leafSize;
141 if(fwrite(chrom, keySize, 1, fp) != 1) return 9;
142 if(fwrite(&nextLeaf, sizeof(uint64_t), 1, fp) != 1) return 10;
143 }
144 for(i=0; i<keySize; i++) chrom[i] = '\0';
145 nextLeaf = 0;
146 for(i=nblocks; i<nperblock; i++) {
147 if(fwrite(chrom, keySize, 1, fp) != 1) return 9;
148 if(fwrite(&nextLeaf, sizeof(uint64_t), 1, fp) != 1) return 10;
149 }
150 }
151
152 //Write the leaves
153 nextLeaf = 0;
154 for(i=0, j=0; i<nblocks; i++) {
155 eight = 1;
156 if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 11;
157 eight = 0;
158 if(fwrite(&eight, sizeof(uint8_t), 1, fp) != 1) return 12;
159 if(cl->nKeys - j < nperblock) {
160 k = cl->nKeys - j;
161 if(fwrite(&k, sizeof(uint16_t), 1, fp) != 1) return 13;
162 } else {
163 if(fwrite(&nperblock, sizeof(uint16_t), 1, fp) != 1) return 13;
164 }
165 for(k=0; k<nperblock; k++) {
166 if(j>=cl->nKeys) {
167 if(chrom[0]) {
168 for(l=0; l<keySize; l++) chrom[l] = '\0';
169 }
170 if(fwrite(chrom, keySize, 1, fp) != 1) return 15;
171 if(fwrite(&nextLeaf, sizeof(uint64_t), 1, fp) != 1) return 16;
172 } else {
173 chrom = strncpy(chrom, cl->chrom[j], keySize);
174 if(fwrite(chrom, keySize, 1, fp) != 1) return 15;
175 if(fwrite(&j, sizeof(uint32_t), 1, fp) != 1) return 16;
176 if(fwrite(&(cl->len[j++]), sizeof(uint32_t), 1, fp) != 1) return 17;
177 }
178 }
179 }
180
181 free(chrom);
182 return 0;
183 }
184
185 //returns 0 on success
186 //Still need to fill in indexOffset
bwWriteHdr(bigWigFile_t * bw)187 int bwWriteHdr(bigWigFile_t *bw) {
188 uint32_t magic = BIGWIG_MAGIC;
189 uint16_t two = 4;
190 FILE *fp;
191 void *p = calloc(58, sizeof(uint8_t)); //58 bytes of nothing
192 if(!bw->isWrite) return 1;
193
194 //The header itself, largely just reserving space...
195 fp = bw->URL->x.fp;
196 if(!fp) return 2;
197 if(fseek(fp, 0, SEEK_SET)) return 3;
198 if(fwrite(&magic, sizeof(uint32_t), 1, fp) != 1) return 4;
199 if(fwrite(&two, sizeof(uint16_t), 1, fp) != 1) return 5;
200 if(fwrite(p, sizeof(uint8_t), 58, fp) != 58) return 6;
201
202 //Empty zoom headers
203 if(bw->hdr->nLevels) {
204 for(two=0; two<bw->hdr->nLevels; two++) {
205 if(fwrite(p, sizeof(uint8_t), 24, fp) != 24) return 9;
206 }
207 }
208
209 //Update summaryOffset and write an empty summary block
210 bw->hdr->summaryOffset = ftell(fp);
211 if(fwrite(p, sizeof(uint8_t), 40, fp) != 40) return 10;
212 if(writeAtPos(&(bw->hdr->summaryOffset), sizeof(uint64_t), 1, 0x2c, fp)) return 11;
213
214 //Write the chromosome list as a stupid freaking tree (because let's TREE ALL THE THINGS!!!)
215 bw->hdr->ctOffset = ftell(fp);
216 if(writeChromList(fp, bw->cl)) return 7;
217 if(writeAtPos(&(bw->hdr->ctOffset), sizeof(uint64_t), 1, 0x8, fp)) return 8;
218
219 //Update the dataOffset
220 bw->hdr->dataOffset = ftell(fp);
221 if(writeAtPos(&bw->hdr->dataOffset, sizeof(uint64_t), 1, 0x10, fp)) return 12;
222
223 //Save space for the number of blocks
224 if(fwrite(p, sizeof(uint8_t), 8, fp) != 8) return 13;
225
226 free(p);
227 return 0;
228 }
229
insertIndexNode(bigWigFile_t * fp,bwRTreeNode_t * leaf)230 static int insertIndexNode(bigWigFile_t *fp, bwRTreeNode_t *leaf) {
231 bwLL *l = malloc(sizeof(bwLL));
232 if(!l) return 1;
233 l->node = leaf;
234 l->next = NULL;
235
236 if(!fp->writeBuffer->firstIndexNode) {
237 fp->writeBuffer->firstIndexNode = l;
238 } else {
239 fp->writeBuffer->currentIndexNode->next = l;
240 }
241 fp->writeBuffer->currentIndexNode = l;
242 return 0;
243 }
244
245 //0 on success
appendIndexNodeEntry(bigWigFile_t * fp,uint32_t tid0,uint32_t tid1,uint32_t start,uint32_t end,uint64_t offset,uint64_t size)246 static int appendIndexNodeEntry(bigWigFile_t *fp, uint32_t tid0, uint32_t tid1, uint32_t start, uint32_t end, uint64_t offset, uint64_t size) {
247 bwLL *n = fp->writeBuffer->currentIndexNode;
248 if(!n) return 1;
249 if(n->node->nChildren >= fp->writeBuffer->blockSize) return 2;
250
251 n->node->chrIdxStart[n->node->nChildren] = tid0;
252 n->node->baseStart[n->node->nChildren] = start;
253 n->node->chrIdxEnd[n->node->nChildren] = tid1;
254 n->node->baseEnd[n->node->nChildren] = end;
255 n->node->dataOffset[n->node->nChildren] = offset;
256 n->node->x.size[n->node->nChildren] = size;
257 n->node->nChildren++;
258 return 0;
259 }
260
261 //Returns 0 on success
addIndexEntry(bigWigFile_t * fp,uint32_t tid0,uint32_t tid1,uint32_t start,uint32_t end,uint64_t offset,uint64_t size)262 static int addIndexEntry(bigWigFile_t *fp, uint32_t tid0, uint32_t tid1, uint32_t start, uint32_t end, uint64_t offset, uint64_t size) {
263 bwRTreeNode_t *node;
264
265 if(appendIndexNodeEntry(fp, tid0, tid1, start, end, offset, size)) {
266 //The last index node is full, we need to add a new one
267 node = calloc(1, sizeof(bwRTreeNode_t));
268 if(!node) return 1;
269
270 //Allocate and set the fields
271 node->isLeaf = 1;
272 node->nChildren = 1;
273 node->chrIdxStart = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize);
274 if(!node->chrIdxStart) goto error;
275 node->baseStart = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize);
276 if(!node->baseStart) goto error;
277 node->chrIdxEnd = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize);
278 if(!node->chrIdxEnd) goto error;
279 node->baseEnd = malloc(sizeof(uint32_t)*fp->writeBuffer->blockSize);
280 if(!node->baseEnd) goto error;
281 node->dataOffset = malloc(sizeof(uint64_t)*fp->writeBuffer->blockSize);
282 if(!node->dataOffset) goto error;
283 node->x.size = malloc(sizeof(uint64_t)*fp->writeBuffer->blockSize);
284 if(!node->x.size) goto error;
285
286 node->chrIdxStart[0] = tid0;
287 node->baseStart[0] = start;
288 node->chrIdxEnd[0] = tid1;
289 node->baseEnd[0] = end;
290 node->dataOffset[0] = offset;
291 node->x.size[0] = size;
292
293 if(insertIndexNode(fp, node)) goto error;
294 }
295
296 return 0;
297
298 error:
299 if(node->chrIdxStart) free(node->chrIdxStart);
300 if(node->baseStart) free(node->baseStart);
301 if(node->chrIdxEnd) free(node->chrIdxEnd);
302 if(node->baseEnd) free(node->baseEnd);
303 if(node->dataOffset) free(node->dataOffset);
304 if(node->x.size) free(node->x.size);
305 return 2;
306 }
307
308 /*
309 * TODO:
310 * The buffer size and compression sz need to be determined elsewhere (and p and compressP filled in!)
311 */
flushBuffer(bigWigFile_t * fp)312 static int flushBuffer(bigWigFile_t *fp) {
313 bwWriteBuffer_t *wb = fp->writeBuffer;
314 uLongf sz = wb->compressPsz;
315 uint16_t nItems;
316 if(!fp->writeBuffer->l) return 0;
317 if(!wb->ltype) return 0;
318
319 //Fill in the header
320 if(!memcpy(wb->p, &(wb->tid), sizeof(uint32_t))) return 1;
321 if(!memcpy(wb->p+4, &(wb->start), sizeof(uint32_t))) return 2;
322 if(!memcpy(wb->p+8, &(wb->end), sizeof(uint32_t))) return 3;
323 if(!memcpy(wb->p+12, &(wb->step), sizeof(uint32_t))) return 4;
324 if(!memcpy(wb->p+16, &(wb->span), sizeof(uint32_t))) return 5;
325 if(!memcpy(wb->p+20, &(wb->ltype), sizeof(uint8_t))) return 6;
326 //1 byte padding
327 //Determine the number of items
328 switch(wb->ltype) {
329 case 1:
330 nItems = (wb->l-24)/12;
331 break;
332 case 2:
333 nItems = (wb->l-24)/8;
334 break;
335 case 3:
336 nItems = (wb->l-24)/4;
337 break;
338 default:
339 return 7;
340 }
341 if(!memcpy(wb->p+22, &nItems, sizeof(uint16_t))) return 8;
342
343 if(sz) {
344 //compress
345 if(compress(wb->compressP, &sz, wb->p, wb->l) != Z_OK) return 9;
346
347 //write the data to disk
348 if(fwrite(wb->compressP, sizeof(uint8_t), sz, fp->URL->x.fp) != sz) return 10;
349 } else {
350 sz = wb->l;
351 if(fwrite(wb->p, sizeof(uint8_t), wb->l, fp->URL->x.fp) != wb->l) return 10;
352 }
353
354 //Add an entry into the index
355 if(addIndexEntry(fp, wb->tid, wb->tid, wb->start, wb->end, bwTell(fp)-sz, sz)) return 11;
356
357 wb->nBlocks++;
358 wb->l = 24;
359 return 0;
360 }
361
updateStats(bigWigFile_t * fp,uint32_t span,float val)362 static void updateStats(bigWigFile_t *fp, uint32_t span, float val) {
363 if(val < fp->hdr->minVal) fp->hdr->minVal = val;
364 else if(val > fp->hdr->maxVal) fp->hdr->maxVal = val;
365 fp->hdr->nBasesCovered += span;
366 fp->hdr->sumData += span*val;
367 fp->hdr->sumSquared += span*pow(val,2);
368
369 fp->writeBuffer->nEntries++;
370 fp->writeBuffer->runningWidthSum += span;
371 }
372
373 //12 bytes per entry
bwAddIntervals(bigWigFile_t * fp,char ** chrom,uint32_t * start,uint32_t * end,float * values,uint32_t n)374 int bwAddIntervals(bigWigFile_t *fp, char **chrom, uint32_t *start, uint32_t *end, float *values, uint32_t n) {
375 uint32_t tid = 0, i;
376 char *lastChrom = NULL;
377 bwWriteBuffer_t *wb = fp->writeBuffer;
378 if(!n) return 0; //Not an error per se
379 if(!fp->isWrite) return 1;
380 if(!wb) return 2;
381
382 //Flush if needed
383 if(wb->ltype != 1) if(flushBuffer(fp)) return 3;
384 if(wb->l+36 > fp->hdr->bufSize) if(flushBuffer(fp)) return 4;
385 lastChrom = chrom[0];
386 tid = bwGetTid(fp, chrom[0]);
387 if(tid == (uint32_t) -1) return 5;
388 if(tid != wb->tid) {
389 if(flushBuffer(fp)) return 6;
390 wb->tid = tid;
391 wb->start = start[0];
392 wb->end = end[0];
393 }
394
395 //Ensure that everything is set correctly
396 wb->ltype = 1;
397 if(wb->l <= 24) {
398 wb->start = start[0];
399 wb->span = 0;
400 wb->step = 0;
401 }
402 if(!memcpy(wb->p+wb->l, start, sizeof(uint32_t))) return 7;
403 if(!memcpy(wb->p+wb->l+4, end, sizeof(uint32_t))) return 8;
404 if(!memcpy(wb->p+wb->l+8, values, sizeof(float))) return 9;
405 updateStats(fp, end[0]-start[0], values[0]);
406 wb->l += 12;
407
408 for(i=1; i<n; i++) {
409 if(strcmp(chrom[i],lastChrom) != 0) {
410 wb->end = end[i-1];
411 flushBuffer(fp);
412 lastChrom = chrom[i];
413 tid = bwGetTid(fp, chrom[i]);
414 if(tid == (uint32_t) -1) return 10;
415 wb->tid = tid;
416 wb->start = start[i];
417 }
418 if(wb->l+12 > fp->hdr->bufSize) { //12 bytes/entry
419 wb->end = end[i-1];
420 flushBuffer(fp);
421 wb->start = start[i];
422 }
423 if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 11;
424 if(!memcpy(wb->p+wb->l+4, &(end[i]), sizeof(uint32_t))) return 12;
425 if(!memcpy(wb->p+wb->l+8, &(values[i]), sizeof(float))) return 13;
426 updateStats(fp, end[i]-start[i], values[i]);
427 wb->l += 12;
428 }
429 wb->end = end[i-1];
430
431 return 0;
432 }
433
bwAppendIntervals(bigWigFile_t * fp,uint32_t * start,uint32_t * end,float * values,uint32_t n)434 int bwAppendIntervals(bigWigFile_t *fp, uint32_t *start, uint32_t *end, float *values, uint32_t n) {
435 uint32_t i;
436 bwWriteBuffer_t *wb = fp->writeBuffer;
437 if(!n) return 0;
438 if(!fp->isWrite) return 1;
439 if(!wb) return 2;
440 if(wb->ltype != 1) return 3;
441
442 for(i=0; i<n; i++) {
443 if(wb->l+12 > fp->hdr->bufSize) {
444 if(i>0) { //otherwise it's already set
445 wb->end = end[i-1];
446 }
447 flushBuffer(fp);
448 wb->start = start[i];
449 }
450 if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 4;
451 if(!memcpy(wb->p+wb->l+4, &(end[i]), sizeof(uint32_t))) return 5;
452 if(!memcpy(wb->p+wb->l+8, &(values[i]), sizeof(float))) return 6;
453 updateStats(fp, end[i]-start[i], values[i]);
454 wb->l += 12;
455 }
456 wb->end = end[i-1];
457
458 return 0;
459 }
460
461 //8 bytes per entry
bwAddIntervalSpans(bigWigFile_t * fp,char * chrom,uint32_t * start,uint32_t span,float * values,uint32_t n)462 int bwAddIntervalSpans(bigWigFile_t *fp, char *chrom, uint32_t *start, uint32_t span, float *values, uint32_t n) {
463 uint32_t i, tid;
464 bwWriteBuffer_t *wb = fp->writeBuffer;
465 if(!n) return 0;
466 if(!fp->isWrite) return 1;
467 if(!wb) return 2;
468 if(wb->ltype != 2) if(flushBuffer(fp)) return 3;
469 if(flushBuffer(fp)) return 4;
470
471 tid = bwGetTid(fp, chrom);
472 if(tid == (uint32_t) -1) return 5;
473 wb->tid = tid;
474 wb->start = start[0];
475 wb->step = 0;
476 wb->span = span;
477 wb->ltype = 2;
478
479 for(i=0; i<n; i++) {
480 if(wb->l + 8 >= fp->hdr->bufSize) { //8 bytes/entry
481 if(i) wb->end = start[i-1]+span;
482 flushBuffer(fp);
483 wb->start = start[i];
484 }
485 if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 5;
486 if(!memcpy(wb->p+wb->l+4, &(values[i]), sizeof(float))) return 6;
487 updateStats(fp, span, values[i]);
488 wb->l += 8;
489 }
490 wb->end = start[n-1] + span;
491
492 return 0;
493 }
494
bwAppendIntervalSpans(bigWigFile_t * fp,uint32_t * start,float * values,uint32_t n)495 int bwAppendIntervalSpans(bigWigFile_t *fp, uint32_t *start, float *values, uint32_t n) {
496 uint32_t i;
497 bwWriteBuffer_t *wb = fp->writeBuffer;
498 if(!n) return 0;
499 if(!fp->isWrite) return 1;
500 if(!wb) return 2;
501 if(wb->ltype != 2) return 3;
502
503 for(i=0; i<n; i++) {
504 if(wb->l + 8 >= fp->hdr->bufSize) {
505 if(i) wb->end = start[i-1]+wb->span;
506 flushBuffer(fp);
507 wb->start = start[i];
508 }
509 if(!memcpy(wb->p+wb->l, &(start[i]), sizeof(uint32_t))) return 4;
510 if(!memcpy(wb->p+wb->l+4, &(values[i]), sizeof(float))) return 5;
511 updateStats(fp, wb->span, values[i]);
512 wb->l += 8;
513 }
514 wb->end = start[n-1] + wb->span;
515
516 return 0;
517 }
518
519 //4 bytes per entry
bwAddIntervalSpanSteps(bigWigFile_t * fp,char * chrom,uint32_t start,uint32_t span,uint32_t step,float * values,uint32_t n)520 int bwAddIntervalSpanSteps(bigWigFile_t *fp, char *chrom, uint32_t start, uint32_t span, uint32_t step, float *values, uint32_t n) {
521 uint32_t i, tid;
522 bwWriteBuffer_t *wb = fp->writeBuffer;
523 if(!n) return 0;
524 if(!fp->isWrite) return 1;
525 if(!wb) return 2;
526 if(wb->ltype != 3) flushBuffer(fp);
527 if(flushBuffer(fp)) return 3;
528
529 tid = bwGetTid(fp, chrom);
530 if(tid == (uint32_t) -1) return 4;
531 wb->tid = tid;
532 wb->start = start;
533 wb->step = step;
534 wb->span = span;
535 wb->ltype = 3;
536
537 for(i=0; i<n; i++) {
538 if(wb->l + 4 >= fp->hdr->bufSize) {
539 wb->end = wb->start + ((wb->l-24)>>2) * step;
540 flushBuffer(fp);
541 wb->start = wb->end;
542 }
543 if(!memcpy(wb->p+wb->l, &(values[i]), sizeof(float))) return 5;
544 updateStats(fp, wb->span, values[i]);
545 wb->l += 4;
546 }
547 wb->end = wb->start + (wb->l>>2) * step;
548
549 return 0;
550 }
551
bwAppendIntervalSpanSteps(bigWigFile_t * fp,float * values,uint32_t n)552 int bwAppendIntervalSpanSteps(bigWigFile_t *fp, float *values, uint32_t n) {
553 uint32_t i;
554 bwWriteBuffer_t *wb = fp->writeBuffer;
555 if(!n) return 0;
556 if(!fp->isWrite) return 1;
557 if(!wb) return 2;
558 if(wb->ltype != 3) return 3;
559
560 for(i=0; i<n; i++) {
561 if(wb->l + 4 >= fp->hdr->bufSize) {
562 wb->end = wb->start + ((wb->l-24)>>2) * wb->step;
563 flushBuffer(fp);
564 wb->start = wb->end;
565 }
566 if(!memcpy(wb->p+wb->l, &(values[i]), sizeof(float))) return 4;
567 updateStats(fp, wb->span, values[i]);
568 wb->l += 4;
569 }
570 wb->end = wb->start + (wb->l>>2) * wb->step;
571
572 return 0;
573 }
574
575 //0 on success
writeSummary(bigWigFile_t * fp)576 int writeSummary(bigWigFile_t *fp) {
577 if(writeAtPos(&(fp->hdr->nBasesCovered), sizeof(uint64_t), 1, fp->hdr->summaryOffset, fp->URL->x.fp)) return 1;
578 if(writeAtPos(&(fp->hdr->minVal), sizeof(double), 1, fp->hdr->summaryOffset+8, fp->URL->x.fp)) return 2;
579 if(writeAtPos(&(fp->hdr->maxVal), sizeof(double), 1, fp->hdr->summaryOffset+16, fp->URL->x.fp)) return 3;
580 if(writeAtPos(&(fp->hdr->sumData), sizeof(double), 1, fp->hdr->summaryOffset+24, fp->URL->x.fp)) return 4;
581 if(writeAtPos(&(fp->hdr->sumSquared), sizeof(double), 1, fp->hdr->summaryOffset+32, fp->URL->x.fp)) return 5;
582 return 0;
583 }
584
makeEmptyNode(uint32_t blockSize)585 static bwRTreeNode_t *makeEmptyNode(uint32_t blockSize) {
586 bwRTreeNode_t *n = calloc(1, sizeof(bwRTreeNode_t));
587 if(!n) return NULL;
588
589 n->chrIdxStart = malloc(blockSize*sizeof(uint32_t));
590 if(!n->chrIdxStart) goto error;
591 n->baseStart = malloc(blockSize*sizeof(uint32_t));
592 if(!n->baseStart) goto error;
593 n->chrIdxEnd = malloc(blockSize*sizeof(uint32_t));
594 if(!n->chrIdxEnd) goto error;
595 n->baseEnd = malloc(blockSize*sizeof(uint32_t));
596 if(!n->baseEnd) goto error;
597 n->dataOffset = calloc(blockSize,sizeof(uint64_t)); //This MUST be 0 for node writing!
598 if(!n->dataOffset) goto error;
599 n->x.child = malloc(blockSize*sizeof(uint64_t));
600 if(!n->x.child) goto error;
601
602 return n;
603
604 error:
605 if(n->chrIdxStart) free(n->chrIdxStart);
606 if(n->baseStart) free(n->baseStart);
607 if(n->chrIdxEnd) free(n->chrIdxEnd);
608 if(n->baseEnd) free(n->baseEnd);
609 if(n->dataOffset) free(n->dataOffset);
610 if(n->x.child) free(n->x.child);
611 free(n);
612 return NULL;
613 }
614
615 //Returns 0 on success. This doesn't attempt to clean up!
addLeaves(bwLL ** ll,uint64_t * sz,uint64_t toProcess,uint32_t blockSize)616 static bwRTreeNode_t *addLeaves(bwLL **ll, uint64_t *sz, uint64_t toProcess, uint32_t blockSize) {
617 uint32_t i;
618 uint64_t foo;
619 bwRTreeNode_t *n = makeEmptyNode(blockSize);
620 if(!n) return NULL;
621
622 if(toProcess <= blockSize) {
623 for(i=0; i<toProcess; i++) {
624 n->chrIdxStart[i] = (*ll)->node->chrIdxStart[0];
625 n->baseStart[i] = (*ll)->node->baseStart[0];
626 n->chrIdxEnd[i] = (*ll)->node->chrIdxEnd[(*ll)->node->nChildren-1];
627 n->baseEnd[i] = (*ll)->node->baseEnd[(*ll)->node->nChildren-1];
628 n->x.child[i] = (*ll)->node;
629 *sz += 4 + 32*(*ll)->node->nChildren;
630 *ll = (*ll)->next;
631 n->nChildren++;
632 }
633 } else {
634 for(i=0; i<blockSize; i++) {
635 foo = ceil(((double) toProcess)/((double) blockSize-i));
636 if(!ll) break;
637 n->x.child[i] = addLeaves(ll, sz, foo, blockSize);
638 if(!n->x.child[i]) goto error;
639 n->chrIdxStart[i] = n->x.child[i]->chrIdxStart[0];
640 n->baseStart[i] = n->x.child[i]->baseStart[0];
641 n->chrIdxEnd[i] = n->x.child[i]->chrIdxEnd[n->x.child[i]->nChildren-1];
642 n->baseEnd[i] = n->x.child[i]->baseEnd[n->x.child[i]->nChildren-1];
643 n->nChildren++;
644 toProcess -= foo;
645 }
646 }
647
648 *sz += 4 + 24*n->nChildren;
649 return n;
650
651 error:
652 bwDestroyIndexNode(n);
653 return NULL;
654 }
655
656 //Returns 1 on error
writeIndexTreeNode(FILE * fp,bwRTreeNode_t * n,uint8_t * wrote,int level)657 int writeIndexTreeNode(FILE *fp, bwRTreeNode_t *n, uint8_t *wrote, int level) {
658 uint8_t one = 0;
659 uint32_t i, j, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0
660
661 if(n->isLeaf) return 0;
662
663 for(i=0; i<n->nChildren; i++) {
664 if(n->dataOffset[i]) { //traverse into child
665 if(n->isLeaf) return 0; //Only write leaves once!
666 if(writeIndexTreeNode(fp, n->x.child[i], wrote, level+1)) return 1;
667 } else {
668 n->dataOffset[i] = ftell(fp);
669 if(fwrite(&(n->x.child[i]->isLeaf), sizeof(uint8_t), 1, fp) != 1) return 1;
670 if(fwrite(&one, sizeof(uint8_t), 1, fp) != 1) return 1; //one byte of padding
671 if(fwrite(&(n->x.child[i]->nChildren), sizeof(uint16_t), 1, fp) != 1) return 1;
672 for(j=0; j<n->x.child[i]->nChildren; j++) {
673 vector[0] = n->x.child[i]->chrIdxStart[j];
674 vector[1] = n->x.child[i]->baseStart[j];
675 vector[2] = n->x.child[i]->chrIdxEnd[j];
676 vector[3] = n->x.child[i]->baseEnd[j];
677 if(n->x.child[i]->isLeaf) {
678 //Include the offset and size
679 if(fwrite(vector, sizeof(uint32_t), 4, fp) != 4) return 1;
680 if(fwrite(&(n->x.child[i]->dataOffset[j]), sizeof(uint64_t), 1, fp) != 1) return 1;
681 if(fwrite(&(n->x.child[i]->x.size[j]), sizeof(uint64_t), 1, fp) != 1) return 1;
682 } else {
683 if(fwrite(vector, sizeof(uint32_t), 6, fp) != 6) return 1;
684 }
685 }
686 *wrote = 1;
687 }
688 }
689
690 return 0;
691 }
692
693 //returns 1 on success
writeIndexOffsets(FILE * fp,bwRTreeNode_t * n,uint64_t offset)694 int writeIndexOffsets(FILE *fp, bwRTreeNode_t *n, uint64_t offset) {
695 uint32_t i;
696
697 if(n->isLeaf) return 0;
698 for(i=0; i<n->nChildren; i++) {
699 if(writeIndexOffsets(fp, n->x.child[i], n->dataOffset[i])) return 1;
700 if(writeAtPos(&(n->dataOffset[i]), sizeof(uint64_t), 1, offset+20+24*i, fp)) return 2;
701 }
702 return 0;
703 }
704
705 //Returns 0 on success
writeIndexTree(bigWigFile_t * fp)706 int writeIndexTree(bigWigFile_t *fp) {
707 uint64_t offset;
708 uint8_t wrote = 0;
709 int rv;
710
711 while((rv = writeIndexTreeNode(fp->URL->x.fp, fp->idx->root, &wrote, 0)) == 0) {
712 if(!wrote) break;
713 wrote = 0;
714 }
715
716 if(rv || wrote) return 1;
717
718 //Save the file position
719 offset = bwTell(fp);
720
721 //Write the offsets
722 if(writeIndexOffsets(fp->URL->x.fp, fp->idx->root, fp->idx->rootOffset)) return 2;
723
724 //Move the file pointer back to the end
725 bwSetPos(fp, offset);
726
727 return 0;
728 }
729
730 //Returns 0 on success. The original state SHOULD be preserved on error
writeIndex(bigWigFile_t * fp)731 int writeIndex(bigWigFile_t *fp) {
732 uint32_t four = IDX_MAGIC;
733 uint64_t idxSize = 0, foo;
734 uint8_t one = 0;
735 uint32_t i, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0
736 bwLL *ll = fp->writeBuffer->firstIndexNode, *p;
737 bwRTreeNode_t *root = NULL;
738
739 if(!fp->writeBuffer->nBlocks) return 0;
740 fp->idx = malloc(sizeof(bwRTree_t));
741 if(!fp->idx) return 2;
742 fp->idx->root = root;
743
744 //Update the file header to indicate the proper index position
745 foo = bwTell(fp);
746 if(writeAtPos(&foo, sizeof(uint64_t), 1, 0x18, fp->URL->x.fp)) return 3;
747
748 //Make the tree
749 if(ll == fp->writeBuffer->currentIndexNode) {
750 root = ll->node;
751 idxSize = 4 + 24*root->nChildren;
752 } else {
753 root = addLeaves(&ll, &idxSize, ceil(((double)fp->writeBuffer->nBlocks)/fp->writeBuffer->blockSize), fp->writeBuffer->blockSize);
754 }
755 if(!root) return 4;
756 fp->idx->root = root;
757
758 ll = fp->writeBuffer->firstIndexNode;
759 while(ll) {
760 p = ll->next;
761 free(ll);
762 ll=p;
763 }
764
765 //write the header
766 if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 5;
767 if(fwrite(&(fp->writeBuffer->blockSize), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 6;
768 if(fwrite(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 7;
769 if(fwrite(&(root->chrIdxStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8;
770 if(fwrite(&(root->baseStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9;
771 if(fwrite(&(root->chrIdxEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 10;
772 if(fwrite(&(root->baseEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 11;
773 if(fwrite(&idxSize, sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 12;
774 four = 1;
775 if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 13;
776 four = 0;
777 if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 14; //padding
778 fp->idx->rootOffset = bwTell(fp);
779
780 //Write the root node, since writeIndexTree writes the children and fills in the offset
781 if(fwrite(&(root->isLeaf), sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 16;
782 if(fwrite(&one, sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 17; //one byte of padding
783 if(fwrite(&(root->nChildren), sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 18;
784 for(i=0; i<root->nChildren; i++) {
785 vector[0] = root->chrIdxStart[i];
786 vector[1] = root->baseStart[i];
787 vector[2] = root->chrIdxEnd[i];
788 vector[3] = root->baseEnd[i];
789 if(root->isLeaf) {
790 //Include the offset and size
791 if(fwrite(vector, sizeof(uint32_t), 4, fp->URL->x.fp) != 4) return 19;
792 if(fwrite(&(root->dataOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 20;
793 if(fwrite(&(root->x.size[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 21;
794 } else {
795 root->dataOffset[i] = 0; //FIXME: Something upstream is setting this to impossible values (e.g., 0x21?!?!?)
796 if(fwrite(vector, sizeof(uint32_t), 6, fp->URL->x.fp) != 6) return 22;
797 }
798 }
799
800 //Write each level
801 if(writeIndexTree(fp)) return 23;
802
803 return 0;
804 }
805
806 //The first zoom level has a resolution of 4x mean entry size
807 //This may or may not produce the requested number of zoom levels
makeZoomLevels(bigWigFile_t * fp)808 int makeZoomLevels(bigWigFile_t *fp) {
809 uint32_t meanBinSize, i;
810 uint32_t multiplier = 4, zoom = 10, maxZoom = 0;
811 uint16_t nLevels = 0;
812
813 meanBinSize = ((double) fp->writeBuffer->runningWidthSum)/(fp->writeBuffer->nEntries);
814 //In reality, one level is skipped
815 meanBinSize *= 4;
816 //N.B., we must ALWAYS check that the zoom doesn't overflow a uint32_t!
817 if(((uint32_t)-1)>>2 < meanBinSize) return 0; //No zoom levels!
818 if(meanBinSize*4 > zoom) zoom = multiplier*meanBinSize;
819
820 fp->hdr->zoomHdrs = calloc(1, sizeof(bwZoomHdr_t));
821 if(!fp->hdr->zoomHdrs) return 1;
822 fp->hdr->zoomHdrs->level = malloc(fp->hdr->nLevels * sizeof(uint32_t));
823 fp->hdr->zoomHdrs->dataOffset = calloc(fp->hdr->nLevels, sizeof(uint64_t));
824 fp->hdr->zoomHdrs->indexOffset = calloc(fp->hdr->nLevels, sizeof(uint64_t));
825 fp->hdr->zoomHdrs->idx = calloc(fp->hdr->nLevels, sizeof(bwRTree_t*));
826 if(!fp->hdr->zoomHdrs->level) return 2;
827 if(!fp->hdr->zoomHdrs->dataOffset) return 3;
828 if(!fp->hdr->zoomHdrs->indexOffset) return 4;
829 if(!fp->hdr->zoomHdrs->idx) return 5;
830
831 //There's no point in having a zoom level larger than the largest chromosome
832 //This will none the less allow at least one zoom level, which is generally needed for IGV et al.
833 for(i=0; i<fp->cl->nKeys; i++) {
834 if(fp->cl->len[i] > maxZoom) maxZoom = fp->cl->len[i];
835 }
836 if(zoom > maxZoom) zoom = maxZoom;
837
838 for(i=0; i<fp->hdr->nLevels; i++) {
839 if(zoom > maxZoom) break; //prevent absurdly large zoom levels
840 fp->hdr->zoomHdrs->level[i] = zoom;
841 nLevels++;
842 if(((uint32_t)-1)/multiplier < zoom) break;
843 zoom *= multiplier;
844 }
845 fp->hdr->nLevels = nLevels;
846
847 fp->writeBuffer->firstZoomBuffer = calloc(nLevels,sizeof(bwZoomBuffer_t*));
848 if(!fp->writeBuffer->firstZoomBuffer) goto error;
849 fp->writeBuffer->lastZoomBuffer = calloc(nLevels,sizeof(bwZoomBuffer_t*));
850 if(!fp->writeBuffer->lastZoomBuffer) goto error;
851 fp->writeBuffer->nNodes = calloc(nLevels, sizeof(uint64_t));
852
853 for(i=0; i<fp->hdr->nLevels; i++) {
854 fp->writeBuffer->firstZoomBuffer[i] = calloc(1, sizeof(bwZoomBuffer_t));
855 if(!fp->writeBuffer->firstZoomBuffer[i]) goto error;
856 fp->writeBuffer->firstZoomBuffer[i]->p = calloc(fp->hdr->bufSize/32, 32);
857 if(!fp->writeBuffer->firstZoomBuffer[i]->p) goto error;
858 fp->writeBuffer->firstZoomBuffer[i]->m = fp->hdr->bufSize;
859 ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[0] = 0;
860 ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[1] = 0;
861 ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[2] = fp->hdr->zoomHdrs->level[i];
862 if(fp->hdr->zoomHdrs->level[i] > fp->cl->len[0]) ((uint32_t*)fp->writeBuffer->firstZoomBuffer[i]->p)[2] = fp->cl->len[0];
863 fp->writeBuffer->lastZoomBuffer[i] = fp->writeBuffer->firstZoomBuffer[i];
864 }
865
866 return 0;
867
868 error:
869 if(fp->writeBuffer->firstZoomBuffer) {
870 for(i=0; i<fp->hdr->nLevels; i++) {
871 if(fp->writeBuffer->firstZoomBuffer[i]) {
872 if(fp->writeBuffer->firstZoomBuffer[i]->p) free(fp->writeBuffer->firstZoomBuffer[i]->p);
873 free(fp->writeBuffer->firstZoomBuffer[i]);
874 }
875 }
876 free(fp->writeBuffer->firstZoomBuffer);
877 }
878 if(fp->writeBuffer->lastZoomBuffer) free(fp->writeBuffer->lastZoomBuffer);
879 if(fp->writeBuffer->nNodes) free(fp->writeBuffer->lastZoomBuffer);
880 return 6;
881 }
882
883 //Given an interval start, calculate the next one at a zoom level
nextPos(bigWigFile_t * fp,uint32_t size,uint32_t * pos,uint32_t desiredTid)884 void nextPos(bigWigFile_t *fp, uint32_t size, uint32_t *pos, uint32_t desiredTid) {
885 uint32_t *tid = pos;
886 uint32_t *start = pos+1;
887 uint32_t *end = pos+2;
888 *start += size;
889 if(*start >= fp->cl->len[*tid]) {
890 (*start) = 0;
891 (*tid)++;
892 }
893
894 //prevent needless iteration when changing chromosomes
895 if(*tid < desiredTid) {
896 *tid = desiredTid;
897 *start = 0;
898 }
899
900 (*end) = *start+size;
901 if(*end > fp->cl->len[*tid]) (*end) = fp->cl->len[*tid];
902 }
903
904 //Return the number of bases two intervals overlap
overlapsInterval(uint32_t tid0,uint32_t start0,uint32_t end0,uint32_t tid1,uint32_t start1,uint32_t end1)905 uint32_t overlapsInterval(uint32_t tid0, uint32_t start0, uint32_t end0, uint32_t tid1, uint32_t start1, uint32_t end1) {
906 if(tid0 != tid1) return 0;
907 if(end0 <= start1) return 0;
908 if(end1 <= start0) return 0;
909 if(end0 <= end1) {
910 if(start1 > start0) return end0-start1;
911 return end0-start0;
912 } else {
913 if(start1 > start0) return end1-start1;
914 return end1-start0;
915 }
916 }
917
918 //Returns the number of bases of the interval written
updateInterval(bigWigFile_t * fp,bwZoomBuffer_t * buffer,double * sum,double * sumsq,uint32_t size,uint32_t tid,uint32_t start,uint32_t end,float value)919 uint32_t updateInterval(bigWigFile_t *fp, bwZoomBuffer_t *buffer, double *sum, double *sumsq, uint32_t size, uint32_t tid, uint32_t start, uint32_t end, float value) {
920 uint32_t *p2 = (uint32_t*) buffer->p;
921 float *fp2 = (float*) p2;
922 uint32_t rv = 0, offset = 0;
923 if(!buffer) return 0;
924 if(buffer->l+32 >= buffer->m) return 0;
925
926 //Make sure that we don't overflow a uint32_t by adding some huge value to start
927 if(start + size < start) size = ((uint32_t) -1) - start;
928
929 if(buffer->l) {
930 offset = buffer->l/32;
931 } else {
932 p2[0] = tid;
933 p2[1] = start;
934 if(start+size < end) p2[2] = start+size;
935 else p2[2] = end;
936 }
937
938 //Do we have any overlap with the previously added interval?
939 if(offset) {
940 rv = overlapsInterval(p2[8*(offset-1)], p2[8*(offset-1)+1], p2[8*(offset-1)+1] + size, tid, start, end);
941 if(rv) {
942 p2[8*(offset-1)+2] = start + rv;
943 p2[8*(offset-1)+3] += rv;
944 if(fp2[8*(offset-1)+4] > value) fp2[8*(offset-1)+4] = value;
945 if(fp2[8*(offset-1)+5] < value) fp2[8*(offset-1)+5] = value;
946 *sum += rv*value;
947 *sumsq += rv*pow(value, 2.0);
948 return rv;
949 } else {
950 fp2[8*(offset-1)+6] = *sum;
951 fp2[8*(offset-1)+7] = *sumsq;
952 *sum = 0.0;
953 *sumsq = 0.0;
954 }
955 }
956
957 //If we move to a new interval then skip iterating over a bunch of obviously non-overlapping intervals
958 if(offset && p2[8*offset+2] == 0) {
959 p2[8*offset] = tid;
960 p2[8*offset+1] = start;
961 if(start+size < end) p2[8*offset+2] = start+size;
962 else p2[8*offset+2] = end;
963 //nextPos(fp, size, p2+8*offset, tid); //We can actually skip uncovered intervals
964 }
965
966 //Add a new entry
967 while(!(rv = overlapsInterval(p2[8*offset], p2[8*offset+1], p2[8*offset+1] + size, tid, start, end))) {
968 p2[8*offset] = tid;
969 p2[8*offset+1] = start;
970 if(start+size < end) p2[8*offset+2] = start+size;
971 else p2[8*offset+2] = end;
972 //nextPos(fp, size, p2+8*offset, tid);
973 }
974 p2[8*offset+3] = rv;
975 fp2[8*offset+4] = value; //min
976 fp2[8*offset+5] = value; //max
977 *sum += rv * value;
978 *sumsq += rv * pow(value,2.0);
979 buffer->l += 32;
980 return rv;
981 }
982
983 //Returns 0 on success
addIntervalValue(bigWigFile_t * fp,uint64_t * nEntries,double * sum,double * sumsq,bwZoomBuffer_t * buffer,uint32_t itemsPerSlot,uint32_t zoom,uint32_t tid,uint32_t start,uint32_t end,float value)984 int addIntervalValue(bigWigFile_t *fp, uint64_t *nEntries, double *sum, double *sumsq, bwZoomBuffer_t *buffer, uint32_t itemsPerSlot, uint32_t zoom, uint32_t tid, uint32_t start, uint32_t end, float value) {
985 bwZoomBuffer_t *newBuffer = NULL;
986 uint32_t rv;
987
988 while(start < end) {
989 rv = updateInterval(fp, buffer, sum, sumsq, zoom, tid, start, end, value);
990 if(!rv) {
991 //Allocate a new buffer
992 newBuffer = calloc(1, sizeof(bwZoomBuffer_t));
993 if(!newBuffer) return 1;
994 newBuffer->p = calloc(itemsPerSlot, 32);
995 if(!newBuffer->p) goto error;
996 newBuffer->m = itemsPerSlot*32;
997 memcpy(newBuffer->p, buffer->p+buffer->l-32, 4);
998 memcpy(newBuffer->p+4, buffer->p+buffer->l-28, 4);
999 ((uint32_t*) newBuffer->p)[2] = ((uint32_t*) newBuffer->p)[1] + zoom;
1000 *sum = *sumsq = 0.0;
1001 rv = updateInterval(fp, newBuffer, sum, sumsq, zoom, tid, start, end, value);
1002 if(!rv) goto error;
1003 buffer->next = newBuffer;
1004 buffer = buffer->next;
1005 *nEntries += 1;
1006 }
1007 start += rv;
1008 }
1009
1010 return 0;
1011
1012 error:
1013 if(newBuffer) {
1014 if(newBuffer->m) free(newBuffer->p);
1015 free(newBuffer);
1016 }
1017 return 2;
1018 }
1019
1020 //Get all of the intervals and add them to the appropriate zoomBuffer
constructZoomLevels(bigWigFile_t * fp)1021 int constructZoomLevels(bigWigFile_t *fp) {
1022 bwOverlapIterator_t *it = NULL;
1023 double *sum = NULL, *sumsq = NULL;
1024 uint32_t i, j, k;
1025
1026 sum = calloc(fp->hdr->nLevels, sizeof(double));
1027 sumsq = calloc(fp->hdr->nLevels, sizeof(double));
1028 if(!sum || !sumsq) goto error;
1029
1030 for(i=0; i<fp->cl->nKeys; i++) {
1031 it = bwOverlappingIntervalsIterator(fp, fp->cl->chrom[i], 0, fp->cl->len[i], 100000);
1032 if(!it) goto error;
1033 while(it->data != NULL){
1034 for(j=0;j<it->intervals->l;j++){
1035 for(k=0;k<fp->hdr->nLevels;k++){
1036 if(addIntervalValue(fp, &(fp->writeBuffer->nNodes[k]), sum+k, sumsq+k, fp->writeBuffer->lastZoomBuffer[k], fp->hdr->bufSize/32, fp->hdr->zoomHdrs->level[k], i, it->intervals->start[j], it->intervals->end[j], it->intervals->value[j])) goto error;
1037 while(fp->writeBuffer->lastZoomBuffer[k]->next) fp->writeBuffer->lastZoomBuffer[k] = fp->writeBuffer->lastZoomBuffer[k]->next;
1038 }
1039 }
1040 it = bwIteratorNext(it);
1041 }
1042 bwIteratorDestroy(it);
1043
1044 }
1045
1046 //Make an index for each zoom level
1047 for(i=0; i<fp->hdr->nLevels; i++) {
1048 fp->hdr->zoomHdrs->idx[i] = calloc(1, sizeof(bwRTree_t));
1049 if(!fp->hdr->zoomHdrs->idx[i]) return 1;
1050 fp->hdr->zoomHdrs->idx[i]->blockSize = fp->writeBuffer->blockSize;
1051 }
1052
1053
1054 free(sum);
1055 free(sumsq);
1056
1057 return 0;
1058
1059 error:
1060 if(it) bwIteratorDestroy(it);
1061 if(sum) free(sum);
1062 if(sumsq) free(sumsq);
1063 return 1;
1064 }
1065
writeZoomLevels(bigWigFile_t * fp)1066 int writeZoomLevels(bigWigFile_t *fp) {
1067 uint64_t offset1, offset2, idxSize = 0;
1068 uint32_t i, j, four = 0, last, vector[6] = {0, 0, 0, 0, 0, 0}; //The last 8 bytes are left as 0;
1069 uint8_t wrote, one = 0;
1070 uint16_t actualNLevels = 0;
1071 int rv;
1072 bwLL *ll, *p;
1073 bwRTreeNode_t *root;
1074 bwZoomBuffer_t *zb, *zb2;
1075 bwWriteBuffer_t *wb = fp->writeBuffer;
1076 uLongf sz;
1077
1078 for(i=0; i<fp->hdr->nLevels; i++) {
1079 if(i) {
1080 //Is this a duplicate level?
1081 if(fp->writeBuffer->nNodes[i] == fp->writeBuffer->nNodes[i-1]) break;
1082 }
1083 actualNLevels++;
1084
1085 //reserve a uint32_t for the number of blocks
1086 fp->hdr->zoomHdrs->dataOffset[i] = bwTell(fp);
1087 fp->writeBuffer->nBlocks = 0;
1088 fp->writeBuffer->l = 24;
1089 if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 1;
1090 zb = fp->writeBuffer->firstZoomBuffer[i];
1091 fp->writeBuffer->firstIndexNode = NULL;
1092 fp->writeBuffer->currentIndexNode = NULL;
1093 while(zb) {
1094 sz = fp->hdr->bufSize;
1095 if(compress(wb->compressP, &sz, zb->p, zb->l) != Z_OK) return 2;
1096
1097 //write the data to disk
1098 if(fwrite(wb->compressP, sizeof(uint8_t), sz, fp->URL->x.fp) != sz) return 3;
1099
1100 //Add an entry into the index
1101 last = (zb->l - 32)>>2;
1102 if(addIndexEntry(fp, ((uint32_t*)zb->p)[0], ((uint32_t*)zb->p)[last], ((uint32_t*)zb->p)[1], ((uint32_t*)zb->p)[last+2], bwTell(fp)-sz, sz)) return 4;
1103
1104 wb->nBlocks++;
1105 wb->l = 24;
1106 zb = zb->next;
1107 }
1108 if(writeAtPos(&(wb->nBlocks), sizeof(uint32_t), 1, fp->hdr->zoomHdrs->dataOffset[i], fp->URL->x.fp)) return 5;
1109
1110 //Make the tree
1111 ll = fp->writeBuffer->firstIndexNode;
1112 if(ll == fp->writeBuffer->currentIndexNode) {
1113 root = ll->node;
1114 idxSize = 4 + 24*root->nChildren;
1115 } else {
1116 root = addLeaves(&ll, &idxSize, ceil(((double)fp->writeBuffer->nBlocks)/fp->writeBuffer->blockSize), fp->writeBuffer->blockSize);
1117 }
1118 if(!root) return 4;
1119 fp->hdr->zoomHdrs->idx[i]->root = root;
1120
1121 ll = fp->writeBuffer->firstIndexNode;
1122 while(ll) {
1123 p = ll->next;
1124 free(ll);
1125 ll=p;
1126 }
1127
1128
1129 //write the index
1130 wrote = 0;
1131 fp->hdr->zoomHdrs->indexOffset[i] = bwTell(fp);
1132 four = IDX_MAGIC;
1133 if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 1;
1134 root = fp->hdr->zoomHdrs->idx[i]->root;
1135 if(fwrite(&(fp->writeBuffer->blockSize), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 6;
1136 if(fwrite(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 7;
1137 if(fwrite(&(root->chrIdxStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8;
1138 if(fwrite(&(root->baseStart[0]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9;
1139 if(fwrite(&(root->chrIdxEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 10;
1140 if(fwrite(&(root->baseEnd[root->nChildren-1]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 11;
1141 if(fwrite(&idxSize, sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 12;
1142 four = fp->hdr->bufSize/32;
1143 if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 13;
1144 four = 0;
1145 if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 14; //padding
1146 fp->hdr->zoomHdrs->idx[i]->rootOffset = bwTell(fp);
1147
1148 //Write the root node, since writeIndexTree writes the children and fills in the offset
1149 offset1 = bwTell(fp);
1150 if(fwrite(&(root->isLeaf), sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 16;
1151 if(fwrite(&one, sizeof(uint8_t), 1, fp->URL->x.fp) != 1) return 17; //one byte of padding
1152 if(fwrite(&(root->nChildren), sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 18;
1153 for(j=0; j<root->nChildren; j++) {
1154 vector[0] = root->chrIdxStart[j];
1155 vector[1] = root->baseStart[j];
1156 vector[2] = root->chrIdxEnd[j];
1157 vector[3] = root->baseEnd[j];
1158 if(root->isLeaf) {
1159 //Include the offset and size
1160 if(fwrite(vector, sizeof(uint32_t), 4, fp->URL->x.fp) != 4) return 19;
1161 if(fwrite(&(root->dataOffset[j]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 20;
1162 if(fwrite(&(root->x.size[j]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 21;
1163 } else {
1164 if(fwrite(vector, sizeof(uint32_t), 6, fp->URL->x.fp) != 6) return 22;
1165 }
1166 }
1167
1168 while((rv = writeIndexTreeNode(fp->URL->x.fp, fp->hdr->zoomHdrs->idx[i]->root, &wrote, 0)) == 0) {
1169 if(!wrote) break;
1170 wrote = 0;
1171 }
1172
1173 if(rv || wrote) return 6;
1174
1175 //Save the file position
1176 offset2 = bwTell(fp);
1177
1178 //Write the offsets
1179 if(writeIndexOffsets(fp->URL->x.fp, root, offset1)) return 2;
1180
1181 //Move the file pointer back to the end
1182 bwSetPos(fp, offset2);
1183
1184
1185 //Free the linked list
1186 zb = fp->writeBuffer->firstZoomBuffer[i];
1187 while(zb) {
1188 if(zb->p) free(zb->p);
1189 zb2 = zb->next;
1190 free(zb);
1191 zb = zb2;
1192 }
1193 fp->writeBuffer->firstZoomBuffer[i] = NULL;
1194 }
1195
1196 //Free unused zoom levels
1197 for(i=actualNLevels; i<fp->hdr->nLevels; i++) {
1198 zb = fp->writeBuffer->firstZoomBuffer[i];
1199 while(zb) {
1200 if(zb->p) free(zb->p);
1201 zb2 = zb->next;
1202 free(zb);
1203 zb = zb2;
1204 }
1205 fp->writeBuffer->firstZoomBuffer[i] = NULL;
1206 }
1207
1208 //Write the zoom headers to disk
1209 offset1 = bwTell(fp);
1210 if(bwSetPos(fp, 0x40)) return 7;
1211 four = 0;
1212 for(i=0; i<actualNLevels; i++) {
1213 if(fwrite(&(fp->hdr->zoomHdrs->level[i]), sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 8;
1214 if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9;
1215 if(fwrite(&(fp->hdr->zoomHdrs->dataOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 10;
1216 if(fwrite(&(fp->hdr->zoomHdrs->indexOffset[i]), sizeof(uint64_t), 1, fp->URL->x.fp) != 1) return 11;
1217 }
1218
1219 //Write the number of levels if needed
1220 if(bwSetPos(fp, 0x6)) return 12;
1221 if(fwrite(&actualNLevels, sizeof(uint16_t), 1, fp->URL->x.fp) != 1) return 13;
1222
1223 if(bwSetPos(fp, offset1)) return 14;
1224
1225 return 0;
1226 }
1227
1228 //0 on success
bwFinalize(bigWigFile_t * fp)1229 int bwFinalize(bigWigFile_t *fp) {
1230 uint32_t four;
1231 uint64_t offset;
1232 if(!fp->isWrite) return 0;
1233
1234 //Flush the buffer
1235 if(flushBuffer(fp)) return 1; //Valgrind reports a problem here!
1236
1237 //Update the data section with the number of blocks written
1238 if(fp->hdr) {
1239 if(writeAtPos(&(fp->writeBuffer->nBlocks), sizeof(uint64_t), 1, fp->hdr->dataOffset, fp->URL->x.fp)) return 2;
1240 } else {
1241 //The header wasn't written!
1242 return 1;
1243 }
1244
1245 //write the bufferSize
1246 if(fp->hdr->bufSize) {
1247 if(writeAtPos(&(fp->hdr->bufSize), sizeof(uint32_t), 1, 0x34, fp->URL->x.fp)) return 2;
1248 }
1249
1250 //write the summary information
1251 if(writeSummary(fp)) return 3;
1252
1253 //Convert the linked-list to a tree and write to disk
1254 if(writeIndex(fp)) return 4;
1255
1256 //Zoom level stuff here?
1257 if(fp->hdr->nLevels && fp->writeBuffer->nBlocks) {
1258 offset = bwTell(fp);
1259 if(makeZoomLevels(fp)) return 5;
1260 if(constructZoomLevels(fp)) return 6;
1261 bwSetPos(fp, offset);
1262 if(writeZoomLevels(fp)) return 7; //This write nLevels as well
1263 }
1264
1265 //write magic at the end of the file
1266 four = BIGWIG_MAGIC;
1267 if(fwrite(&four, sizeof(uint32_t), 1, fp->URL->x.fp) != 1) return 9;
1268
1269 return 0;
1270 }
1271
1272 /*
1273 data chunk:
1274 uint64_t number of blocks (2 / 110851)
1275 some blocks
1276
1277 an uncompressed data block (24 byte header)
1278 uint32_t Tid 0-4
1279 uint32_t start 4-8
1280 uint32_t end 8-12
1281 uint32_t step 12-16
1282 uint32_t span 16-20
1283 uint8_t type 20
1284 uint8_t padding
1285 uint16_t nItems 22
1286 nItems of:
1287 type 1: //12 bytes
1288 uint32_t start
1289 uint32_t end
1290 float value
1291 type 2: //8 bytes
1292 uint32_t start
1293 float value
1294 type 3: //4 bytes
1295 float value
1296
1297 data block index header
1298 uint32_t magic
1299 uint32_t blockSize (256 in the example) maximum number of children
1300 uint64_t number of blocks (2 / 110851)
1301 uint32_t startTid
1302 uint32_t startPos
1303 uint32_t endTid
1304 uint32_t endPos
1305 uint64_t index size? (0x1E7 / 0x1AF0401F) index address?
1306 uint32_t itemsPerBlock (1 / 1) 1024 for zoom headers 1024 for zoom headers
1307 uint32_t padding
1308
1309 data block index node non-leaf (4 bytes + 24*nChildren)
1310 uint8_t isLeaf
1311 uint8_t padding
1312 uint16_t nChildren (2, 256)
1313 uint32_t startTid
1314 uint32_t startPos
1315 uint32_t endTid
1316 uint32_t endPos
1317 uint64_t dataOffset (0x1AF05853, 0x1AF07057)
1318
1319 data block index node leaf (4 bytes + 32*nChildren)
1320 uint8_t isLeaf
1321 uint8_t padding
1322 uint16_t nChildren (2)
1323 uint32_t startTid
1324 uint32_t startPos
1325 uint32_t endTid
1326 uint32_t endPos
1327 uint64_t dataOffset (0x198, 0x1CF)
1328 uint64_t dataSize (55, 24)
1329
1330 zoom data block
1331 uint32_t number of blocks (10519766)
1332 some data blocks
1333 */
1334