1 static char const rcsid[] = "$Id: ncbisort.c,v 6.9 2012/08/22 23:02:39 kans Exp $";
2
3 /* $Id: ncbisort.c,v 6.9 2012/08/22 23:02:39 kans Exp $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: $RCSfile: ncbisort.c,v $
29 *
30 * Author: Sergei Shavirin
31 *
32 * Initial Version Creation Date: 03/24/1997
33 *
34 * $Revision: 6.9 $
35 *
36 * File Description:
37 * Main file for SORTing library
38 *
39 * $Log: ncbisort.c,v $
40 * Revision 6.9 2012/08/22 23:02:39 kans
41 * null dereference checks and initializing variables due to clang static analysis
42 *
43 * Revision 6.8 2006/05/10 20:47:17 camacho
44 * From Ilya Dondoshansky: SORTFiles: added output parameter for total number of lines processed
45 *
46 * Revision 6.7 2003/07/15 20:17:39 coulouri
47 * remove signal() and setrlimit() calls
48 *
49 * Revision 6.6 2003/05/30 17:25:37 coulouri
50 * add rcsid
51 *
52 * Revision 6.5 2003/02/26 17:48:09 kimelman
53 * bugfix: format/argument mismatch in error message
54 *
55 * Revision 6.4 1998/06/01 14:52:37 madden
56 * Change to using TmpNam
57 *
58 * Revision 6.3 1998/04/09 20:40:45 kans
59 * use TmpNam for Mac sorting
60 *
61 * Revision 6.2 1998/04/09 18:54:23 kans
62 * on mac, \r must be converted to \n in buffer
63 *
64 * Revision 6.1 1997/12/02 18:57:38 shavirin
65 * Removed typecast warnings insprintf() and sscanf()
66 *
67 * Revision 6.0 1997/08/25 18:53:34 madden
68 * Revision changed to 6.0
69 *
70 * Revision 1.4 1997/05/05 14:36:57 shavirin
71 * Fixed tmp-prefix handling for non-UNIX platform
72 *
73 * Revision 1.3 1997/05/01 20:16:21 shavirin
74 * Added typecast to remove few warnings
75 *
76 * Revision 1.2 1997/05/01 17:22:31 shavirin
77 * Added function SORTObjectFree()
78 *
79 * Revision 1.1 1997/03/13 21:52:15 shavirin
80 * Initial revision
81 *
82 *
83 * ==========================================================================
84 */
85
86 #include <ncbisrti.h>
87
SORTObjectNew(CharPtr prefix,Uchar tab,Int4 linelength,Boolean reverse,Boolean unique)88 SORTObjectPtr SORTObjectNew(CharPtr prefix, Uchar tab, Int4 linelength,
89 Boolean reverse, Boolean unique)
90 {
91 SORTDataPtr sdp;
92 Int4 i;
93
94 sdp = (SORTDataPtr) MemNew(sizeof(SORTData));
95 sdp->temphead = (SORTempNodePtr) MemNew(sizeof(SORTempNode));
96 global_temphead = sdp->temphead; /* Used for cleanup by signal */
97 sdp->keyhead = (SORTKeyFieldPtr)MemNew(sizeof(SORTKeyField));
98
99 if(!tables_initialized) {
100 for (i = 0; i < UCHAR_LIM; ++i) {
101 if (ISBLANK(i))
102 blanks[i] = 1;
103 if (IS_DIGIT(i))
104 digits[i] = 1;
105 if (IS_UPPER(i))
106 fold_tolower[i] = tolower(i);
107 else
108 fold_tolower[i] = (Uint1) i;
109 }
110 }
111
112 sdp->sortalloc = SORTALLOC;
113 sdp->mergealloc = MERGEALLOC;
114
115 if(linelength != 0)
116 sdp->linelength = linelength;
117 else
118 sdp->linelength = 30;
119
120 if(prefix != NULL) {
121 sdp->prefix = StringSave(prefix);
122 } else {
123 #ifndef OS_UNIX
124 {{
125 CharPtr tmp_dir = getenv ("TMP");
126
127 if (!tmp_dir)
128 sdp->prefix = StringSave(".");
129 else
130 sdp->prefix = StringSave(tmp_dir);
131 }}
132 #else /* not OS_UNIX */
133 sdp->prefix = StringSave(PREFIX);
134 #endif
135 }
136 sdp->tab = tab;
137 sdp->reverse = reverse;
138 sdp->unique = unique;
139
140 return (SORTObjectPtr) sdp;
141 }
142
SORTObjectFree(SORTObjectPtr object)143 void SORTObjectFree(SORTObjectPtr object)
144 {
145 SORTDataPtr sdp;
146 SORTempNodePtr tnp_next, tnp_tmp;
147 SORTKeyFieldPtr kfp_next, kfp_tmp;
148
149 if((sdp = (SORTDataPtr) object) == NULL)
150 return;
151
152 for(tnp_tmp = sdp->temphead; tnp_tmp != NULL; tnp_tmp = tnp_next) {
153 tnp_next = tnp_tmp->next;
154 MemFree(tnp_tmp->name);
155 MemFree(tnp_tmp);
156 }
157
158 for(kfp_tmp = sdp->keyhead; kfp_tmp != NULL; kfp_tmp = kfp_next) {
159 kfp_next = kfp_tmp->next;
160 MemFree(kfp_tmp->ignore);
161 MemFree(kfp_tmp->translate);
162 MemFree(kfp_tmp);
163 }
164
165 MemFree(sdp->prefix);
166 MemFree(sdp);
167 }
168
SORTGetKeyHead(SORTObjectPtr sop)169 SORTKeyFieldPtr SORTGetKeyHead(SORTObjectPtr sop)
170 {
171 SORTDataPtr sdp;
172
173 if((sdp = (SORTDataPtr) sop) == NULL)
174 return NULL;
175
176 return sdp->keyhead;
177 }
SORTSetReverse(Boolean reverse,SORTObjectPtr sop)178 SORTErrorCode SORTSetReverse(Boolean reverse, SORTObjectPtr sop)
179 {
180 SORTDataPtr sdp;
181
182 if((sdp = (SORTDataPtr) sop) == NULL)
183 return SORTBadParameter;
184 sdp->reverse = reverse;
185 return SORTNoError;
186 }
SORTSetUnique(Boolean unique,SORTObjectPtr sop)187 SORTErrorCode SORTSetUnique(Boolean unique, SORTObjectPtr sop)
188 {
189 SORTDataPtr sdp;
190
191 if((sdp = (SORTDataPtr) sop) == NULL)
192 return SORTBadParameter;
193
194 sdp->unique = unique;
195 return SORTNoError;
196 }
SORTSetLineLength(Int4 linelength,SORTObjectPtr sop)197 SORTErrorCode SORTSetLineLength(Int4 linelength, SORTObjectPtr sop)
198 {
199 SORTDataPtr sdp;
200
201 if((sdp = (SORTDataPtr) sop) == NULL)
202 return SORTBadParameter;
203
204 if(linelength != 0)
205 sdp->linelength = linelength;
206 else
207 sdp->linelength = 30;
208
209 return SORTNoError;
210 }
211
SORTSetTab(Uchar tab,SORTObjectPtr sop)212 SORTErrorCode SORTSetTab(Uchar tab, SORTObjectPtr sop)
213 {
214 SORTDataPtr sdp;
215
216 if((sdp = (SORTDataPtr) sop) == NULL)
217 return SORTBadParameter;
218
219 sdp->tab = tab;
220
221 return SORTNoError;
222 }
SORTSetPrefix(CharPtr prefix,SORTObjectPtr sop)223 SORTErrorCode SORTSetPrefix(CharPtr prefix, SORTObjectPtr sop)
224 {
225 SORTDataPtr sdp;
226
227 if((sdp = (SORTDataPtr) sop) == NULL)
228 return SORTBadParameter;
229
230 if(prefix != NULL) {
231 if(sdp->prefix != NULL)
232 MemFree(sdp->prefix);
233 sdp->prefix = StringSave(prefix);
234 } else {
235 sdp->prefix = NULL;
236 }
237
238 return SORTNoError;
239 }
240
241 /* Sort any number of FILES onto the given OFP. */
SORTFiles(CharPtr PNTR files,Int4 nfiles,FILE * ofp,SORTObjectPtr sop,Int4 * line_count)242 SORTErrorCode SORTFiles(CharPtr PNTR files, Int4 nfiles, FILE *ofp,
243 SORTObjectPtr sop, Int4* line_count)
244 {
245 SORTBuffer buf;
246 SORTLines lines;
247 SORTLinePtr tmp;
248 Int4 i, ntmp;
249 FILE *fp, *tfp;
250 SORTempNodePtr node;
251 SORTDataPtr sdp;
252 Int4 ntemp = 0;
253 CharPtr PNTR tempfiles;
254 CharPtr temp;
255 SORTErrorCode error_code = SORTNoError;
256
257 *line_count = 0;
258
259 if((sdp = (SORTDataPtr) sop) == NULL) {
260 error_code = SORTBadParameter;
261 goto return_from_function;
262 }
263
264 SORTInitBuf(&buf, sdp->sortalloc);
265 SORTInitLines(&lines, sdp->sortalloc/sdp->linelength + 1);
266 ntmp = lines.alloc;
267
268 if((tmp = (SORTLinePtr) MemNew(ntmp * sizeof (SORTLine))) == NULL) {
269 error_code = SORTNoMemory;
270 goto return_from_function;
271 }
272
273 while (nfiles--) {
274 if((fp = FileOpen(*files++, "r")) == NULL) {
275 error_code = SORTBadFileName;
276 goto return_from_function;
277 }
278
279 while (SORTFillBuf(&buf, fp)) {
280 SORTFindLines(&buf, &lines, sdp);
281 *line_count += lines.used;
282 if (lines.used > ntmp) {
283 while (lines.used > ntmp)
284 ntmp *= 2;
285
286 if((tmp = (SORTLinePtr) Realloc(tmp, ntmp*sizeof(SORTLine))) == NULL) {
287 error_code = SORTNoMemory;
288 goto return_from_function;
289 }
290 }
291 SORTArrayLines(lines.lines, lines.used, tmp, sdp);
292 if (feof(fp) && !nfiles && !ntemp) {
293 tfp = ofp;
294 } else {
295 ++ntemp;
296
297 if((temp =SORTAddTempName(sop)) == NULL) {
298 error_code = SORTMiscError;
299 goto return_from_function;
300 }
301 if((tfp = FileOpen(temp, "w")) == NULL) {
302 error_code = SORTBadFileName;
303 goto return_from_function;
304 }
305 }
306 for (i = 0; i < lines.used; ++i)
307 if (!sdp->unique || i == lines.used - 1
308 || SORTCompare(&lines.lines[i], &lines.lines[i + 1], sdp)) {
309
310 if((FileWrite(lines.lines[i].text, 1, lines.lines[i].length, tfp)) !=
311 (Uint4) lines.lines[i].length) {
312 error_code = SORTWriteError;
313 goto return_from_function;
314 }
315 putc('\n', tfp);
316 }
317 if (tfp != ofp)
318 FileClose(tfp);
319 }
320 FileClose(fp);
321 }
322
323
324 MemFree(buf.buf);
325 MemFree((CharPtr) lines.lines);
326 MemFree((CharPtr) tmp);
327
328 if (ntemp) {
329 if((tempfiles = (CharPtr PNTR) MemNew(ntemp * sizeof (CharPtr))) == NULL) {
330 error_code = SORTNoMemory;
331 goto return_from_function;
332 }
333
334 i = ntemp;
335 for (node = sdp->temphead->next; node; node = node->next)
336 tempfiles[--i] = node->name;
337 SORTMergeFiles(tempfiles, ntemp, ofp, sop);
338 MemFree((CharPtr) tempfiles);
339 }
340
341 return_from_function:
342 if (sdp != NULL) {
343 SORTCleanup(sdp->temphead);
344 }
345 return error_code;
346 }
347
348 /* Merge any number of FILES onto the given OFP. */
SORTMergeFiles(CharPtr files[],Int4 nfiles,FILE * ofp,SORTObjectPtr sop)349 SORTErrorCode SORTMergeFiles(CharPtr files[], Int4 nfiles, FILE *ofp,
350 SORTObjectPtr sop)
351 {
352 Int4 i, j, t;
353 CharPtr temp;
354 FILE *fps[NMERGE], *tfp;
355 SORTDataPtr sdp;
356 SORTErrorCode error_code = SORTNoError;
357
358 if((sdp = (SORTDataPtr) sop) == NULL) {
359 error_code = SORTBadParameter;
360 goto return_from_function;
361 }
362
363 while (nfiles > NMERGE) {
364 t = 0;
365 for (i = 0; i < nfiles / NMERGE; ++i) {
366 for (j = 0; j < NMERGE; ++j) {
367 if((fps[j] = FileOpen(files[i * NMERGE + j], "r")) == NULL) {
368 error_code = SORTBadFileName;
369 goto return_from_function;
370 }
371 }
372 if((temp = SORTAddTempName(sop)) == NULL) {
373 error_code = SORTMiscError;
374 goto return_from_function;
375 }
376 if((tfp = FileOpen(temp, "w")) == NULL) {
377 error_code = SORTBadFileName;
378 goto return_from_function;
379 }
380
381 SORTMergeFPS(fps, NMERGE, tfp, sdp);
382
383 FileClose(tfp);
384
385 for (j = 0; j < NMERGE; ++j)
386 SORTDelTempName(files[i * NMERGE + j], sdp->temphead);
387
388 files[t++] = temp;
389 }
390 for (j = 0; j < nfiles % NMERGE; ++j)
391 if((fps[j] = FileOpen(files[i * NMERGE + j], "r")) == NULL) {
392 error_code = SORTBadFileName;
393 goto return_from_function;
394 }
395 if((temp = SORTAddTempName(sop)) == NULL) {
396 error_code = SORTMiscError;
397 goto return_from_function;
398 }
399
400 if((tfp = FileOpen(temp, "w")) == NULL) {
401 error_code = SORTBadFileName;
402 goto return_from_function;
403 }
404
405 SORTMergeFPS(fps, nfiles % NMERGE, tfp, sdp);
406 FileClose(tfp);
407
408 for (j = 0; j < nfiles % NMERGE; ++j)
409 SORTDelTempName(files[i * NMERGE + j], sdp->temphead);
410 files[t++] = temp;
411 nfiles = t;
412 }
413
414 for (i = 0; i < nfiles; ++i) {
415 if((fps[i] = FileOpen(files[i], "r")) == NULL) {
416 error_code = SORTBadFileName;
417 goto return_from_function;
418 }
419 }
420
421 SORTMergeFPS(fps, i, ofp, sdp);
422
423 for (i = 0; i < nfiles; ++i)
424 SORTDelTempName(files[i], sdp->temphead);
425
426 return_from_function:
427 if (sdp != NULL) {
428 SORTCleanup(sdp->temphead);
429 }
430 return error_code;
431 }
432
433 /* Clean up any remaining temporary files. */
434
SORTCleanup(SORTempNodePtr temphead)435 static void SORTCleanup(SORTempNodePtr temphead)
436 {
437 SORTempNodePtr node;
438
439 for (node = temphead->next; node; node = node->next)
440 FileRemove(node->name);
441
442 temphead->next = NULL;
443 }
444
445 /* Return a name for a temporary file. */
SORTAddTempName(SORTObjectPtr sop)446 CharPtr SORTAddTempName(SORTObjectPtr sop)
447 {
448 Char path [PATH_MAX];
449 CharPtr name;
450 SORTempNodePtr node;
451 SORTDataPtr sdp;
452
453 if((sdp = (SORTDataPtr) sop) == NULL)
454 return NULL;
455
456
457 TmpNam (path);
458 name = StringSave (path);
459
460 if((node = (SORTempNodePtr)MemNew(sizeof(SORTempNode))) == NULL) {
461 SORTCleanup(sdp->temphead);
462 return NULL;
463 }
464
465 node->name = name;
466 node->next = sdp->temphead->next;
467 sdp->temphead->next = node;
468
469 return name;
470 }
471
472 /* Search through the list of temporary files for the given name;
473 remove it if it is found on the list. */
SORTDelTempName(CharPtr name,SORTempNodePtr temphead)474 static void SORTDelTempName(CharPtr name, SORTempNodePtr temphead)
475 {
476 SORTempNodePtr node, temp;
477
478 for (node = temphead; node->next; node = node->next) {
479 if (!StringCmp(name, node->next->name))
480 break;
481 }
482
483 if (node->next) {
484 temp = node->next;
485 FileRemove(temp->name);
486 MemFree(temp->name);
487 node->next = temp->next;
488 MemFree((CharPtr) temp);
489 }
490 }
491
492 /* Initialize BUF allocating ALLOC bytes initially. */
493
SORTInitBuf(SORTBufferPtr buf,Int4 alloc)494 static SORTErrorCode SORTInitBuf (SORTBufferPtr buf, Int4 alloc)
495 {
496 buf->alloc = alloc;
497 if((buf->buf = (UcharPtr)MemNew(buf->alloc)) == NULL)
498 return SORTNoMemory;
499 buf->used = buf->left = 0;
500 return SORTNoError;
501 }
502
503 /* Fill BUF reading from FP, moving buf->left bytes from the end
504 of buf->buf to the beginning first. If EOF is reached and the
505 file wasn't terminated by a newline, supply one. Return a count
506 of bytes actually read. */
507
SORTFillBuf(SORTBufferPtr buf,FILE * fp)508 static Int4 SORTFillBuf(SORTBufferPtr buf, FILE *fp)
509 {
510 Int4 cc, total = 0;
511
512 MemMove(buf->buf, buf->buf + buf->used - buf->left, buf->left);
513 buf->used = buf->left;
514
515 while (!feof(fp) && !Nlm_MemChr(buf->buf, '\n', buf->used)) {
516 if (buf->used == buf->alloc) {
517 if((buf->buf = (UcharPtr)
518 Realloc(buf->buf, buf->alloc *= 2)) == NULL) {
519 return (Int4) SORTNoMemory;
520 }
521 }
522
523 if((cc = FileRead(buf->buf + buf->used,
524 1, buf->alloc - buf->used, fp)) == 0 && ferror(fp)) {
525 ErrLogPrintf("sort: read error (%s)\n", strerror(errno));
526 return (Int4) SORTReadError;
527 }
528
529 #ifdef WIN_MAC
530 {
531 Int4 ct;
532 UcharPtr ptr;
533
534 ptr = buf->buf + buf->used;
535 ct = cc;
536 while (ct > 0) {
537 if (*ptr == '\r') {
538 *ptr = '\n';
539 }
540 ptr++;
541 ct--;
542 }
543 }
544 #endif
545
546 buf->used += cc;
547 total += cc;
548 }
549
550 if (feof(fp) && buf->used && buf->buf[buf->used - 1] != '\n') {
551 if (buf->used == buf->alloc) {
552 if((buf->buf = (UcharPtr) Realloc(buf->buf, buf->alloc *= 2)) == NULL) {
553 return (Int4) SORTNoMemory;
554 }
555 }
556 buf->buf[buf->used++] = '\n';
557 ++total;
558 }
559
560 return total;
561 }
562
563 /* Initialize LINES, allocating space for ALLOC lines initially. */
SORTInitLines(SORTLinesPtr lines,Int4 alloc)564 static SORTErrorCode SORTInitLines(SORTLinesPtr lines, Int4 alloc)
565 {
566 lines->alloc = alloc;
567 if((lines->lines =
568 (SORTLinePtr)MemNew(lines->alloc * sizeof(SORTLine))) == NULL)
569 return SORTNoMemory;
570 lines->used = 0;
571 return SORTNoError;
572 }
573
574 /* Return a pointer to the first character of a field. */
SORTBegField(SORTLinePtr line,SORTKeyFieldPtr key,SORTDataPtr sdp)575 static UcharPtr SORTBegField(SORTLinePtr line, SORTKeyFieldPtr key,
576 SORTDataPtr sdp)
577 {
578 register UcharPtr ptr = line->text, lim = ptr + line->length;
579 register Int4 sword = key->sword, schar = key->schar;
580
581 if (sdp->tab)
582 while (ptr < lim && sword--) {
583 while (ptr < lim && *ptr != sdp->tab)
584 ++ptr;
585 if (ptr < lim)
586 ++ptr;
587 } else
588 while (ptr < lim && sword--) {
589 while (ptr < lim && blanks[UCHAR(*ptr)])
590 ++ptr;
591 while (ptr < lim && !blanks[UCHAR(*ptr)])
592 ++ptr;
593 }
594
595 if (key->skipsblanks)
596 while (ptr < lim && blanks[UCHAR(*ptr)])
597 ++ptr;
598
599 while (ptr < lim && schar--)
600 ++ptr;
601
602 return ptr;
603 }
604
605 /* Find the limit of a field; i.e., a pointer to the first character
606 after the field. */
SORTLimField(SORTLinePtr line,SORTKeyFieldPtr key,SORTDataPtr sdp)607 static UcharPtr SORTLimField(SORTLinePtr line, SORTKeyFieldPtr key,
608 SORTDataPtr sdp)
609 {
610 register UcharPtr ptr = line->text, lim = ptr + line->length;
611 register Int4 eword = key->eword, echar = key->echar;
612
613 if (sdp->tab)
614 while (ptr < lim && eword--) {
615 while (ptr < lim && *ptr != sdp->tab)
616 ++ptr;
617 if (ptr < lim && (eword || key->skipeblanks))
618 ++ptr;
619 } else
620 while (ptr < lim && eword--) {
621 while (ptr < lim && blanks[UCHAR(*ptr)])
622 ++ptr;
623 while (ptr < lim && !blanks[UCHAR(*ptr)])
624 ++ptr;
625 }
626
627 if (key->skipeblanks)
628 while (ptr < lim && blanks[UCHAR(*ptr)])
629 ++ptr;
630
631 while (ptr < lim && echar--)
632 ++ptr;
633
634 return ptr;
635 }
636
637 /* Find the lines in BUF, storing pointers and lengths in LINES.
638 Also replace newlines with NULs. */
SORTFindLines(SORTBufferPtr buf,SORTLinesPtr lines,SORTDataPtr sdp)639 static SORTErrorCode SORTFindLines(SORTBufferPtr buf, SORTLinesPtr lines,
640 SORTDataPtr sdp)
641 {
642 register UcharPtr beg, lim, ptr;
643 SORTKeyFieldPtr key;
644
645 beg = buf->buf;
646 lim = buf->buf + buf->used;
647 key = sdp->keyhead->next;
648
649 lines->used = 0;
650
651 while (beg < lim &&
652 ((ptr = (UcharPtr)Nlm_MemChr(beg, '\n', lim - beg)) != NULL)) {
653 /* There are various places in the code that rely on a NUL
654 being at the end of in-core lines; NULs inside the lines
655 will not cause trouble, though. */
656 *ptr = '\0';
657
658 if (lines->used == lines->alloc)
659 if((lines->lines = (SORTLinePtr) Realloc((CharPtr)lines->lines,
660 (lines->alloc *= 2) *
661 sizeof(SORTLine))) == NULL)
662 return SORTNoMemory;
663
664 lines->lines[lines->used].text = beg;
665 lines->lines[lines->used].length = ptr - beg;
666
667 /* Precompute the position of the first key for efficiency. */
668 if (key) {
669 if (key->eword >= 0)
670 lines->lines[lines->used].keylim =
671 SORTLimField(&lines->lines[lines->used], key, sdp);
672 else
673 lines->lines[lines->used].keylim = ptr;
674
675 if (key->sword >= 0)
676 lines->lines[lines->used].keybeg =
677 SORTBegField(&lines->lines[lines->used], key, sdp);
678 else {
679 if (key->skipsblanks)
680 while (blanks[UCHAR(*beg)])
681 ++beg;
682 lines->lines[lines->used].keybeg = beg;
683 }
684 }
685
686 ++lines->used;
687 beg = ptr + 1;
688 }
689
690 buf->left = lim - beg;
691 return SORTNoError;
692 }
693
694 /* Compare two strings containing decimal fractions < 1. Each string
695 should begin with a decimal point followed immediately by the digits
696 of the fraction. Strings not of this form are considered to be zero. */
SORTFracCompare(register UcharPtr a,register UcharPtr b)697 static Int4 SORTFracCompare(register UcharPtr a, register UcharPtr b)
698 {
699 register tmpa = UCHAR(*a), tmpb = UCHAR(*b);
700
701 if (tmpa == '.' && tmpb == '.') {
702 do
703 tmpa = UCHAR(*++a), tmpb = UCHAR(*++b);
704 while (tmpa == tmpb && digits[tmpa]);
705 if (digits[tmpa] && digits[tmpb])
706 return tmpa - tmpb;
707 if (digits[tmpa]) {
708 while (tmpa == '0')
709 tmpa = UCHAR(*++a);
710 if (digits[tmpa])
711 return 1;
712 return 0;
713 }
714 if (digits[tmpb]) {
715 while (tmpb == '0')
716 tmpb = UCHAR(*++b);
717 if (digits[tmpb])
718 return -1;
719 return 0;
720 }
721 return 0;
722 } else if (tmpa == '.') {
723 do
724 tmpa = UCHAR(*++a);
725 while (tmpa == '0');
726 if (digits[tmpa])
727 return 1;
728 return 0;
729 } else if (tmpb == '.') {
730 do
731 tmpb = UCHAR(*++b);
732 while (tmpb == '0');
733 if (digits[tmpb])
734 return -1;
735 return 0;
736 }
737 return 0;
738 }
739
740 /* Compare two strings as numbers without explicitly converting them to
741 machine numbers. Comparatively slow for short strings, but asymptotically
742 hideously fast. */
SORTNumCompare(register UcharPtr a,register UcharPtr b)743 static Int4 SORTNumCompare(register UcharPtr a, register UcharPtr b)
744 {
745 register Int4 tmpa, tmpb, loga, logb, tmp;
746
747 tmpa = UCHAR(*a), tmpb = UCHAR(*b);
748
749 if (tmpa == '-') {
750 tmpa = UCHAR(*++a);
751 if (tmpb != '-') {
752 if (digits[tmpa] && digits[tmpb])
753 return -1;
754 return 0;
755 }
756 tmpb = UCHAR(*++b);
757
758 while (tmpa == '0')
759 tmpa = UCHAR(*++a);
760 while (tmpb == '0')
761 tmpb = UCHAR(*++b);
762
763 while (tmpa == tmpb && digits[tmpa])
764 tmpa = UCHAR(*++a), tmpb = UCHAR(*++b);
765
766 if ((tmpa == '.' && !digits[tmpb]) || (tmpb == '.' && !digits[tmpa]))
767 return -SORTFracCompare(a, b);
768
769 if (digits[tmpa])
770 for (loga = 1; digits[UCHAR(*++a)]; ++loga)
771 continue;
772 else
773 loga = 0;
774
775 if (digits[tmpb])
776 for (logb = 1; digits[UCHAR(*++b)]; ++logb)
777 continue;
778 else
779 logb = 0;
780
781 if ((tmp = logb - loga) != 0)
782 return tmp;
783
784 if (! loga)
785 return 0;
786
787 return tmpb - tmpa;
788 } else if (tmpb == '-') {
789 if (digits[UCHAR(tmpa)] && digits[UCHAR(*++b)])
790 return 1;
791 return 0;
792 } else {
793 while (tmpa == '0')
794 tmpa = UCHAR(*++a);
795 while (tmpb == '0')
796 tmpb = UCHAR(*++b);
797
798 while (tmpa == tmpb && digits[tmpa])
799 tmpa = UCHAR(*++a), tmpb = UCHAR(*++b);
800
801 if ((tmpa == '.' && !digits[tmpb]) || (tmpb == '.' && !digits[tmpa]))
802 return SORTFracCompare(a, b);
803
804 if (digits[tmpa])
805 for (loga = 1; digits[UCHAR(*++a)]; ++loga)
806 continue;
807 else
808 loga = 0;
809
810 if (digits[tmpb])
811 for (logb = 1; digits[UCHAR(*++b)]; ++logb)
812 continue;
813 else
814 logb = 0;
815
816 if ((tmp = loga - logb) != 0)
817 return tmp;
818
819 if (! loga)
820 return 0;
821
822 return tmpa - tmpb;
823 }
824 }
825
826 /* Return an integer <= 12 associated with a month name (0 if the name
827 is not recognized. */
SORTGetMonth(UcharPtr s,Int4 len)828 static Int4 SORTGetMonth(UcharPtr s, Int4 len)
829 {
830 Char month[4];
831 register Int4 i, lo = 0, hi = 12;
832
833 if (len < 3)
834 return 0;
835
836 for (i = 0; i < 3; ++i)
837 month[i] = fold_tolower[UCHAR(s[i])];
838 month[3] = '\0';
839
840 while (hi - lo > 1)
841 if (StringCmp(month, monthtab[(lo + hi) / 2].name) < 0)
842 hi = (lo + hi) / 2;
843 else
844 lo = (lo + hi) / 2;
845 if (!StringCmp(month, monthtab[lo].name))
846 return monthtab[lo].val;
847 return 0;
848 }
849
850 /* Compare two lines trying every key in sequence until there
851 are no more keys or a difference is found. */
SORTKeyCompare(SORTLinePtr a,SORTLinePtr b,SORTDataPtr sdp)852 static Int4 SORTKeyCompare(SORTLinePtr a, SORTLinePtr b, SORTDataPtr sdp)
853 {
854 register UcharPtr texta, textb, lima, limb, translate;
855 register Int4Ptr ignore;
856 SORTKeyFieldPtr key;
857 Int4 diff = 0, iter = 0, lena, lenb;
858
859 for (key = sdp->keyhead->next; key; key = key->next, ++iter) {
860 ignore = key->ignore;
861 translate = key->translate;
862
863 /* Find the beginning and limit of each field. */
864 if (iter) {
865 if (key->eword >= 0)
866 lima = SORTLimField(a, key, sdp), limb = SORTLimField(b, key, sdp);
867 else
868 lima = a->text + a->length, limb = b->text + b->length;
869
870 if (key->sword >= 0)
871 texta = SORTBegField(a, key, sdp), textb = SORTBegField(b, key, sdp);
872 else {
873 texta = a->text, textb = b->text;
874 if (key->skipsblanks) {
875 while (texta < lima && blanks[UCHAR(*texta)])
876 ++texta;
877 while (textb < limb && blanks[UCHAR(*textb)])
878 ++textb;
879 }
880 }
881 } else {
882 /* For the first iteration only, the key positions have
883 been precomputed for us. */
884 texta = a->keybeg, lima = a->keylim;
885 textb = b->keybeg, limb = b->keylim;
886 }
887
888 /* Find the lengths. */
889 lena = lima - texta, lenb = limb - textb;
890 if (lena < 0)
891 lena = 0;
892 if (lenb < 0)
893 lenb = 0;
894
895 /* Actually compare the fields. */
896 if (key->numeric) {
897 if (*lima || *limb) {
898 Uchar savea = *lima, saveb = *limb;
899
900 *lima = *limb = '\0';
901 diff = SORTNumCompare(texta, textb);
902 *lima = savea, *limb = saveb;
903 } else
904 diff = SORTNumCompare(texta, textb);
905
906 if (diff)
907 return key->reverse ? -diff : diff;
908 continue;
909 } else if (key->month) {
910 diff = SORTGetMonth(texta, lena) - SORTGetMonth(textb, lenb);
911 if (diff)
912 return key->reverse ? -diff : diff;
913 continue;
914 } else if (ignore && translate)
915 while (texta < lima && textb < limb) {
916 while (texta < lima && ignore[UCHAR(*texta)])
917 ++texta;
918 while (textb < limb && ignore[UCHAR(*textb)])
919 ++textb;
920 if (texta < lima && textb < limb &&
921 translate[UCHAR(*texta++)] != translate[UCHAR(*textb++)]) {
922 diff = translate[UCHAR(*--texta)] - translate[UCHAR(*--textb)];
923 break;
924 }
925 } else if (ignore)
926 while (texta < lima && textb < limb) {
927 while (texta < lima && ignore[UCHAR(*texta)])
928 ++texta;
929 while (textb < limb && ignore[UCHAR(*textb)])
930 ++textb;
931 if (texta < lima && textb < limb && *texta++ != *textb++) {
932 diff = *--texta - *--textb;
933 break;
934 }
935 } else if (translate)
936 while (texta < lima && textb < limb) {
937 if (translate[UCHAR(*texta++)] != translate[UCHAR(*textb++)]) {
938 diff = translate[UCHAR(*--texta)] - translate[UCHAR(*--textb)];
939 break;
940 }
941 } else
942 diff = MemCmp(texta, textb, MIN(lena, lenb));
943
944 if (diff)
945 return key->reverse ? -diff : diff;
946 if ((diff = lena - lenb) != 0)
947 return key->reverse ? -diff : diff;
948 }
949 return 0;
950 }
951
952 /* Compare two lines, returning negative, zero, or positive depending on
953 whether A compares less than, equal to, or greater than B. */
SORTCompare(register SORTLinePtr a,register SORTLinePtr b,SORTDataPtr sdp)954 static Int4 SORTCompare(register SORTLinePtr a, register SORTLinePtr b,
955 SORTDataPtr sdp)
956 {
957 Int4 diff, tmpa, tmpb, min;
958
959 if (sdp->keyhead->next) {
960 if ((diff = SORTKeyCompare(a, b, sdp)) != 0)
961 return diff;
962 if (!sdp->unique) {
963 tmpa = a->length, tmpb = b->length;
964 diff = MemCmp(a->text, b->text, MIN(tmpa, tmpb));
965 if (! diff)
966 diff = tmpa - tmpb;
967 }
968 } else {
969 tmpa = a->length, tmpb = b->length;
970 min = MIN(tmpa, tmpb);
971 if (min == 0)
972 diff = tmpa - tmpb;
973 else {
974 UcharPtr ap = a->text;
975 UcharPtr bp = b->text;
976
977 diff = *ap - *bp;
978 if (diff == 0) {
979 diff = MemCmp(ap, bp, min);
980 if (diff == 0)
981 diff = tmpa - tmpb;
982 }
983 }
984 }
985 return sdp->reverse ? -diff : diff;
986 }
987
988 /* Check that the lines read from the given FP come in order. Return
989 1 if they do and 0 if there is a disorder. */
990
SORTCheckOrder(FILE * fp,SORTDataPtr sdp)991 static Int4 SORTCheckOrder(FILE *fp, SORTDataPtr sdp)
992 {
993 SORTBuffer buf; /* Input buffer. */
994 SORTLines lines; /* Lines scanned from the buffer. */
995 SORTLine temp; /* Copy of previous line. */
996 Int4 cc; /* Character count. */
997 Int4 cmp; /* Result of calling compare. */
998 Int4 alloc, i, success = 1;
999
1000 SORTInitBuf(&buf, sdp->mergealloc);
1001 SORTInitLines(&lines, sdp->mergealloc/sdp->linelength + 1);
1002 alloc = sdp->linelength;
1003 if((temp.text = (UcharPtr)MemNew(alloc)) == NULL)
1004 return -1;
1005
1006 cc = SORTFillBuf(&buf, fp);
1007 SORTFindLines(&buf, &lines, sdp);
1008
1009 if (cc)
1010 do {
1011 /* Compare each line in the buffer with its successor. */
1012 for (i = 0; i < lines.used - 1; ++i) {
1013 cmp = SORTCompare(&lines.lines[i], &lines.lines[i + 1], sdp);
1014 if ((sdp->unique && cmp >= 0) || cmp > 0) {
1015 success = 0;
1016 goto finish;
1017 }
1018 }
1019
1020 /* Save the last line of the buffer and refill the buffer. */
1021 if (lines.lines[lines.used - 1].length > alloc) {
1022 while (lines.lines[lines.used - 1].length + 1 > alloc)
1023 alloc *= 2;
1024 if((temp.text = (UcharPtr) Realloc(temp.text, alloc)) == NULL)
1025 return -1;
1026 }
1027 MemCpy((CharPtr)temp.text, (CharPtr)lines.lines[lines.used - 1].text,
1028 lines.lines[lines.used - 1].length + 1);
1029 temp.length = lines.lines[lines.used - 1].length;
1030
1031 cc = SORTFillBuf(&buf, fp);
1032 if (cc) {
1033 SORTFindLines(&buf, &lines, sdp);
1034 /* Make sure the line saved from the old buffer contents is
1035 less than or equal to the first line of the new buffer. */
1036 cmp = SORTCompare(&temp, &lines.lines[0], sdp);
1037 if ((sdp->unique && cmp >= 0) || cmp > 0) {
1038 success = 0;
1039 break;
1040 }
1041 }
1042 }
1043 while (cc);
1044
1045 finish:
1046 FileClose(fp);
1047 MemFree(buf.buf);
1048 MemFree((CharPtr) lines.lines);
1049 MemFree(temp.text);
1050 return success;
1051 }
1052
1053 /* Merge lines from FPS onto OFP. NFPS cannot be greater than NMERGE.
1054 Close FPS before returning. */
1055
SORTMergeFPS(FILE * fps[],register Int4 nfps,FILE * ofp,SORTDataPtr sdp)1056 static Int4 SORTMergeFPS(FILE *fps[], register Int4 nfps,
1057 FILE *ofp, SORTDataPtr sdp)
1058 {
1059 SORTBuffer buffer[NMERGE]; /* Input buffers for each file. */
1060 SORTLines lines[NMERGE]; /* Line tables for each buffer. */
1061 SORTLine saved; /* Saved line for unique check. */
1062 Int4 savedflag = 0; /* True if there is a saved line. */
1063 Int4 savealloc = 0; /* Size allocated for the saved line. */
1064 Int4 cur[NMERGE]; /* Current line in each line table. */
1065 Int4 ord[NMERGE]; /* Table representing a permutation of fps,
1066 such that lines[ord[0]].lines[cur[ord[0]]]
1067 is the smallest line and will be next
1068 output. */
1069 register Int4 i, j, t;
1070
1071 /* Allocate space for a saved line if necessary. */
1072 if (sdp->unique) {
1073 if((saved.text = (UcharPtr)MemNew(savealloc = sdp->linelength)) == NULL)
1074 return -1;
1075 }
1076
1077 /* Read initial lines from each input file. */
1078 for (i = 0; i < nfps; ++i) {
1079 SORTInitBuf(&buffer[i], sdp->mergealloc);
1080 /* If a file is empty, eliminate it from future consideration. */
1081 while (i < nfps && !SORTFillBuf(&buffer[i], fps[i])) {
1082 FileClose(fps[i]);
1083 --nfps;
1084 for (j = i; j < nfps; ++j)
1085 fps[j] = fps[j + 1];
1086 }
1087 if (i == nfps)
1088 MemFree(buffer[i].buf);
1089 else {
1090 SORTInitLines(&lines[i], sdp->mergealloc/sdp->linelength + 1);
1091 SORTFindLines(&buffer[i], &lines[i], sdp);
1092 cur[i] = 0;
1093 }
1094 }
1095
1096 /* Set up the ord table according to comparisons among input lines.
1097 Since this only reorders two items if one is strictly greater than
1098 the other, it is stable. */
1099 for (i = 0; i < NMERGE; i++) {
1100 ord [i] = 0;
1101 }
1102 for (i = 0; i < nfps; ++i)
1103 ord[i] = i;
1104 for (i = 1; i < nfps; ++i)
1105 if (SORTCompare(&lines[ord[i - 1]].lines[cur[ord[i - 1]]],
1106 &lines[ord[i]].lines[cur[ord[i]]], sdp) > 0)
1107 t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1108
1109 /* Repeatedly output the smallest line until no input remains. */
1110 while (nfps) {
1111 /* If uniqified output is turned out, output only the last of
1112 an identical series of lines. */
1113 if (sdp->unique) {
1114 if (savedflag &&
1115 SORTCompare(&saved, &lines[ord[0]].lines[cur[ord[0]]], sdp)) {
1116 if(FileWrite(saved.text, 1, saved.length, ofp) != (Uint4) saved.length)
1117 return -1;
1118 putc('\n', ofp);
1119 }
1120 if (savealloc < lines[ord[0]].lines[cur[ord[0]]].length + 1) {
1121 while (savealloc < lines[ord[0]].lines[cur[ord[0]]].length + 1)
1122 savealloc *= 2;
1123 if((saved.text = (UcharPtr) Realloc(saved.text, savealloc)) == NULL)
1124 return -1;
1125 }
1126 saved.length = lines[ord[0]].lines[cur[ord[0]]].length;
1127 MemCpy((CharPtr)saved.text,
1128 (CharPtr)lines[ord[0]].lines[cur[ord[0]]].text,
1129 saved.length + 1);
1130 savedflag = 1;
1131 } else {
1132 if(FileWrite(lines[ord[0]].lines[cur[ord[0]]].text, 1,
1133 lines[ord[0]].lines[cur[ord[0]]].length, ofp) !=
1134 (Uint4) lines[ord[0]].lines[cur[ord[0]]].length)
1135 return -1;
1136 putc('\n', ofp);
1137 }
1138
1139 /* Check if we need to read more lines into core. */
1140 if (++cur[ord[0]] == lines[ord[0]].used) {
1141 /* missing '{' can lead to mistakes in later edits... */
1142 if (SORTFillBuf(&buffer[ord[0]], fps[ord[0]])) {
1143 SORTFindLines(&buffer[ord[0]], &lines[ord[0]], sdp);
1144 cur[ord[0]] = 0;
1145 } else {
1146 /* We reached EOF on fps[ord[0]]. */
1147 for (i = 1; i < nfps; ++i)
1148 if (ord[i] > ord[0])
1149 --ord[i];
1150 --nfps;
1151 FileClose(fps[ord[0]]);
1152 MemFree(buffer[ord[0]].buf);
1153 MemFree((CharPtr) lines[ord[0]].lines);
1154 for (i = ord[0]; i < nfps; ++i) {
1155 fps[i] = fps[i + 1];
1156 buffer[i] = buffer[i + 1];
1157 lines[i] = lines[i + 1];
1158 cur[i] = cur[i + 1];
1159 }
1160 for (i = 0; i < nfps; ++i)
1161 ord[i] = ord[i + 1];
1162 continue;
1163 }
1164 }
1165
1166 /* The new line just read in may be larger than other lines
1167 already in core; push it back in the queue until we encounter
1168 a line larger than it. */
1169 for (i = 1; i < nfps; ++i) {
1170 t = SORTCompare(&lines[ord[0]].lines[cur[ord[0]]],
1171 &lines[ord[i]].lines[cur[ord[i]]], sdp);
1172 if (! t)
1173 t = ord[0] - ord[i];
1174 if (t < 0)
1175 break;
1176 }
1177 t = ord[0];
1178 for (j = 1; j < i; ++j)
1179 ord[j - 1] = ord[j];
1180 ord[i - 1] = t;
1181 }
1182
1183 if (sdp->unique && savedflag) {
1184 if(FileWrite(saved.text, 1, saved.length, ofp) != (Uint4) saved.length)
1185 return -1;
1186 putc('\n', ofp);
1187 MemFree(saved.text);
1188 }
1189 return 0;
1190 }
1191
1192 /* Sort the array LINES using TEMP for temporary space. */
SORTArrayLines(SORTLinePtr lines,Int4 nlines,SORTLinePtr temp,SORTDataPtr sdp)1193 static void SORTArrayLines(SORTLinePtr lines, Int4 nlines,
1194 SORTLinePtr temp, SORTDataPtr sdp)
1195 {
1196 register SORTLinePtr lo, hi, t;
1197 register Int4 nlo, nhi;
1198
1199 if (nlines == 2) {
1200 if (SORTCompare(&lines[0], &lines[1], sdp) > 0)
1201 *temp = lines[0], lines[0] = lines[1], lines[1] = *temp;
1202 return;
1203 }
1204
1205 nlo = nlines / 2;
1206 lo = lines;
1207 nhi = nlines - nlo;
1208 hi = lines + nlo;
1209
1210 if (nlo > 1)
1211 SORTArrayLines(lo, nlo, temp, sdp);
1212
1213 if (nhi > 1)
1214 SORTArrayLines(hi, nhi, temp, sdp);
1215
1216 t = temp;
1217
1218 while (nlo && nhi)
1219 if (SORTCompare(lo, hi, sdp) <= 0)
1220 *t++ = *lo++, --nlo;
1221 else
1222 *t++ = *hi++, --nhi;
1223 while (nlo--)
1224 *t++ = *lo++;
1225
1226 for (lo = lines, nlo = nlines - nhi, t = temp; nlo; --nlo)
1227 *lo++ = *t++;
1228 }
1229
1230 /* Check that each of the given FILES is ordered.
1231 Return a count of disordered files. */
SORTCheckOrderS(CharPtr files[],Int4 nfiles,SORTObjectPtr sop)1232 Int4 SORTCheckOrderS(CharPtr files[], Int4 nfiles, SORTObjectPtr sop)
1233 {
1234 Int4 i, disorders = 0;
1235 FILE *fp;
1236 SORTDataPtr sdp;
1237
1238 if((sdp = (SORTDataPtr) sop) == NULL)
1239 return -1;
1240
1241 for (i = 0; i < nfiles; ++i) {
1242 if((fp = FileOpen(files[i], "r")) == NULL)
1243 return -1;
1244 if (!SORTCheckOrder(fp, sdp)) {
1245 ErrLogPrintf("sort: disorder on %s\n", files[i]);
1246 ++disorders;
1247 }
1248 FileClose(fp);
1249 }
1250 return disorders;
1251 }
1252
1253 /* Insert a key at the end of the list. */
SORTInsertKey(SORTKeyFieldPtr key,SORTKeyFieldPtr keyhead)1254 SORTErrorCode SORTInsertKey(SORTKeyFieldPtr key, SORTKeyFieldPtr keyhead)
1255 {
1256 SORTKeyFieldPtr k = keyhead;
1257
1258 while (k->next) {
1259 k = k->next;
1260 }
1261 k->next = key;
1262 key->next = NULL;
1263
1264 return SORTNoError;
1265 }
1266