1 static char const rcsid[] = "$Id: ncbisort.c,v 6.9 2012/08/22 23:02:39 kans Exp $";
2 
3 /* $Id: ncbisort.c,v 6.9 2012/08/22 23:02:39 kans Exp $
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name:  $RCSfile: ncbisort.c,v $
29 *
30 * Author:  Sergei Shavirin
31 *
32 * Initial Version Creation Date: 03/24/1997
33 *
34 * $Revision: 6.9 $
35 *
36 * File Description:
37 *         Main file for SORTing library
38 *
39 * $Log: ncbisort.c,v $
40 * Revision 6.9  2012/08/22 23:02:39  kans
41 * null dereference checks and initializing variables due to clang static analysis
42 *
43 * Revision 6.8  2006/05/10 20:47:17  camacho
44 * From Ilya Dondoshansky: SORTFiles: added output parameter for total number of lines processed
45 *
46 * Revision 6.7  2003/07/15 20:17:39  coulouri
47 * remove signal() and setrlimit() calls
48 *
49 * Revision 6.6  2003/05/30 17:25:37  coulouri
50 * add rcsid
51 *
52 * Revision 6.5  2003/02/26 17:48:09  kimelman
53 * bugfix: format/argument mismatch in error message
54 *
55 * Revision 6.4  1998/06/01 14:52:37  madden
56 * Change to using TmpNam
57 *
58 * Revision 6.3  1998/04/09 20:40:45  kans
59 * use TmpNam for Mac sorting
60 *
61 * Revision 6.2  1998/04/09 18:54:23  kans
62 * on mac, \r must be converted to \n in buffer
63 *
64 * Revision 6.1  1997/12/02 18:57:38  shavirin
65 * Removed typecast warnings insprintf() and sscanf()
66 *
67 * Revision 6.0  1997/08/25 18:53:34  madden
68 * Revision changed to 6.0
69 *
70 * Revision 1.4  1997/05/05 14:36:57  shavirin
71 * Fixed tmp-prefix handling for non-UNIX platform
72 *
73  * Revision 1.3  1997/05/01  20:16:21  shavirin
74  * Added typecast to remove few warnings
75  *
76  * Revision 1.2  1997/05/01  17:22:31  shavirin
77  * Added function SORTObjectFree()
78  *
79  * Revision 1.1  1997/03/13  21:52:15  shavirin
80  * Initial revision
81  *
82 *
83 * ==========================================================================
84 */
85 
86 #include <ncbisrti.h>
87 
SORTObjectNew(CharPtr prefix,Uchar tab,Int4 linelength,Boolean reverse,Boolean unique)88 SORTObjectPtr SORTObjectNew(CharPtr prefix, Uchar tab, Int4 linelength,
89                             Boolean reverse, Boolean unique)
90 {
91   SORTDataPtr sdp;
92   Int4 i;
93 
94   sdp = (SORTDataPtr) MemNew(sizeof(SORTData));
95   sdp->temphead = (SORTempNodePtr) MemNew(sizeof(SORTempNode));
96   global_temphead = sdp->temphead; /* Used for cleanup by signal */
97   sdp->keyhead = (SORTKeyFieldPtr)MemNew(sizeof(SORTKeyField));
98 
99   if(!tables_initialized) {
100     for (i = 0; i < UCHAR_LIM; ++i) {
101       if (ISBLANK(i))
102         blanks[i] = 1;
103       if (IS_DIGIT(i))
104         digits[i] = 1;
105       if (IS_UPPER(i))
106         fold_tolower[i] = tolower(i);
107       else
108         fold_tolower[i] = (Uint1) i;
109     }
110   }
111 
112   sdp->sortalloc = SORTALLOC;
113   sdp->mergealloc = MERGEALLOC;
114 
115   if(linelength != 0)
116     sdp->linelength = linelength;
117   else
118     sdp->linelength = 30;
119 
120   if(prefix != NULL) {
121     sdp->prefix = StringSave(prefix);
122   } else {
123 #ifndef OS_UNIX
124     {{
125       CharPtr tmp_dir = getenv ("TMP");
126 
127       if (!tmp_dir)
128         sdp->prefix = StringSave(".");
129       else
130         sdp->prefix = StringSave(tmp_dir);
131     }}
132 #else /* not OS_UNIX */
133     sdp->prefix = StringSave(PREFIX);
134 #endif
135   }
136   sdp->tab = tab;
137   sdp->reverse  = reverse;
138   sdp->unique = unique;
139 
140   return (SORTObjectPtr) sdp;
141 }
142 
SORTObjectFree(SORTObjectPtr object)143 void SORTObjectFree(SORTObjectPtr object)
144 {
145     SORTDataPtr sdp;
146     SORTempNodePtr tnp_next, tnp_tmp;
147     SORTKeyFieldPtr kfp_next, kfp_tmp;
148 
149     if((sdp = (SORTDataPtr) object) == NULL)
150         return;
151 
152     for(tnp_tmp = sdp->temphead; tnp_tmp != NULL; tnp_tmp = tnp_next) {
153         tnp_next = tnp_tmp->next;
154         MemFree(tnp_tmp->name);
155         MemFree(tnp_tmp);
156     }
157 
158     for(kfp_tmp = sdp->keyhead; kfp_tmp != NULL; kfp_tmp = kfp_next) {
159         kfp_next = kfp_tmp->next;
160         MemFree(kfp_tmp->ignore);
161         MemFree(kfp_tmp->translate);
162         MemFree(kfp_tmp);
163     }
164 
165     MemFree(sdp->prefix);
166     MemFree(sdp);
167 }
168 
SORTGetKeyHead(SORTObjectPtr sop)169 SORTKeyFieldPtr SORTGetKeyHead(SORTObjectPtr sop)
170 {
171   SORTDataPtr sdp;
172 
173   if((sdp = (SORTDataPtr) sop) == NULL)
174     return NULL;
175 
176   return sdp->keyhead;
177 }
SORTSetReverse(Boolean reverse,SORTObjectPtr sop)178 SORTErrorCode SORTSetReverse(Boolean reverse, SORTObjectPtr sop)
179 {
180   SORTDataPtr sdp;
181 
182   if((sdp = (SORTDataPtr) sop) == NULL)
183     return SORTBadParameter;
184   sdp->reverse  = reverse;
185   return SORTNoError;
186 }
SORTSetUnique(Boolean unique,SORTObjectPtr sop)187 SORTErrorCode SORTSetUnique(Boolean unique, SORTObjectPtr sop)
188 {
189   SORTDataPtr sdp;
190 
191   if((sdp = (SORTDataPtr) sop) == NULL)
192     return SORTBadParameter;
193 
194   sdp->unique = unique;
195   return SORTNoError;
196 }
SORTSetLineLength(Int4 linelength,SORTObjectPtr sop)197 SORTErrorCode SORTSetLineLength(Int4 linelength, SORTObjectPtr sop)
198 {
199   SORTDataPtr sdp;
200 
201   if((sdp = (SORTDataPtr) sop) == NULL)
202     return SORTBadParameter;
203 
204   if(linelength != 0)
205     sdp->linelength = linelength;
206   else
207     sdp->linelength = 30;
208 
209   return SORTNoError;
210 }
211 
SORTSetTab(Uchar tab,SORTObjectPtr sop)212 SORTErrorCode SORTSetTab(Uchar tab, SORTObjectPtr sop)
213 {
214   SORTDataPtr sdp;
215 
216   if((sdp = (SORTDataPtr) sop) == NULL)
217     return SORTBadParameter;
218 
219   sdp->tab = tab;
220 
221   return SORTNoError;
222 }
SORTSetPrefix(CharPtr prefix,SORTObjectPtr sop)223 SORTErrorCode SORTSetPrefix(CharPtr prefix, SORTObjectPtr sop)
224 {
225   SORTDataPtr sdp;
226 
227   if((sdp = (SORTDataPtr) sop) == NULL)
228     return SORTBadParameter;
229 
230   if(prefix != NULL) {
231     if(sdp->prefix != NULL)
232       MemFree(sdp->prefix);
233     sdp->prefix = StringSave(prefix);
234   } else {
235     sdp->prefix = NULL;
236   }
237 
238   return SORTNoError;
239 }
240 
241 /* Sort any number of FILES onto the given OFP. */
SORTFiles(CharPtr PNTR files,Int4 nfiles,FILE * ofp,SORTObjectPtr sop,Int4 * line_count)242 SORTErrorCode SORTFiles(CharPtr PNTR files, Int4 nfiles, FILE *ofp,
243                SORTObjectPtr sop, Int4* line_count)
244 {
245   SORTBuffer   buf;
246   SORTLines   lines;
247   SORTLinePtr tmp;
248   Int4 i, ntmp;
249   FILE *fp, *tfp;
250   SORTempNodePtr node;
251   SORTDataPtr sdp;
252   Int4 ntemp = 0;
253   CharPtr PNTR tempfiles;
254   CharPtr temp;
255   SORTErrorCode error_code = SORTNoError;
256 
257   *line_count = 0;
258 
259   if((sdp = (SORTDataPtr) sop) == NULL) {
260     error_code = SORTBadParameter;
261     goto return_from_function;
262   }
263 
264   SORTInitBuf(&buf, sdp->sortalloc);
265   SORTInitLines(&lines, sdp->sortalloc/sdp->linelength + 1);
266   ntmp = lines.alloc;
267 
268   if((tmp = (SORTLinePtr) MemNew(ntmp * sizeof (SORTLine))) == NULL) {
269     error_code = SORTNoMemory;
270     goto return_from_function;
271   }
272 
273   while (nfiles--) {
274     if((fp = FileOpen(*files++, "r")) == NULL) {
275       error_code = SORTBadFileName;
276       goto return_from_function;
277     }
278 
279     while (SORTFillBuf(&buf, fp)) {
280       SORTFindLines(&buf, &lines, sdp);
281       *line_count += lines.used;
282       if (lines.used > ntmp) {
283         while (lines.used > ntmp)
284           ntmp *= 2;
285 
286         if((tmp = (SORTLinePtr) Realloc(tmp, ntmp*sizeof(SORTLine))) == NULL) {
287           error_code = SORTNoMemory;
288           goto return_from_function;
289         }
290       }
291       SORTArrayLines(lines.lines, lines.used, tmp, sdp);
292       if (feof(fp) && !nfiles && !ntemp) {
293         tfp = ofp;
294       } else {
295         ++ntemp;
296 
297         if((temp =SORTAddTempName(sop)) == NULL) {
298           error_code = SORTMiscError;
299           goto return_from_function;
300         }
301         if((tfp = FileOpen(temp, "w")) == NULL) {
302           error_code = SORTBadFileName;
303           goto return_from_function;
304         }
305       }
306       for (i = 0; i < lines.used; ++i)
307         if (!sdp->unique || i == lines.used - 1
308             || SORTCompare(&lines.lines[i], &lines.lines[i + 1], sdp)) {
309 
310           if((FileWrite(lines.lines[i].text, 1, lines.lines[i].length, tfp)) !=
311              (Uint4) lines.lines[i].length) {
312             error_code = SORTWriteError;
313             goto return_from_function;
314           }
315           putc('\n', tfp);
316         }
317       if (tfp != ofp)
318         FileClose(tfp);
319     }
320     FileClose(fp);
321   }
322 
323 
324   MemFree(buf.buf);
325   MemFree((CharPtr) lines.lines);
326   MemFree((CharPtr) tmp);
327 
328   if (ntemp) {
329     if((tempfiles = (CharPtr PNTR) MemNew(ntemp * sizeof (CharPtr))) == NULL) {
330       error_code = SORTNoMemory;
331       goto return_from_function;
332     }
333 
334     i = ntemp;
335     for (node = sdp->temphead->next; node; node = node->next)
336       tempfiles[--i] = node->name;
337     SORTMergeFiles(tempfiles, ntemp, ofp, sop);
338     MemFree((CharPtr) tempfiles);
339   }
340 
341  return_from_function:
342   if (sdp != NULL) {
343     SORTCleanup(sdp->temphead);
344   }
345   return error_code;
346 }
347 
348 /* Merge any number of FILES onto the given OFP. */
SORTMergeFiles(CharPtr files[],Int4 nfiles,FILE * ofp,SORTObjectPtr sop)349 SORTErrorCode SORTMergeFiles(CharPtr files[], Int4 nfiles, FILE *ofp,
350                 SORTObjectPtr sop)
351 {
352   Int4 i, j, t;
353   CharPtr temp;
354   FILE *fps[NMERGE], *tfp;
355   SORTDataPtr sdp;
356   SORTErrorCode error_code = SORTNoError;
357 
358   if((sdp = (SORTDataPtr) sop) == NULL) {
359     error_code = SORTBadParameter;
360     goto return_from_function;
361   }
362 
363   while (nfiles > NMERGE) {
364     t = 0;
365     for (i = 0; i < nfiles / NMERGE; ++i) {
366       for (j = 0; j < NMERGE; ++j) {
367         if((fps[j] = FileOpen(files[i * NMERGE + j], "r")) == NULL) {
368           error_code = SORTBadFileName;
369           goto return_from_function;
370         }
371       }
372       if((temp = SORTAddTempName(sop)) == NULL) {
373         error_code = SORTMiscError;
374         goto return_from_function;
375       }
376       if((tfp = FileOpen(temp, "w")) == NULL) {
377         error_code = SORTBadFileName;
378         goto return_from_function;
379       }
380 
381       SORTMergeFPS(fps, NMERGE, tfp, sdp);
382 
383       FileClose(tfp);
384 
385       for (j = 0; j < NMERGE; ++j)
386         SORTDelTempName(files[i * NMERGE + j], sdp->temphead);
387 
388       files[t++] = temp;
389     }
390     for (j = 0; j < nfiles % NMERGE; ++j)
391       if((fps[j] = FileOpen(files[i * NMERGE + j], "r")) == NULL) {
392         error_code = SORTBadFileName;
393         goto return_from_function;
394       }
395     if((temp = SORTAddTempName(sop)) == NULL) {
396       error_code = SORTMiscError;
397       goto return_from_function;
398     }
399 
400     if((tfp = FileOpen(temp, "w")) == NULL) {
401       error_code = SORTBadFileName;
402       goto return_from_function;
403     }
404 
405     SORTMergeFPS(fps, nfiles % NMERGE, tfp, sdp);
406     FileClose(tfp);
407 
408     for (j = 0; j < nfiles % NMERGE; ++j)
409       SORTDelTempName(files[i * NMERGE + j], sdp->temphead);
410     files[t++] = temp;
411     nfiles = t;
412   }
413 
414   for (i = 0; i < nfiles; ++i) {
415     if((fps[i] = FileOpen(files[i], "r")) == NULL) {
416       error_code = SORTBadFileName;
417       goto return_from_function;
418     }
419   }
420 
421   SORTMergeFPS(fps, i, ofp, sdp);
422 
423   for (i = 0; i < nfiles; ++i)
424     SORTDelTempName(files[i], sdp->temphead);
425 
426  return_from_function:
427   if (sdp != NULL) {
428     SORTCleanup(sdp->temphead);
429   }
430   return error_code;
431 }
432 
433 /* Clean up any remaining temporary files. */
434 
SORTCleanup(SORTempNodePtr temphead)435 static void SORTCleanup(SORTempNodePtr temphead)
436 {
437   SORTempNodePtr node;
438 
439   for (node = temphead->next; node; node = node->next)
440     FileRemove(node->name);
441 
442   temphead->next = NULL;
443 }
444 
445 /* Return a name for a temporary file. */
SORTAddTempName(SORTObjectPtr sop)446 CharPtr SORTAddTempName(SORTObjectPtr sop)
447 {
448   Char path [PATH_MAX];
449   CharPtr name;
450   SORTempNodePtr node;
451   SORTDataPtr sdp;
452 
453   if((sdp = (SORTDataPtr) sop) == NULL)
454     return NULL;
455 
456 
457   TmpNam (path);
458   name = StringSave (path);
459 
460   if((node = (SORTempNodePtr)MemNew(sizeof(SORTempNode))) == NULL) {
461     SORTCleanup(sdp->temphead);
462     return NULL;
463   }
464 
465   node->name = name;
466   node->next = sdp->temphead->next;
467   sdp->temphead->next = node;
468 
469   return name;
470 }
471 
472 /* Search through the list of temporary files for the given name;
473    remove it if it is found on the list. */
SORTDelTempName(CharPtr name,SORTempNodePtr temphead)474 static void SORTDelTempName(CharPtr name, SORTempNodePtr temphead)
475 {
476   SORTempNodePtr node, temp;
477 
478   for (node = temphead; node->next; node = node->next) {
479     if (!StringCmp(name, node->next->name))
480       break;
481   }
482 
483   if (node->next) {
484     temp = node->next;
485     FileRemove(temp->name);
486     MemFree(temp->name);
487     node->next = temp->next;
488     MemFree((CharPtr) temp);
489   }
490 }
491 
492 /* Initialize BUF allocating ALLOC bytes initially. */
493 
SORTInitBuf(SORTBufferPtr buf,Int4 alloc)494 static SORTErrorCode SORTInitBuf (SORTBufferPtr buf, Int4 alloc)
495 {
496   buf->alloc = alloc;
497   if((buf->buf = (UcharPtr)MemNew(buf->alloc)) == NULL)
498     return SORTNoMemory;
499   buf->used = buf->left = 0;
500   return SORTNoError;
501 }
502 
503 /* Fill BUF reading from FP, moving buf->left bytes from the end
504    of buf->buf to the beginning first.	If EOF is reached and the
505    file wasn't terminated by a newline, supply one.  Return a count
506    of bytes actually read. */
507 
SORTFillBuf(SORTBufferPtr buf,FILE * fp)508 static Int4 SORTFillBuf(SORTBufferPtr buf, FILE *fp)
509 {
510   Int4 cc, total = 0;
511 
512   MemMove(buf->buf, buf->buf + buf->used - buf->left, buf->left);
513   buf->used = buf->left;
514 
515   while (!feof(fp) && !Nlm_MemChr(buf->buf, '\n', buf->used)) {
516     if (buf->used == buf->alloc) {
517       if((buf->buf = (UcharPtr)
518           Realloc(buf->buf, buf->alloc *= 2)) == NULL) {
519         return (Int4) SORTNoMemory;
520       }
521     }
522 
523     if((cc = FileRead(buf->buf + buf->used,
524                       1, buf->alloc - buf->used, fp)) == 0 && ferror(fp)) {
525       ErrLogPrintf("sort: read error (%s)\n",  strerror(errno));
526       return (Int4) SORTReadError;
527     }
528 
529 #ifdef WIN_MAC
530     {
531       Int4      ct;
532       UcharPtr  ptr;
533 
534       ptr = buf->buf + buf->used;
535       ct = cc;
536       while (ct > 0) {
537         if (*ptr == '\r') {
538           *ptr = '\n';
539         }
540         ptr++;
541         ct--;
542       }
543     }
544 #endif
545 
546     buf->used += cc;
547     total += cc;
548   }
549 
550   if (feof(fp) && buf->used && buf->buf[buf->used - 1] != '\n') {
551     if (buf->used == buf->alloc) {
552       if((buf->buf = (UcharPtr) Realloc(buf->buf, buf->alloc *= 2)) == NULL) {
553         return (Int4) SORTNoMemory;
554       }
555     }
556     buf->buf[buf->used++] = '\n';
557     ++total;
558   }
559 
560   return total;
561 }
562 
563 /* Initialize LINES, allocating space for ALLOC lines initially. */
SORTInitLines(SORTLinesPtr lines,Int4 alloc)564 static SORTErrorCode SORTInitLines(SORTLinesPtr lines, Int4 alloc)
565 {
566   lines->alloc = alloc;
567   if((lines->lines =
568       (SORTLinePtr)MemNew(lines->alloc * sizeof(SORTLine))) == NULL)
569     return SORTNoMemory;
570   lines->used = 0;
571   return SORTNoError;
572 }
573 
574 /* Return a pointer to the first character of a field. */
SORTBegField(SORTLinePtr line,SORTKeyFieldPtr key,SORTDataPtr sdp)575 static UcharPtr SORTBegField(SORTLinePtr line, SORTKeyFieldPtr key,
576                              SORTDataPtr sdp)
577 {
578   register UcharPtr ptr = line->text, lim = ptr + line->length;
579   register Int4 sword = key->sword, schar = key->schar;
580 
581   if (sdp->tab)
582     while (ptr < lim && sword--) {
583       while (ptr < lim && *ptr != sdp->tab)
584         ++ptr;
585       if (ptr < lim)
586         ++ptr;
587     } else
588       while (ptr < lim && sword--) {
589 	while (ptr < lim && blanks[UCHAR(*ptr)])
590 	  ++ptr;
591 	while (ptr < lim && !blanks[UCHAR(*ptr)])
592 	  ++ptr;
593       }
594 
595   if (key->skipsblanks)
596     while (ptr < lim && blanks[UCHAR(*ptr)])
597       ++ptr;
598 
599   while (ptr < lim && schar--)
600     ++ptr;
601 
602   return ptr;
603 }
604 
605 /* Find the limit of a field; i.e., a pointer to the first character
606    after the field. */
SORTLimField(SORTLinePtr line,SORTKeyFieldPtr key,SORTDataPtr sdp)607 static UcharPtr SORTLimField(SORTLinePtr line, SORTKeyFieldPtr key,
608                              SORTDataPtr sdp)
609 {
610   register UcharPtr ptr = line->text, lim = ptr + line->length;
611   register Int4 eword = key->eword, echar = key->echar;
612 
613   if (sdp->tab)
614     while (ptr < lim && eword--) {
615       while (ptr < lim && *ptr != sdp->tab)
616         ++ptr;
617       if (ptr < lim && (eword || key->skipeblanks))
618         ++ptr;
619     } else
620       while (ptr < lim && eword--) {
621 	while (ptr < lim && blanks[UCHAR(*ptr)])
622 	  ++ptr;
623 	while (ptr < lim && !blanks[UCHAR(*ptr)])
624 	  ++ptr;
625       }
626 
627   if (key->skipeblanks)
628     while (ptr < lim && blanks[UCHAR(*ptr)])
629       ++ptr;
630 
631   while (ptr < lim && echar--)
632     ++ptr;
633 
634   return ptr;
635 }
636 
637 /* Find the lines in BUF, storing pointers and lengths in LINES.
638    Also replace newlines with NULs. */
SORTFindLines(SORTBufferPtr buf,SORTLinesPtr lines,SORTDataPtr sdp)639 static SORTErrorCode SORTFindLines(SORTBufferPtr buf, SORTLinesPtr lines,
640                           SORTDataPtr sdp)
641 {
642   register UcharPtr beg, lim, ptr;
643   SORTKeyFieldPtr key;
644 
645   beg = buf->buf;
646   lim = buf->buf + buf->used;
647   key = sdp->keyhead->next;
648 
649   lines->used = 0;
650 
651   while (beg < lim &&
652          ((ptr = (UcharPtr)Nlm_MemChr(beg, '\n', lim - beg)) != NULL)) {
653     /* There are various places in the code that rely on a NUL
654        being at the end of in-core lines; NULs inside the lines
655        will not cause trouble, though. */
656     *ptr = '\0';
657 
658     if (lines->used == lines->alloc)
659       if((lines->lines = (SORTLinePtr) Realloc((CharPtr)lines->lines,
660                                                (lines->alloc *= 2) *
661                                                sizeof(SORTLine))) == NULL)
662         return SORTNoMemory;
663 
664     lines->lines[lines->used].text = beg;
665     lines->lines[lines->used].length = ptr - beg;
666 
667     /* Precompute the position of the first key for efficiency. */
668     if (key) {
669       if (key->eword >= 0)
670         lines->lines[lines->used].keylim =
671           SORTLimField(&lines->lines[lines->used], key, sdp);
672       else
673         lines->lines[lines->used].keylim = ptr;
674 
675       if (key->sword >= 0)
676         lines->lines[lines->used].keybeg =
677           SORTBegField(&lines->lines[lines->used], key, sdp);
678       else {
679         if (key->skipsblanks)
680           while (blanks[UCHAR(*beg)])
681             ++beg;
682         lines->lines[lines->used].keybeg = beg;
683       }
684     }
685 
686     ++lines->used;
687     beg = ptr + 1;
688   }
689 
690   buf->left = lim - beg;
691   return SORTNoError;
692 }
693 
694 /* Compare two strings containing decimal fractions < 1.  Each string
695    should begin with a decimal point followed immediately by the digits
696    of the fraction.  Strings not of this form are considered to be zero. */
SORTFracCompare(register UcharPtr a,register UcharPtr b)697 static Int4 SORTFracCompare(register UcharPtr a, register UcharPtr b)
698 {
699   register tmpa = UCHAR(*a), tmpb = UCHAR(*b);
700 
701   if (tmpa == '.' && tmpb == '.') {
702     do
703       tmpa = UCHAR(*++a), tmpb = UCHAR(*++b);
704     while (tmpa == tmpb && digits[tmpa]);
705     if (digits[tmpa] && digits[tmpb])
706       return tmpa - tmpb;
707     if (digits[tmpa]) {
708       while (tmpa == '0')
709         tmpa = UCHAR(*++a);
710       if (digits[tmpa])
711         return 1;
712       return 0;
713     }
714     if (digits[tmpb]) {
715       while (tmpb == '0')
716         tmpb = UCHAR(*++b);
717       if (digits[tmpb])
718         return -1;
719       return 0;
720     }
721     return 0;
722   } else if (tmpa == '.') {
723     do
724       tmpa = UCHAR(*++a);
725     while (tmpa == '0');
726     if (digits[tmpa])
727       return 1;
728     return 0;
729   } else if (tmpb == '.') {
730     do
731       tmpb = UCHAR(*++b);
732     while (tmpb == '0');
733     if (digits[tmpb])
734       return -1;
735     return 0;
736   }
737   return 0;
738 }
739 
740 /* Compare two strings as numbers without explicitly converting them to
741    machine numbers.  Comparatively slow for short strings, but asymptotically
742    hideously fast. */
SORTNumCompare(register UcharPtr a,register UcharPtr b)743 static Int4 SORTNumCompare(register UcharPtr a, register UcharPtr b)
744 {
745   register Int4 tmpa, tmpb, loga, logb, tmp;
746 
747   tmpa = UCHAR(*a), tmpb = UCHAR(*b);
748 
749   if (tmpa == '-') {
750     tmpa = UCHAR(*++a);
751     if (tmpb != '-') {
752       if (digits[tmpa] && digits[tmpb])
753         return -1;
754       return 0;
755     }
756     tmpb = UCHAR(*++b);
757 
758     while (tmpa == '0')
759       tmpa = UCHAR(*++a);
760     while (tmpb == '0')
761       tmpb = UCHAR(*++b);
762 
763     while (tmpa == tmpb && digits[tmpa])
764       tmpa = UCHAR(*++a), tmpb = UCHAR(*++b);
765 
766     if ((tmpa == '.' && !digits[tmpb]) || (tmpb == '.' && !digits[tmpa]))
767       return -SORTFracCompare(a, b);
768 
769     if (digits[tmpa])
770       for (loga = 1; digits[UCHAR(*++a)]; ++loga)
771         continue;
772     else
773       loga = 0;
774 
775     if (digits[tmpb])
776       for (logb = 1; digits[UCHAR(*++b)]; ++logb)
777         continue;
778     else
779       logb = 0;
780 
781     if ((tmp = logb - loga) != 0)
782       return tmp;
783 
784     if (! loga)
785       return 0;
786 
787     return tmpb - tmpa;
788   } else if (tmpb == '-') {
789     if (digits[UCHAR(tmpa)] && digits[UCHAR(*++b)])
790       return 1;
791     return 0;
792   } else {
793     while (tmpa == '0')
794       tmpa = UCHAR(*++a);
795     while (tmpb == '0')
796       tmpb = UCHAR(*++b);
797 
798     while (tmpa == tmpb && digits[tmpa])
799       tmpa = UCHAR(*++a), tmpb = UCHAR(*++b);
800 
801     if ((tmpa == '.' && !digits[tmpb]) || (tmpb == '.' && !digits[tmpa]))
802       return SORTFracCompare(a, b);
803 
804     if (digits[tmpa])
805       for (loga = 1; digits[UCHAR(*++a)]; ++loga)
806         continue;
807     else
808       loga = 0;
809 
810     if (digits[tmpb])
811       for (logb = 1; digits[UCHAR(*++b)]; ++logb)
812         continue;
813     else
814       logb = 0;
815 
816     if ((tmp = loga - logb) != 0)
817       return tmp;
818 
819     if (! loga)
820       return 0;
821 
822     return tmpa - tmpb;
823   }
824 }
825 
826 /* Return an integer <= 12 associated with a month name (0 if the name
827    is not recognized. */
SORTGetMonth(UcharPtr s,Int4 len)828 static Int4 SORTGetMonth(UcharPtr s, Int4 len)
829 {
830   Char month[4];
831   register Int4 i, lo = 0, hi = 12;
832 
833   if (len < 3)
834     return 0;
835 
836   for (i = 0; i < 3; ++i)
837     month[i] = fold_tolower[UCHAR(s[i])];
838   month[3] = '\0';
839 
840   while (hi - lo > 1)
841     if (StringCmp(month, monthtab[(lo + hi) / 2].name) < 0)
842       hi = (lo + hi) / 2;
843     else
844       lo = (lo + hi) / 2;
845   if (!StringCmp(month, monthtab[lo].name))
846     return monthtab[lo].val;
847   return 0;
848 }
849 
850 /* Compare two lines trying every key in sequence until there
851    are no more keys or a difference is found. */
SORTKeyCompare(SORTLinePtr a,SORTLinePtr b,SORTDataPtr sdp)852 static Int4 SORTKeyCompare(SORTLinePtr a, SORTLinePtr b, SORTDataPtr sdp)
853 {
854   register UcharPtr texta, textb, lima, limb, translate;
855   register Int4Ptr ignore;
856   SORTKeyFieldPtr key;
857   Int4 diff = 0, iter = 0, lena, lenb;
858 
859   for (key = sdp->keyhead->next; key; key = key->next, ++iter) {
860     ignore = key->ignore;
861     translate = key->translate;
862 
863     /* Find the beginning and limit of each field. */
864     if (iter) {
865       if (key->eword >= 0)
866         lima = SORTLimField(a, key, sdp), limb = SORTLimField(b, key, sdp);
867       else
868         lima = a->text + a->length, limb = b->text + b->length;
869 
870       if (key->sword >= 0)
871         texta = SORTBegField(a, key, sdp), textb = SORTBegField(b, key, sdp);
872       else {
873         texta = a->text, textb = b->text;
874         if (key->skipsblanks) {
875           while (texta < lima && blanks[UCHAR(*texta)])
876             ++texta;
877           while (textb < limb && blanks[UCHAR(*textb)])
878             ++textb;
879         }
880       }
881     } else {
882       /* For the first iteration only, the key positions have
883          been precomputed for us. */
884       texta = a->keybeg, lima = a->keylim;
885       textb = b->keybeg, limb = b->keylim;
886     }
887 
888     /* Find the lengths. */
889     lena = lima - texta, lenb = limb - textb;
890     if (lena < 0)
891       lena = 0;
892     if (lenb < 0)
893       lenb = 0;
894 
895     /* Actually compare the fields. */
896     if (key->numeric) {
897       if (*lima || *limb) {
898         Uchar savea = *lima, saveb = *limb;
899 
900         *lima = *limb = '\0';
901         diff = SORTNumCompare(texta, textb);
902         *lima = savea, *limb = saveb;
903       } else
904         diff = SORTNumCompare(texta, textb);
905 
906       if (diff)
907         return key->reverse ? -diff : diff;
908       continue;
909     } else if (key->month) {
910       diff = SORTGetMonth(texta, lena) - SORTGetMonth(textb, lenb);
911       if (diff)
912         return key->reverse ? -diff : diff;
913       continue;
914     } else if (ignore && translate)
915       while (texta < lima && textb < limb) {
916         while (texta < lima && ignore[UCHAR(*texta)])
917           ++texta;
918         while (textb < limb && ignore[UCHAR(*textb)])
919           ++textb;
920         if (texta < lima && textb < limb &&
921             translate[UCHAR(*texta++)] != translate[UCHAR(*textb++)]) {
922           diff = translate[UCHAR(*--texta)] - translate[UCHAR(*--textb)];
923           break;
924         }
925       } else if (ignore)
926 	while (texta < lima && textb < limb) {
927           while (texta < lima && ignore[UCHAR(*texta)])
928             ++texta;
929           while (textb < limb && ignore[UCHAR(*textb)])
930             ++textb;
931           if (texta < lima && textb < limb && *texta++ != *textb++) {
932             diff = *--texta - *--textb;
933             break;
934           }
935         } else if (translate)
936           while (texta < lima && textb < limb) {
937 	    if (translate[UCHAR(*texta++)] != translate[UCHAR(*textb++)]) {
938               diff = translate[UCHAR(*--texta)] - translate[UCHAR(*--textb)];
939               break;
940             }
941 	  } else
942             diff = MemCmp(texta, textb, MIN(lena, lenb));
943 
944     if (diff)
945       return key->reverse ? -diff : diff;
946     if ((diff = lena - lenb) != 0)
947       return key->reverse ? -diff : diff;
948   }
949   return 0;
950 }
951 
952 /* Compare two lines, returning negative, zero, or positive depending on
953    whether A compares less than, equal to, or greater than B. */
SORTCompare(register SORTLinePtr a,register SORTLinePtr b,SORTDataPtr sdp)954 static Int4 SORTCompare(register SORTLinePtr a, register SORTLinePtr b,
955                     SORTDataPtr sdp)
956 {
957   Int4 diff, tmpa, tmpb, min;
958 
959   if (sdp->keyhead->next) {
960     if ((diff = SORTKeyCompare(a, b, sdp)) != 0)
961       return diff;
962     if (!sdp->unique) {
963       tmpa = a->length, tmpb = b->length;
964       diff = MemCmp(a->text, b->text, MIN(tmpa, tmpb));
965       if (! diff)
966         diff = tmpa - tmpb;
967     }
968   } else {
969     tmpa = a->length, tmpb = b->length;
970     min = MIN(tmpa, tmpb);
971     if (min == 0)
972       diff = tmpa - tmpb;
973     else {
974       UcharPtr ap = a->text;
975       UcharPtr bp = b->text;
976 
977       diff = *ap - *bp;
978       if (diff == 0) {
979         diff = MemCmp(ap, bp, min);
980         if (diff == 0)
981           diff = tmpa - tmpb;
982       }
983     }
984   }
985   return sdp->reverse ? -diff : diff;
986 }
987 
988 /* Check that the lines read from the given FP come in order.  Return
989    1 if they do and 0 if there is a disorder. */
990 
SORTCheckOrder(FILE * fp,SORTDataPtr sdp)991 static Int4 SORTCheckOrder(FILE *fp, SORTDataPtr sdp)
992 {
993   SORTBuffer  buf;              /* Input buffer. */
994   SORTLines lines;		/* Lines scanned from the buffer. */
995   SORTLine  temp;		/* Copy of previous line. */
996   Int4 cc;			/* Character count. */
997   Int4 cmp;			/* Result of calling compare. */
998   Int4 alloc, i, success = 1;
999 
1000   SORTInitBuf(&buf, sdp->mergealloc);
1001   SORTInitLines(&lines, sdp->mergealloc/sdp->linelength + 1);
1002   alloc = sdp->linelength;
1003   if((temp.text = (UcharPtr)MemNew(alloc)) == NULL)
1004     return -1;
1005 
1006   cc = SORTFillBuf(&buf, fp);
1007   SORTFindLines(&buf, &lines, sdp);
1008 
1009   if (cc)
1010     do {
1011       /* Compare each line in the buffer with its successor. */
1012       for (i = 0; i < lines.used - 1; ++i) {
1013         cmp = SORTCompare(&lines.lines[i], &lines.lines[i + 1], sdp);
1014         if ((sdp->unique && cmp >= 0) || cmp > 0) {
1015           success = 0;
1016           goto finish;
1017         }
1018       }
1019 
1020       /* Save the last line of the buffer and refill the buffer. */
1021       if (lines.lines[lines.used - 1].length > alloc) {
1022         while (lines.lines[lines.used - 1].length + 1 > alloc)
1023           alloc *= 2;
1024         if((temp.text = (UcharPtr) Realloc(temp.text, alloc)) == NULL)
1025           return -1;
1026       }
1027       MemCpy((CharPtr)temp.text, (CharPtr)lines.lines[lines.used - 1].text,
1028              lines.lines[lines.used - 1].length + 1);
1029       temp.length = lines.lines[lines.used - 1].length;
1030 
1031       cc = SORTFillBuf(&buf, fp);
1032       if (cc) {
1033         SORTFindLines(&buf, &lines, sdp);
1034         /* Make sure the line saved from the old buffer contents is
1035            less than or equal to the first line of the new buffer. */
1036         cmp = SORTCompare(&temp, &lines.lines[0], sdp);
1037         if ((sdp->unique && cmp >= 0) || cmp > 0) {
1038           success = 0;
1039           break;
1040         }
1041       }
1042     }
1043     while (cc);
1044 
1045  finish:
1046   FileClose(fp);
1047   MemFree(buf.buf);
1048   MemFree((CharPtr) lines.lines);
1049   MemFree(temp.text);
1050   return success;
1051 }
1052 
1053 /* Merge lines from FPS onto OFP.  NFPS cannot be greater than NMERGE.
1054    Close FPS before returning. */
1055 
SORTMergeFPS(FILE * fps[],register Int4 nfps,FILE * ofp,SORTDataPtr sdp)1056 static Int4 SORTMergeFPS(FILE *fps[], register Int4 nfps,
1057                          FILE *ofp, SORTDataPtr sdp)
1058 {
1059   SORTBuffer buffer[NMERGE]; /* Input buffers for each file. */
1060   SORTLines  lines[NMERGE];	/* Line tables for each buffer. */
1061   SORTLine   saved;		/* Saved line for unique check. */
1062   Int4 savedflag = 0;		/* True if there is a saved line. */
1063   Int4 savealloc = 0;		/* Size allocated for the saved line. */
1064   Int4 cur[NMERGE];		/* Current line in each line table. */
1065   Int4 ord[NMERGE];		/* Table representing a permutation of fps,
1066 				   such that lines[ord[0]].lines[cur[ord[0]]]
1067 				   is the smallest line and will be next
1068 				   output. */
1069   register Int4 i, j, t;
1070 
1071   /* Allocate space for a saved line if necessary. */
1072   if (sdp->unique) {
1073     if((saved.text = (UcharPtr)MemNew(savealloc = sdp->linelength)) == NULL)
1074       return -1;
1075   }
1076 
1077   /* Read initial lines from each input file. */
1078   for (i = 0; i < nfps; ++i) {
1079     SORTInitBuf(&buffer[i], sdp->mergealloc);
1080     /* If a file is empty, eliminate it from future consideration. */
1081     while (i < nfps && !SORTFillBuf(&buffer[i], fps[i])) {
1082       FileClose(fps[i]);
1083       --nfps;
1084       for (j = i; j < nfps; ++j)
1085         fps[j] = fps[j + 1];
1086     }
1087     if (i == nfps)
1088       MemFree(buffer[i].buf);
1089     else {
1090       SORTInitLines(&lines[i], sdp->mergealloc/sdp->linelength + 1);
1091       SORTFindLines(&buffer[i], &lines[i], sdp);
1092       cur[i] = 0;
1093     }
1094   }
1095 
1096   /* Set up the ord table according to comparisons among input lines.
1097      Since this only reorders two items if one is strictly greater than
1098      the other, it is stable. */
1099   for (i = 0; i < NMERGE; i++) {
1100     ord [i] = 0;
1101   }
1102   for (i = 0; i < nfps; ++i)
1103     ord[i] = i;
1104   for (i = 1; i < nfps; ++i)
1105     if (SORTCompare(&lines[ord[i - 1]].lines[cur[ord[i - 1]]],
1106                     &lines[ord[i]].lines[cur[ord[i]]], sdp) > 0)
1107       t = ord[i - 1], ord[i - 1] = ord[i], ord[i] = t, i = 0;
1108 
1109   /* Repeatedly output the smallest line until no input remains. */
1110   while (nfps) {
1111     /* If uniqified output is turned out, output only the last of
1112        an identical series of lines. */
1113     if (sdp->unique) {
1114       if (savedflag &&
1115           SORTCompare(&saved, &lines[ord[0]].lines[cur[ord[0]]], sdp)) {
1116         if(FileWrite(saved.text, 1, saved.length, ofp) != (Uint4) saved.length)
1117           return -1;
1118         putc('\n', ofp);
1119       }
1120       if (savealloc < lines[ord[0]].lines[cur[ord[0]]].length + 1) {
1121         while (savealloc < lines[ord[0]].lines[cur[ord[0]]].length + 1)
1122           savealloc *= 2;
1123         if((saved.text = (UcharPtr) Realloc(saved.text, savealloc)) == NULL)
1124           return -1;
1125       }
1126       saved.length = lines[ord[0]].lines[cur[ord[0]]].length;
1127       MemCpy((CharPtr)saved.text,
1128              (CharPtr)lines[ord[0]].lines[cur[ord[0]]].text,
1129              saved.length + 1);
1130       savedflag = 1;
1131     } else {
1132       if(FileWrite(lines[ord[0]].lines[cur[ord[0]]].text, 1,
1133                    lines[ord[0]].lines[cur[ord[0]]].length, ofp) !=
1134          (Uint4) lines[ord[0]].lines[cur[ord[0]]].length)
1135         return -1;
1136       putc('\n', ofp);
1137     }
1138 
1139     /* Check if we need to read more lines into core. */
1140     if (++cur[ord[0]] == lines[ord[0]].used) {
1141       /* missing '{' can lead to mistakes in later edits... */
1142       if (SORTFillBuf(&buffer[ord[0]], fps[ord[0]])) {
1143         SORTFindLines(&buffer[ord[0]], &lines[ord[0]], sdp);
1144         cur[ord[0]] = 0;
1145       } else {
1146         /* We reached EOF on fps[ord[0]]. */
1147         for (i = 1; i < nfps; ++i)
1148           if (ord[i] > ord[0])
1149             --ord[i];
1150         --nfps;
1151         FileClose(fps[ord[0]]);
1152         MemFree(buffer[ord[0]].buf);
1153         MemFree((CharPtr) lines[ord[0]].lines);
1154         for (i = ord[0]; i < nfps; ++i) {
1155           fps[i] = fps[i + 1];
1156           buffer[i] = buffer[i + 1];
1157           lines[i] = lines[i + 1];
1158           cur[i] = cur[i + 1];
1159         }
1160         for (i = 0; i < nfps; ++i)
1161           ord[i] = ord[i + 1];
1162         continue;
1163       }
1164     }
1165 
1166       /* The new line just read in may be larger than other lines
1167 	 already in core; push it back in the queue until we encounter
1168 	 a line larger than it. */
1169     for (i = 1; i < nfps; ++i) {
1170       t = SORTCompare(&lines[ord[0]].lines[cur[ord[0]]],
1171                       &lines[ord[i]].lines[cur[ord[i]]], sdp);
1172       if (! t)
1173         t = ord[0] - ord[i];
1174       if (t < 0)
1175         break;
1176     }
1177     t = ord[0];
1178     for (j = 1; j < i; ++j)
1179       ord[j - 1] = ord[j];
1180     ord[i - 1] = t;
1181   }
1182 
1183   if (sdp->unique && savedflag) {
1184     if(FileWrite(saved.text, 1, saved.length, ofp) != (Uint4) saved.length)
1185       return -1;
1186     putc('\n', ofp);
1187     MemFree(saved.text);
1188   }
1189   return 0;
1190 }
1191 
1192 /* Sort the array LINES using TEMP for temporary space. */
SORTArrayLines(SORTLinePtr lines,Int4 nlines,SORTLinePtr temp,SORTDataPtr sdp)1193 static void SORTArrayLines(SORTLinePtr lines,  Int4 nlines,
1194                            SORTLinePtr temp, SORTDataPtr sdp)
1195 {
1196   register SORTLinePtr lo, hi, t;
1197   register Int4 nlo, nhi;
1198 
1199   if (nlines == 2) {
1200     if (SORTCompare(&lines[0], &lines[1], sdp) > 0)
1201       *temp = lines[0], lines[0] = lines[1], lines[1] = *temp;
1202     return;
1203   }
1204 
1205   nlo = nlines / 2;
1206   lo = lines;
1207   nhi = nlines - nlo;
1208   hi = lines + nlo;
1209 
1210   if (nlo > 1)
1211     SORTArrayLines(lo, nlo, temp, sdp);
1212 
1213   if (nhi > 1)
1214     SORTArrayLines(hi, nhi, temp, sdp);
1215 
1216   t = temp;
1217 
1218   while (nlo && nhi)
1219     if (SORTCompare(lo, hi, sdp) <= 0)
1220       *t++ = *lo++, --nlo;
1221     else
1222       *t++ = *hi++, --nhi;
1223   while (nlo--)
1224     *t++ = *lo++;
1225 
1226   for (lo = lines, nlo = nlines - nhi, t = temp; nlo; --nlo)
1227     *lo++ = *t++;
1228 }
1229 
1230 /* Check that each of the given FILES is ordered.
1231    Return a count of disordered files. */
SORTCheckOrderS(CharPtr files[],Int4 nfiles,SORTObjectPtr sop)1232 Int4 SORTCheckOrderS(CharPtr files[], Int4 nfiles, SORTObjectPtr sop)
1233 {
1234   Int4 i, disorders = 0;
1235   FILE *fp;
1236   SORTDataPtr sdp;
1237 
1238   if((sdp = (SORTDataPtr) sop) == NULL)
1239     return -1;
1240 
1241   for (i = 0; i < nfiles; ++i) {
1242     if((fp = FileOpen(files[i], "r")) == NULL)
1243       return -1;
1244     if (!SORTCheckOrder(fp, sdp)) {
1245       ErrLogPrintf("sort: disorder on %s\n", files[i]);
1246       ++disorders;
1247     }
1248     FileClose(fp);
1249   }
1250   return disorders;
1251 }
1252 
1253 /* Insert a key at the end of the list. */
SORTInsertKey(SORTKeyFieldPtr key,SORTKeyFieldPtr keyhead)1254 SORTErrorCode SORTInsertKey(SORTKeyFieldPtr key, SORTKeyFieldPtr keyhead)
1255 {
1256   SORTKeyFieldPtr k = keyhead;
1257 
1258   while (k->next) {
1259     k = k->next;
1260   }
1261   k->next = key;
1262   key->next = NULL;
1263 
1264   return SORTNoError;
1265 }
1266