1 /*
2     Numdiff - compare putatively similar files,
3     ignoring small numeric differences
4     Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  Ivano Primi  <ivprimi@libero.it>
5 
6     This program is free software: you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation, either version 3 of the License, or
9     (at your option) any later version.
10 
11     This program is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15 
16     You should have received a copy of the GNU General Public License
17     along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #include<stdio.h>
21 #include<stdlib.h>
22 #include<string.h>
23 #include"linesplit.h"
24 
/* Separator handed to print_string_vector() by the test driver. */
#define O_DEF_SEP '\n'
/* Character that introduces an escape sequence in process_character(). */
#define ESC_CHAR  '\\'

/*
  Sentinel returned by is_hex_digit() and is_oct_digit() when the
  character examined is not a valid digit. Its value is the largest
  one representable by an unsigned char.
*/
const unsigned char InvDigit = (unsigned char) -1;
29 
30 static
is_hex_digit(char ch)31 unsigned char is_hex_digit (char ch)
32 {
33   if (ch >= '0' && ch <= '9')
34     return ch - '0';
35   else
36     {
37       switch (ch)
38 	{
39 	case 'a':
40 	case 'A':
41 	  return 10;
42 	case 'b':
43 	case 'B':
44 	  return 11;
45 	case 'c':
46 	case 'C':
47 	  return 12;
48 	case 'd':
49 	case 'D':
50 	  return 13;
51 	case 'e':
52 	case 'E':
53 	  return 14;
54 	case 'f':
55 	case 'F':
56 	  return 15;
57 	default:
58 	  return InvDigit;
59 	}
60     }
61 }
62 
63 static
is_oct_digit(char ch)64 unsigned char is_oct_digit (char ch)
65 {
66   if (ch >= '0' && ch <= '7')
67     return ch - '0';
68   else
69     return InvDigit;
70 }
71 
72 static
is_hex(const char * pafter_escape,char * byte)73 int is_hex (const char* pafter_escape, char* byte)
74 {
75   unsigned char hb, lb;
76 
77   /*
78     The return value is the number of hexadecimal digits successfully read
79   */
80   if ( (hb = is_hex_digit (*pafter_escape)) != InvDigit )
81     {
82       int rv = 1;
83 
84       lb = is_hex_digit (*(pafter_escape+1));
85       if (lb == InvDigit)
86 	{
87 	  lb = hb;
88 	  hb = 0;
89 	}
90       else
91 	rv = 2;
92 
93       if ((byte))
94 	*byte = (char)(16 * hb + lb);
95       return rv;
96     }
97   else
98     {
99       if ((byte))
100 	*byte = -1;
101       return 0;
102     }
103 }
104 
105 static
is_oct(const char * pafter_escape,char * byte)106 int is_oct (const char* pafter_escape, char* byte)
107 {
108   unsigned char ho, mo, lo;
109 
110   /*
111     The return value is the number of octal digits successfully read
112   */
113   if ( (ho = is_oct_digit (*pafter_escape)) != InvDigit )
114     {
115       int rv = 1;
116 
117       mo = is_oct_digit (*(pafter_escape+1));
118       lo = is_oct_digit (*(pafter_escape+2));
119       if (mo == InvDigit)
120 	{
121 	  lo = ho;
122 	  ho = 0;
123 	  mo = 0;
124 	}
125       else
126 	{
127 	  rv++;
128 	  if (lo == InvDigit)
129 	    {
130 	      lo = mo;
131 	      mo = ho;
132 	      ho = 0;
133 	    }
134 	  else
135 	    rv++;
136 	}
137       if ((byte))
138 	*byte = (char)(64 * ho + 8 * mo + lo);
139       return (64 * (int)ho + 8 * (int)mo + (int)lo < 256 ? rv : 0);
140     }
141   else
142     {
143       if ((byte))
144 	*byte = -1;
145       return 0;
146     }
147 }
148 
149 static
process_character(const char * chp,const char ** new_chp)150 char process_character (const char* chp, const char** new_chp)
151 {
152   const char* nchp;
153   char byte;
154   int r;
155 
156   if (*chp == ESC_CHAR)
157     {
158       switch (*(chp+1))
159 	{
160 	case 'a':
161 	  byte = '\a';
162 	  nchp = chp+2;
163 	  break;
164 	case 'b':
165 	  byte = '\b';
166 	  nchp = chp+2;
167 	  break;
168 	case 'f':
169 	  byte = '\f';
170 	  nchp = chp+2;
171 	  break;
172 	case 'n':
173 	  byte = '\n';
174 	  nchp = chp+2;
175 	  break;
176 	case 'r':
177 	  byte = '\r';
178 	  nchp = chp+2;
179 	  break;
180 	case 't':
181 	  byte = '\t';
182 	  nchp = chp+2;
183 	  break;
184 	case 'v':
185 	  byte = '\v';
186 	  nchp = chp+2;
187 	  break;
188 	case 's':
189 	  byte = ' ';
190 	  nchp = chp+2;
191 	  break;
192 	case 'x':
193 	  if ( (r = is_hex (chp+2, &byte)) && byte != '\0' )
194 	    nchp = chp + (r + 2);
195 	  else
196 	    {
197 	      byte = *(chp+1);
198 	      nchp = chp+2;
199 	    }
200 	  break;
201 	case '0':
202 	case '1':
203 	case '2':
204 	case '3':
205 	case '4':
206 	case '5':
207 	case '6':
208 	case '7':
209 	  if ( (r = is_oct (chp+1, &byte)) && byte != '\0' )
210 	    nchp = chp + (r + 1);
211 	  else
212 	    {
213 	      byte = *(chp+1);
214 	      nchp = chp+2;
215 	    }
216 	  break;
217 	default:
218 	  byte = *(chp+1);
219 	  nchp = chp+2;
220 	  break;
221 	}
222     }
223   else
224     {
225       byte = *chp;
226       nchp = chp+1;
227     }
228   if ((new_chp))
229     *new_chp = nchp;
230   return byte;
231 }
232 
/*
  Decode the escape sequences of the substring [BPTR, EPTR) and
  return the result as a newly allocated, nul-terminated string.
  Return NULL if the memory for the result cannot be allocated.

  Precondition is EPTR >= BPTR.

  Rem.: the result is never longer than the substring, since every
        decoded character consumes at least one character of it.
        The buffer is zero-filled on allocation, which provides
        the terminator.
*/
static
char* process_substring (const char* bptr, const char* eptr)
{
  const char* src = bptr;
  char* result = (char*) calloc (eptr - bptr + 1, sizeof(char));
  char* dst = result;

  if (result == NULL)
    return NULL;

  while (src < eptr)
    {
      /* process_character() moves SRC past what it has consumed */
      *dst = process_character (src, &src);
      dst++;
    }
  return result;
}
256 
/*
  Create and return a vector of strings using the description
  contained in the string pointed to by STR.
  The items in STR (to each of which corresponds
  a string in the returned vector) are separated by the
  character SEPARATOR. Leading separators are skipped and a run
  of consecutive separators counts as a single one.
  Every item is processed by process_substring() before being stored.

  The returned vector is always NULL-terminated. If STR is empty or
  contains only separators, the vector contains just the terminating
  NULL. The vector should be released by delete_string_vector().

  In case of error while allocating memory for the vector
  and its elements return NULL.
  Return NULL also if STR == NULL.

  Remark: SEPARATOR cannot be the nul character.
          Return NULL if SEPARATOR is the nul character.
*/
char** ssplit (const char* str, char separator)
{
  size_t i, n;
  const char *beg, *ptr, *ptr2sep;
  char** sv;

  if (!str || separator == '\0')
    return NULL;

  /*
     Let BEG point to the first character of the buffer
     pointed to by STR which is not equal to SEPARATOR.
  */
  beg = str;
  while (*beg == separator)
    beg++;

  /*
     First count the substrings contained in the buffer pointed to
     by STR. N is an upper bound: it is one more than the actual number
     of items if STR ends with a separator or contains no item at all.
  */
  n = 1;
  ptr = beg;
  while ((ptr2sep = strchr (ptr, separator)) != NULL)
    {
      n++;
      for (ptr = ptr2sep + 1; *ptr == separator; ptr++);
    }

  /*
    Now allocate memory for a vector of N+1 char*.
    All the entries are cleared, so that the vector is properly
    terminated even when less than N items are stored in it.
    If the allocation fails, return NULL.
  */
  sv = (char**) calloc (n + 1, sizeof(char*));
  if (!sv)
    return NULL;

  i = 0;
  ptr = beg;
  while ((ptr2sep = strchr (ptr, separator)) != NULL)
    {
      sv[i] = process_substring (ptr, ptr2sep);
      if (!sv[i])
        {
          delete_string_vector (sv);
          return NULL;
        }
      i++;
      for (ptr = ptr2sep + 1; *ptr == separator; ptr++);
    }

  /* Last item, not followed by any separator */
  if (*ptr != '\0')
    {
      sv[i] = process_substring (ptr, ptr + strlen (ptr));
      if (!sv[i])
        {
          delete_string_vector (sv);
          return NULL;
        }
    }
  return sv;
}
324 
/*
  Create and return a vector of strings using the characters
  contained in the string pointed to by STR. To each
  (possibly escaped) character in this string will
  correspond exactly one string of length 1 in the returned vector.
  The vector is NULL-terminated and should be released
  by delete_string_vector().
  Return NULL if STR == NULL or in case of out of memory.

  Rem.: the scan never goes beyond the terminator of STR, even if
        process_character() reports a position past it.
        A nul byte returned by process_character() is discarded
        rather than stored, since string_spn() and string_cspn()
        do not accept a vector containing the empty string.
*/
char** ssplit_former_way (const char* str)
{
  size_t n, ls;
  char **sv;
  const char *ptr, *nptr, *end;
  char ch;

  if (!str)
    return NULL;

  ls = strlen (str);
  end = str + ls;
  /*
    Every stored string consumes at least one character of STR,
    so LS entries plus the terminating NULL are always sufficient.
  */
  sv = (char**) calloc (ls + 1, sizeof(char*));
  if (!sv)
    return NULL;

  n = 0;
  ptr = str;
  while (ptr < end)
    {
      nptr = ptr;
      ch = process_character (ptr, &nptr);
      ptr = nptr;
      if (ch == '\0')
        continue;

      sv[n] = (char*) malloc (2 * sizeof(char));
      if (!sv[n])
        {
          delete_string_vector (sv);
          return NULL;
        }
      sv[n][0] = ch;
      sv[n][1] = '\0';
      n++;
    }
  return sv;
}
362 
/*
  Process the string pointed to by ISTR (decoding its escape
  sequences by means of process_substring()) and return the result
  as a newly allocated string.
  Return NULL in case of error while allocating memory for the result.
  Return NULL also if ISTR == NULL.
*/
char* get_separating_string (const char* istr)
{
  if (!istr)
    return NULL;
  return process_substring (istr, istr + strlen (istr));
}
371 
/*
  Write to the file pointed to by FP the strings contained in
  the vector SV, each one enclosed in double quotes and followed
  by the character SEPARATOR.
  If SV is NULL, write "<Empty>" followed by SEPARATOR.
*/
void print_string_vector (FILE* fp, const char** sv, char separator)
{
  const char** item;

  if (sv == NULL)
    {
      fputs ("<Empty>", fp);
      fputc (separator, fp);
      return;
    }

  for (item = sv; *item != NULL; item++)
    fprintf (fp, "\"%s\"%c", *item, separator);
}
394 
/*
  Rearrange the strings of the vector SV in descending order
  with respect to their length. Nothing is done if SV is NULL.

  Rem.: Pre-condition is that SV is NULL-terminated.
        This function is suitable only for small vectors, since it
        uses a selection sort: at every step the longest among the
        remaining strings is swapped into the current position.
        Among strings of equal length the first one met is selected.
*/
void sort_string_vector (char** sv)
{
  char **slot, **scan, **longest;
  char *held;
  size_t len, longest_len;

  if (!sv)
    return;

  for (slot = sv; *slot != NULL; slot++)
    {
      longest = slot;
      longest_len = strlen (*slot);
      for (scan = slot + 1; *scan != NULL; scan++)
        {
          len = strlen (*scan);
          if (len > longest_len)
            {
              longest = scan;
              longest_len = len;
            }
        }
      held = *slot;
      *slot = *longest;
      *longest = held;
    }
}
428 
/*
  Remove duplicates from the NULL-terminated vector SV.
  Of each group of equal strings only the first one is kept,
  the other ones are freed. The relative order of the strings
  which are kept does not change, and every entry which is no longer
  in use is set to NULL. Nothing is done if SV is NULL.
*/
void remove_duplicates_from_string_vector (char** sv)
{
  size_t ref, rd, wr;

  if (!sv)
    return;

  for (ref = 0; sv[ref] != NULL; ref++)
    {
      /*
        Compact the tail of the vector in place: RD scans the entries
        following REF, WR is the position where the next entry
        different from sv[REF] has to be stored (WR <= RD always).
      */
      wr = ref + 1;
      for (rd = ref + 1; sv[rd] != NULL; rd++)
        {
          if (strcmp (sv[rd], sv[ref]) == 0)
            free ((void*) sv[rd]);
          else
            sv[wr++] = sv[rd];
        }
      /* Clear the entries left unused between the new and the old end */
      while (wr < rd)
        sv[wr++] = NULL;
    }
}
456 
/*
  Return 1 if a string equal to the one pointed to by STR is found
  in the NULL-terminated vector SV, otherwise 0.
  0 is returned also if STR or SV is NULL.
*/
int is_string_in_vector (const char* str, const char** sv)
{
  const char** item;

  if (!sv || !str)
    return 0;

  for (item = sv; *item != NULL; item++)
    {
      if (strcmp (str, *item) == 0)
        return 1;
    }
  return 0;
}
473 
/*
  Look for the character CH in the strings of the NULL-terminated
  vector SV. Return the length of the longest string containing CH,
  or 0 if no string of SV contains CH.
  0 is returned also if SV is NULL.

  Rem.: the search is done by strchr(), which regards the
        terminating nul character as part of the string.
*/
size_t is_char_in_vector (int ch, const char** sv)
{
  const char** item;
  size_t len, longest = 0;

  if (!sv)
    return 0;

  for (item = sv; *item != NULL; item++)
    {
      if (strchr (*item, ch) == NULL)
        continue;
      len = strlen (*item);
      if (len > longest)
        longest = len;
    }
  return longest;
}
496 
/*
  Free the memory allocated for each string of the NULL-terminated
  vector SV and then the memory allocated for the vector itself.
  Nothing is done if SV is NULL.
*/
void delete_string_vector (char** sv)
{
  char** item;

  if (sv == NULL)
    return;

  for (item = sv; *item != NULL; item++)
    free ((void*) *item);
  free ((void*) sv);
}
514 
/*
  Return 1 if at least one string of the NULL-terminated vector SV
  is a prefix of the text starting at POS, otherwise 0.
*/
static
int cspn_hit_at (const char* pos, const char** sv)
{
  size_t k;

  for (k = 0; sv[k] != NULL; k++)
    {
      const char* pat = sv[k];
      const char* txt = pos;

      while (*pat != '\0' && *pat == *txt)
        {
          pat++;
          txt++;
        }
      if (*pat == '\0')
        return 1;
    }
  return 0;
}

/*
  Return a pointer to the position following the initial
  segment of STR that does not contain any string
  from the vector SV, i.e. a pointer to the first occurrence in STR
  of a string from SV, or to the end of STR if there is none.
  If such an initial segment does not exist, return a pointer to STR.
  Consider the string STR as ending at the first occurrence of EOS.
  Return NULL if STR or SV is NULL.

  SV must be NULL terminated, it cannot contain the empty ("") string
  nor a string of length > 1 with EOS being one of its non-null characters
  (but SV may well contain the string of length 1 having EOS as its
  only non-null character).

  Rem.: EOS can be the null character.
        If the string pointed to by STR does not contain any EOS
        character, a buffer overrun will occur.
*/
char* string_cspn (const char* str, const char** sv, int eos)
{
  const char *limit, *cur;

  if (!str || !sv)
    {
      /* security check */
      return NULL;
    }

  /* Locate the end of STR, i.e. the first occurrence of EOS */
  limit = str;
  while (*limit != eos)
    limit++;

  /* Stop at the first position where a string of SV begins */
  cur = str;
  while (cur < limit && !cspn_hit_at (cur, sv))
    cur++;
  return (char*) cur;
}
561 
/*
  If a string of the NULL-terminated vector SV is a prefix of the text
  starting at POS, return the position following that prefix (the first
  matching string of SV is the one which is used).
  Return NULL if no string of SV matches.
*/
static
const char* spn_skip_one (const char* pos, const char** sv)
{
  size_t k;

  for (k = 0; sv[k] != NULL; k++)
    {
      const char* pat = sv[k];
      const char* txt = pos;

      while (*pat != '\0' && *pat == *txt)
        {
          pat++;
          txt++;
        }
      if (*pat == '\0')
        return txt;
    }
  return NULL;
}

/*
  Return a pointer to the position following the initial
  segment of STR that consists entirely of strings
  from the vector SV. If such an initial segment does not
  exist, return a pointer to STR.
  Consider the string STR as ending at the first occurrence of EOS.
  Return NULL if STR or SV is NULL.

  SV must be NULL terminated, it cannot contain the empty ("") string
  nor a string of length > 1 with EOS being one of its non-null characters
  (but SV may well contain the string of length 1 having EOS as its
  only non-null character).

  Rem.: this function works under the assumption that the strings in
        the vector SV are ordered according to their lengths, where SV[0]
        is the string with the greatest length.

        EOS can be the null character.
*/
char* string_spn (const char* str, const char** sv, int eos)
{
  const char *cur, *next;

  if (!str || !sv)
    {
      /* security check */
      return NULL;
    }

  cur = str;
  /*
    Rem.: a match is attempted only at positions which do not hold EOS,
          so the string "<EOS>" (i.e. the string having EOS as its only
          non-null character) can never be matched. Moreover, if the
          matching string does not contain any EOS, then by moving
          CUR past it we do not skip any EOS.
  */
  while (*cur != eos)
    {
      next = spn_skip_one (cur, sv);
      if (next == NULL)
        break;
      cur = next;
    }
  return (char*) cur;
}
626 
627 #ifdef _TEST_LINE_SPLIT_
628 
/* Separator used by the test driver when it calls ssplit() on its argument. */
#define I_DEF_SEP ' '
630 
/*
  Write to the standard output a short usage message
  for the program called PROGNAME.
*/
static
void print_help (const char* progname)
{
  fprintf (stdout, "Usage: %s STRING\n\n", progname);
}
636 
/*
  Write to the file pointed to by FP the characters of the
  substring [BPTR, EPTR), followed by a newline if NL is not zero.
  Nothing at all is written (not even the newline)
  if the substring is empty, i.e. if EPTR <= BPTR.
*/
static
void print_substring (FILE* fp, const char* bptr, const char* eptr, int nl)
{
  const char* cur;

  if (eptr <= bptr)
    return;

  cur = bptr;
  while (cur != eptr)
    {
      fputc (*cur, fp);
      cur++;
    }
  if (nl != 0)
    fputc ('\n', fp);
}
652 
653 #define BUFFSIZE 1024
654 
main(int argc,char * argv[])655 int main (int argc, char* argv[])
656 {
657   if (argc != 2)
658     {
659       print_help(argv[0]);
660       return 1;
661     }
662   else
663     {
664       char** string_vector = NULL;
665       char** sv = NULL;
666       char linebuff[BUFFSIZE] = "";
667       char *rv, *ptr, *endptr;
668       size_t l;
669 
670       string_vector = ssplit (argv[1], I_DEF_SEP);
671       sv = ssplit_former_way (argv[1]);
672       remove_duplicates_from_string_vector (string_vector);
673       remove_duplicates_from_string_vector (sv);
674       sort_string_vector (string_vector);
675       print_string_vector (stdout, (const char**)string_vector, O_DEF_SEP);
676       l = is_char_in_vector (':', string_vector);
677       printf ("Length of the longest string containing \':\' = %zu\n", l);
678       puts ("\n\nSplitting the string in the former way produces the following result:");
679       print_string_vector (stdout, (const char**)sv, O_DEF_SEP);
680       do
681 	{
682 #ifdef _MINOR_TEST_
683 	  puts ("\nEnter a line of text (Ctrl+D to terminate)");
684 #endif
685 	  rv = fgets (linebuff, BUFFSIZE, stdin);
686 	  if ((rv))
687 	    {
688 #ifdef _MINOR_TEST_
689 	      ptr = string_cspn (linebuff, (const char**)string_vector, '\0');
690 	      fputs ("Cspn =", stdout);
691 	      print_substring (stdout, linebuff, ptr, 0);
692 	      fputs ("|EoS|\n", stdout);
693 
694 	      ptr = string_spn (linebuff, (const char**)string_vector, '\0');
695 	      fputs ("Spn  =", stdout);
696 	      print_substring (stdout, linebuff, ptr, 0);
697 	      fputs ("|EoS|\n", stdout);
698 #else
699 	      unsigned long fieldno;
700 
701 
702 	      for (fieldno = 1, ptr = linebuff; *ptr != '\0'; fieldno++)
703 		{
704 		  ptr = string_spn (ptr, (const char**)string_vector, '\0');
705 		  endptr = string_cspn (ptr, (const char**)string_vector, '\0');
706 		  if ((*ptr))
707 		    {
708 		      printf ("%3lu.>", fieldno);
709 		      print_substring (stdout, ptr, endptr, 0);
710 		      puts ("<");
711 		    }
712 		  ptr = endptr;
713 		}
714 	      putchar ('\n');
715 #endif /* _MINOR_TEST_ */
716 	    }
717 	} while ((rv));
718       delete_string_vector (string_vector);
719       return 0;
720     }
721 }
722 
723 #endif /* _TEST_LINE_SPLIT_ */
724