1 /*************************************************************************/
2 /* Copyright (c) 2004                                                    */
3 /* Daniel Sleator, David Temperley, and John Lafferty                    */
4 /* Copyright (c) 2009-2013 Linas Vepstas                                 */
5 /* All rights reserved                                                   */
6 /*                                                                       */
7 /* Use of the link grammar parsing system is subject to the terms of the */
8 /* license set forth in the LICENSE file included with this software.    */
9 /* This license allows free redistribution and use in source and binary  */
10 /* forms, with or without modification, subject to certain conditions.   */
11 /*                                                                       */
12 /*************************************************************************/
13 #ifndef _LINK_GRAMMAR_UTILITIES_H_
14 #define _LINK_GRAMMAR_UTILITIES_H_
15 
/* The _WIN32 definitions are for native-Windows compilers.  This includes
 * MSVC (only versions >=14 are supported) and MinGW (under MSYS or Cygwin).
 * The _WIN32 definitions are not for Cygwin, which doesn't define _WIN32. */
19 
20 #include <ctype.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <wchar.h>
25 #include <wctype.h>
26 #include <locale.h>
27 #ifdef HAVE_LOCALE_T_IN_XLOCALE_H
28 #include <xlocale.h>
29 #endif /* HAVE_LOCALE_T_IN_XLOCALE_H */
30 
31 #include "error.h"
32 #include "lg_assert.h"
33 
34 #ifdef HAVE_ALLOCA_H
35 # include <alloca.h>
36 #elif defined __GNUC__
37 #ifndef alloca
38 # define alloca __builtin_alloca
39 #endif /* !alloca */
40 #elif defined _AIX
41 # define alloca __alloca
42 #elif defined _MSC_VER
43 # include <malloc.h>
44 # define alloca _alloca
45 #else
46 # include <stddef.h>
47 # ifdef  __cplusplus
48 extern "C"
49 # endif
50 void *alloca (size_t);
51 #endif
52 
53 #ifndef TLS
54 #ifdef _MSC_VER
55 #define TLS __declspec(thread)
56 #else
57 #define TLS
58 #endif /* _MSC_VER */
59 #endif /* !TLS */
60 
61 #ifdef _MSC_VER
62 /* These definitions are incorrect, as these functions are different(!)
63  * (non-standard functionality).
64  * See http://stackoverflow.com/questions/27754492 . Fortunately,
65  * MSVC 14 supports C99 functions so these definitions are now unneeded.
66  * (LG library compilation is unsupported for previous MSVC versions.)
67  * (Left here for documentation.) */
68 #if 0
69 /* MS Visual C uses non-standard string function names */
70 #define snprintf _snprintf
71 #define vsnprintf _vsnprintf
72 #endif
73 
74 #define HAVE__ALIGNED_MALLOC 1
75 
76 /* Avoid plenty of: warning C4090: 'function': different 'const' qualifiers.
77  * This happens, for example, when the argument is "const void **". */
78 #define free(x) free((void *)x)
79 #define realloc(x, s) realloc((void *)x, s)
80 #define memcpy(x, y, s) memcpy((void *)x, (void *)y, s)
81 #define qsort(x, y, z, w) qsort((void *)x, y, z, w)
82 #endif /* _MSC_VER */
83 
84 #if defined(HAVE_LOCALE_T_IN_LOCALE_H) || defined(HAVE_LOCALE_T_IN_XLOCALE_H)
85 #define HAVE_LOCALE_T 1
86 #endif /* HAVE_LOCALE_T_IN_LOCALE_H || HAVE_LOCALE_T_IN_XLOCALE_H) */
87 
88 #if defined _MSC_VER || defined __cplusplus
89 /* "restrict" is not a part of ISO C++.
90  * C++ compilers usually know it as __restrict. */
91 #define restrict __restrict
92 #endif
93 
94 #ifdef _WIN32
95 #include <windows.h>
96 #include <mbctype.h>
97 
98 /* Compatibility definitions. */
99 #ifndef strncasecmp
100 #define strncasecmp(a,b,s) strnicmp((a),(b),(s))
101 #endif
102 #undef rand_r  /* Avoid (a bad) definition on MinGW */
103 int rand_r(unsigned int *);
104 #ifndef __MINGW32__
105 /* No strtok_s in XP/2003 and their strtok_r is incompatible.
106  * Hence HAVE_STRTOK_R will not be defined and our own one will be used. */
107 #if _WINVER != 0x501 /* XP */ && _WINVER != 0x502 /* Server 2003 */
108 #define strtok_r strtok_s
109 #define HAVE_STRTOK_R
110 #endif /* _WINVER != XP|2003 */
111 
112 /* There is no ssize_t definition in native Windows. */
113 #include <BaseTsd.h>
114 typedef SSIZE_T ssize_t;
115 
116 /* Native windows has locale_t, and hence HAVE_LOCALE_T is defined here.
117  * However, MinGW currently doesn't have locale_t. If/when it has locale_t,
118  * "configure" will define HAVE_LOCALE_T for it. */
119 #define HAVE_LOCALE_T
120 #endif
121 
122 #ifdef HAVE_LOCALE_T
123 #define locale_t _locale_t
124 #define iswupper_l  _iswupper_l
125 #define iswalpha_l  _iswalpha_l
126 #define iswdigit_l  _iswdigit_l
127 #define iswspace_l  _iswspace_l
128 #define towlower_l  _towlower_l
129 #define towupper_l  _towupper_l
130 #define strtod_l    _strtod_l
131 #define freelocale _free_locale
132 #endif /* HAVE_LOCALE_T */
133 
134 /* strndup() is missing in Windows. */
135 char * strndup (const char *str, size_t size);
136 
137 /* Users report that the default mbrtowc that comes with windows and/or
138  * cygwin just doesn't work very well. So we use our own custom version,
139  * instead.
140  */
141 #ifdef mbrtowc
142 #undef mbrtowc
143 #endif
144 size_t lg_mbrtowc(wchar_t *, const char *, size_t n, mbstate_t *ps);
145 #define mbrtowc(w,s,n,x) lg_mbrtowc(w,s,n,x)
146 #endif /* _WIN32 */
147 
/* MSVC isspace() asserts in debug mode, and MinGW sometimes returns true,
 * when passed UTF-8 bytes (OS X returns TRUE on char values 0x85 and 0xa0).
 * Since it is defined to return TRUE only on 6 characters, all of which
 * are in the range [0..127], just limit its arguments to 7 bits.
 *
 * The argument is evaluated more than once, so it must not have side
 * effects. It is parenthesized in the range checks so that an expression
 * such as "x & 0x7f" can be passed safely. */
#define lg_isspace(c) ((0 < (c)) && ((c) < 127) && isspace(c))
153 
154 void lg_strerror(int err_no, char *buf, size_t len);
155 
156 #if defined(__sun__)
157 int strncasecmp(const char *s1, const char *s2, size_t n);
158 /* This does not appear to be in string.h header file in sunos
159    (Or in linux when I compile with -ansi) */
160 #endif
161 
162 /* Cygwin < 2.6.0 doesn't have locale_t. */
163 #ifdef HAVE_LOCALE_T
164 locale_t newlocale_LC_CTYPE(const char *);
165 #else
166 typedef int locale_t;
167 #define iswupper_l(c, l) iswupper(c)
168 #define iswalpha_l(c, l) iswalpha(c)
169 #define iswdigit_l(c, l) iswdigit(c)
170 #define iswspace_l(c, l) iswspace(c)
171 #define towlower_l(c, l) towlower(c)
172 #define towupper_l(c, l) towupper(c)
173 #define freelocale(l)
174 #endif /* HAVE_LOCALE_T */
175 
176 #if HAVE__ALIGNED_MALLOC
177 #define aligned_alloc(alignment, size) _aligned_malloc (size, alignment)
178 #define aligned_free(p) _aligned_free(p)
179 #undef HAVE_POSIX_MEMALIGN
180 
181 #elif HAVE_ALIGNED_ALLOC
182 #define aligned_free(p) free(p)
183 #undef HAVE_POSIX_MEMALIGN
184 
185 #elif HAVE_POSIX_MEMALIGN
186 /* aligned_alloc() emulation will be defined in utilities.c. */
187 void *aligned_alloc(size_t alignment, size_t size);
188 #define aligned_free(p) free(p)
189 
190 #else
191 /* Fallback to just malloc(), as alignment is not critical here. */
192 #define NO_ALIGNED_MALLOC /* For generating a warning in utilities.c. */
193 #define aligned_alloc(alignment, size) malloc(size)
194 #define aligned_free(p) free(p)
195 #endif /* HAVE__ALIGNED_MALLOC */
196 
/* Round `size` up to the next multiple of `alignment`. The mask trick
 * used here is only correct when `alignment` is a power of two.
 * Both arguments are fully parenthesized so that expressions (e.g.
 * "1<<3") can be passed; `alignment` is evaluated twice. */
#define ALIGN(size, alignment) \
	(((size)+((alignment)-1))&~((alignment)-1))
198 
199 #define STR(x) #x
200 #define STRINGIFY(x) STR(x)
201 
202 #if !defined(MIN)
203 #define MIN(X,Y)  ( ((X) < (Y)) ? (X) : (Y))
204 #endif
205 #if !defined(MAX)
206 #define MAX(X,Y)  ( ((X) > (Y)) ? (X) : (Y))
207 #endif
208 
209 /* In the following, the arguments should not have side effects.
210  * FIXME: Detect in "configure" and check HAVE_* */
211 #ifndef strdupa
212 #define strdupa(s) strcpy(alloca(strlen(s)+1), s)
213 #endif
214 #ifndef strndupa
215 #define strndupa(s, n) _strndupa3(alloca((n)+1), s, n)
/* Helper of strndupa(): fill the caller-provided buffer `new_s` (which
 * must hold at least n+1 bytes) with at most `n` characters of `s`.
 * Whatever is left of the buffer, up to and including index `n`, is
 * zeroed, so the result is always NUL-terminated. Returns `new_s`. */
static inline char *_strndupa3(char *new_s, const char *s, size_t n)
{
	size_t pos = 0;

	/* Copy until `n` characters were taken or `s` ends. */
	for (; (pos < n) && ('\0' != s[pos]); pos++)
		new_s[pos] = s[pos];

	/* Zero-pad the remainder, terminator slot included. */
	for (; pos <= n; pos++)
		new_s[pos] = '\0';

	return new_s;
}
223 #endif
224 
225 /* From ccan array_size.h and build_assert.h, which are under a CC0 license */
226 #define BUILD_ASSERT_OR_ZERO(cond) (sizeof(char [1 - 2*!(cond)]) - 1)
227 #if !defined(ARRAY_SIZE)
228 /**
229  * ARRAY_SIZE: Get the number of elements in a visible array
230  * @param arr The array whose size you want.
231  *
232  * This does not work on pointers, or arrays declared as [], or
233  * function parameters.  With correct compiler support, such usage
234  * will cause a build error (see build_assert).
235  */
236 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + _array_size_chk(arr))
237 
238 #if HAVE_BUILTIN_TYPES_COMPATIBLE_P && HAVE_TYPEOF
239 /* Two gcc extensions.
240  * &a[0] degrades to a pointer: a different type from an array */
/* Evaluates to 0 when `arr` is a real array. When `arr` is a pointer its
 * type is compatible with that of &(arr)[0], the asserted condition is
 * false, and BUILD_ASSERT_OR_ZERO() breaks the build.
 * The definition spans several lines, so each line but the last must
 * end with a line continuation. */
#define _array_size_chk(arr) \
	BUILD_ASSERT_OR_ZERO(!__builtin_types_compatible_p(typeof(arr), \
	                                                   typeof(&(arr)[0])))
244 #else
245 #define _array_size_chk(arr) 0
246 #endif
247 #endif /* !defined(ARRAY_SIZE) */
248 
249 /* The GCC version we need must be >= 4.7, because it has to
250  * support C11. So it already supports all the features below. */
251 
252 /* Optimizations etc. that only gcc understands */
253 #if __GNUC__
254 #define GCC_DIAGNOSTIC
255 #define UNREACHABLE(x) (__extension__ ({if (x) __builtin_unreachable();}))
256 #define GNUC_MALLOC __attribute__ ((malloc))
257 #define GNUC_UNUSED __attribute__ ((unused))
258 #define NO_SAN __attribute__ ((no_sanitize_address, no_sanitize_undefined))
259 
260 /* Define when configuring with ASAN/UBSAN - for fast dict load (of course
261  * only when not debugging dict code.) */
262 #ifdef NO_SAN_DICT
263 #undef NO_SAN_DICT
264 #define NO_SAN_DICT NO_SAN
265 #else
266 #define NO_SAN_DICT
267 #endif
268 
269 #else
270 #define UNREACHABLE(x)
271 #define GNUC_MALLOC
272 #define GNUC_UNUSED
273 #define NO_SAN_DICT
274 #endif
275 
276 
277 /* Apply a pragma to a specific code section only.
278  * XXX According to the GCC docs, we cannot use here something like
279  * "#ifdef HAVE_x". Also -Wunknown-pragmas & -Wno-unknown-warning-option
280  * don't work in this situation. So "-Wmaybe-uninitialized", which
281  * is not recognized by clang, is defined separately. */
282 #ifdef GCC_DIAGNOSTIC
283 
284 #ifdef HAVE_MAYBE_UNINITIALIZED
285 #define PRAGMA_MAYBE_UNINITIALIZED \
286 	_Pragma("GCC diagnostic push") \
287 	_Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"")
288 #else
289 #define PRAGMA_MAYBE_UNINITIALIZED \
290 	_Pragma("GCC diagnostic push")
291 #endif /* HAVE_MAYBE_UNINITIALIZED */
292 
293 #define PRAGMA_START(x) \
294 	_Pragma("GCC diagnostic push") \
295 	_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \
296 	_Pragma(#x)
297 #define PRAGMA_END _Pragma("GCC diagnostic pop")
298 #else
299 #define PRAGMA_START(x)
300 #define PRAGMA_END
301 #define PRAGMA_MAYBE_UNINITIALIZED
302 #endif /* GCC_DIAGNOSTIC */
303 
/**
 * Count the characters of the null-terminated multi-byte string `s`.
 * This is needed when splitting words into morphemes.
 *
 * On Windows the count comes from MultiByteToWideChar() with CP_UTF8:
 * it is the number of wide characters needed for the string, minus one
 * for the terminating null. Elsewhere the count is whatever mbsrtowcs()
 * reports for a NULL destination, so the result (including any error
 * value) follows that function and the current locale.
 */
static inline size_t utf8_strlen(const char *s)
{
#ifdef _WIN32
	int nwide = MultiByteToWideChar(CP_UTF8, 0, s, -1, NULL, 0);

	return nwide - 1;
#else
	mbstate_t state;

	memset(&state, 0, sizeof(state));
	return mbsrtowcs(NULL, &s, 0, &state);
#endif /* _WIN32 */
}
319 
/**
 * Return the byte length of the UTF-8 character that starts at `xc`.
 * Only the first byte is examined; continuation bytes are not checked.
 *
 *   0  - the byte is the NUL terminator
 *   1  - ASCII (0x01 - 0x7f)
 *   2  - lead byte 0xc2 - 0xdf (code points U+0080 - U+07FF)
 *   3  - lead byte 0xe0 - 0xef (code points U+0800 - U+FFFF)
 *   4  - lead byte 0xf0 - 0xf4 (code points U+10000 - U+10FFFF)
 *  -1  - any other byte: not the first byte of a code point
 */
static inline int utf8_charlen(const char *xc)
{
	const unsigned char lead = (unsigned char) xc[0];

	/* Walk the ranges from the top down; each test only needs a lower
	 * bound because the higher ranges have already returned. */
	if (lead >= 0xf5) return -1;
	if (lead >= 0xf0) return 4;
	if (lead >= 0xe0) return 3;
	if (lead >= 0xc2) return 2;
	if (lead >= 0x80) return -1;

	return (0 != lead) ? 1 : 0;
}
338 
/**
 * Copy at most `n` utf8 characters from `src` to `dest`.
 * Copying stops early if the end of `src` is reached.
 * The `dest` must have enough room to hold the copy; no terminating
 * NUL is written to it.
 *
 * Return the number of bytes copied. Return 0 if a byte that cannot
 * start a character is met, or if a multi-byte character is cut short
 * by the end of the string (the characters copied before that point
 * remain in `dest`).
 */
static inline size_t utf8_strncpy(char *dest, const char *src, size_t n)
{
	size_t nbytes = 0;

	while (0 < n)
	{
		int i;
		int clen = utf8_charlen(src);

		if (0 > clen) return 0; /* XXX Maybe print error. */
		if (0 == clen) break;   /* End of the source string. */

		/* The length is derived from the lead byte only. Make sure the
		 * string really has that many bytes, or else the copy below
		 * would step over the terminating NUL and read past the end. */
		for (i = 1; i < clen; i++)
			if ('\0' == src[i]) return 0;

		for (i = 0; i < clen; i++)
			dest[i] = src[i];

		dest += clen;
		src += clen;
		nbytes += (size_t)clen;
		n--;
	}

	return nbytes;
}
359 
/**
 * Decode the first multi-byte character of `s` with mbrtowc().
 * Return its length in bytes if iswupper_l() classifies it as
 * upper-case for `dict_locale`. Return 0 otherwise, including when the
 * byte sequence cannot be decoded.
 */
static inline int is_utf8_upper(const char *s, locale_t dict_locale)
{
	wchar_t wc;
	mbstate_t state;
	int len;

	memset(&state, 0, sizeof(state));
	len = mbrtowc(&wc, s, MB_CUR_MAX, &state);

	/* A negative length means an invalid mb sequence; in that case
	 * `wc` is not valid and must not be classified. */
	if ((len < 0) || !iswupper_l(wc, dict_locale)) return 0;
	return len;
}
372 
/**
 * Decode the first multi-byte character of `s` with mbrtowc().
 * Return its length in bytes if iswalpha_l() classifies it as
 * alphabetic for `dict_locale`. Return 0 otherwise, including when the
 * byte sequence cannot be decoded.
 */
static inline int is_utf8_alpha(const char *s, locale_t dict_locale)
{
	wchar_t wc;
	mbstate_t state;
	int len;

	memset(&state, 0, sizeof(state));
	len = mbrtowc(&wc, s, MB_CUR_MAX, &state);

	/* A negative length means an invalid mb sequence; in that case
	 * `wc` is not valid and must not be classified. */
	if ((len < 0) || !iswalpha_l(wc, dict_locale)) return 0;
	return len;
}
385 
/**
 * Decode the first multi-byte character of `s` with mbrtowc().
 * Return its length in bytes if iswdigit_l() classifies it as a digit
 * for `dict_locale`. Return 0 otherwise, including when the byte
 * sequence cannot be decoded.
 */
static inline int is_utf8_digit(const char *s, locale_t dict_locale)
{
	wchar_t wc;
	mbstate_t state;
	int len;

	memset(&state, 0, sizeof(state));
	len = mbrtowc(&wc, s, MB_CUR_MAX, &state);

	/* A negative length means an invalid mb sequence; in that case
	 * `wc` is not valid and must not be classified. */
	if ((len < 0) || !iswdigit_l(wc, dict_locale)) return 0;
	return len;
}
398 
/**
 * Decode the first multi-byte character of `s` with mbrtowc().
 * Return its length in bytes if it is white-space: either iswspace_l()
 * says so for `dict_locale`, or it is U+00A0 NO-BREAK SPACE.
 * Return 0 otherwise, including when the byte sequence cannot be
 * decoded.
 */
static inline int is_utf8_space(const char *s, locale_t dict_locale)
{
	wchar_t wc;
	mbstate_t state;
	int len;

	memset(&state, 0, sizeof(state));
	len = mbrtowc(&wc, s, MB_CUR_MAX, &state);

	if (len < 0) return 0;  /* invalid mb sequence */
	if (iswspace_l(wc, dict_locale)) return len;

	/* Everything below concerns a two-byte character only. */
	if (2 != len) return 0;

	/* U+00A0 NO-BREAK SPACE is encoded as c2 a0. For some reason,
	 * iswspace doesn't get this, so look both at the raw bytes and at
	 * the decoded value. */
	if ((0xc2 == (0xff & s[0])) && (0xa0 == (0xff & s[1]))) return 2;
	if (0xa0 == wc) return 2;

	return 0;
}
416 
417 #if 0 /* Not in use. */
418 static inline const char * skip_utf8_upper(const char * s, locale_t dict_locale)
419 {
420 	int nb = is_utf8_upper(s, dict_locale);
421 	while (nb)
422 	{
423 		s += nb;
424 		nb = is_utf8_upper(s, dict_locale);
425 	}
426 	return s;
427 }
428 
429 /**
430  * Return true if the initial upper-case letters of the
431  * two input strings match. Comparison stops when
432  * both strings descend to lowercase.
433  */
434 static inline bool utf8_upper_match(const char * s, const char * t,
435                                     locale_t dict_locale)
436 {
437 	mbstate_t mbs, mbt;
438 	wchar_t ws, wt;
439 	int ns, nt;
440 
441 	memset(&mbs, 0, sizeof(mbs));
442 	memset(&mbt, 0, sizeof(mbt));
443 
444 	ns = mbrtowc(&ws, s, MB_CUR_MAX, &mbs);
445 	nt = mbrtowc(&wt, t, MB_CUR_MAX, &mbt);
446 	if (ns < 0 || nt < 0) return false;  /* invalid mb sequence */
447 	while (iswupper_l(ws, dict_locale) || iswupper_l(wt, dict_locale))
448 	{
449 		if (ws != wt) return false;
450 		s += ns;
451 		t += nt;
452 		ns = mbrtowc(&ws, s, MB_CUR_MAX, &mbs);
453 		nt = mbrtowc(&wt, t, MB_CUR_MAX, &mbt);
454 		if (ns < 0 || nt < 0) return false;  /* invalid mb sequence */
455 	}
456 	return true;
457 }
458 #endif /* Not in use. */
459 
460 void downcase_utf8_str(char *to, const char * from, size_t usize, locale_t);
461 #if 0
462 void upcase_utf8_str(char *to, const char * from, size_t usize, locale_t);
463 #endif
464 
465 size_t lg_strlcpy(char * restrict dst, const char * restrict src, size_t dsize);
466 void safe_strcat(char *u, const char *v, size_t usize);
467 char *safe_strdup(const char *u);
468 
469 /* Simple, cheap, easy dynamic string. */
470 typedef struct
471 {
472 	char *str;
473 	size_t end;
474 	size_t len;
475 } dyn_str;
476 
477 dyn_str* dyn_str_new(void);
478 void dyn_str_delete(dyn_str*);
479 void dyn_strcat(dyn_str*, const char*);
480 void dyn_trimback(dyn_str*);
481 char * dyn_str_take(dyn_str*);
482 const char * dyn_str_value(dyn_str*);
483 size_t dyn_strlen(dyn_str*);
484 
485 size_t altlen(const char **);
486 
487 /* routines for allocating basic objects */
488 void init_memusage(void);
489 void * xalloc(size_t) GNUC_MALLOC;
490 void * exalloc(size_t) GNUC_MALLOC;
491 
492 /* Tracking the space usage can help with debugging */
493 #ifdef TRACK_SPACE_USAGE
494 void xfree(void *, size_t);
495 void exfree(void *, size_t);
496 #else /* TRACK_SPACE_USAGE */
/* Without space tracking, xfree() and exfree() just release `p` with
 * free(). The size argument is accepted only so that the signatures
 * match the TRACK_SPACE_USAGE prototypes; it is not used. */
static inline void xfree(void *p, size_t sz)
{
	(void)sz;
	free(p);
}

static inline void exfree(void *p, size_t sz)
{
	(void)sz;
	free(p);
}
499 #endif /* TRACK_SPACE_USAGE */
500 
501 size_t get_space_in_use(void);
502 size_t get_max_space_used(void);
503 
504 
505 char * get_default_locale(void);
506 void set_utf8_program_locale(void);
507 bool try_locale(const char *);
508 bool strtodC(const char *, float *);
509 
/**
 * Returns the smallest power of two that is at least i and at least 1.
 *
 * If i is greater than the highest power of two that fits in a size_t,
 * no such value is representable and 0 is returned.
 */
static inline size_t next_power_of_two_up(size_t i)
{
	size_t j = 1;

	/* The shift wraps j to 0 once the highest bit is shifted out. Stop
	 * there, or else "j < i" would remain true forever. */
	while ((0 != j) && (j < i)) j <<= 1;

	return j;
}
519 
520 #endif /* _LINK_GRAMMAR_UTILITIES_H_ */
521