1 /*
2  * utls.c - Utility functions.
3  *
4  * Copyright (C) 2009-2011 by Hatuka*nezumi - IKEDA Soji.
5  *
6  * This file is part of the Sombok Package.  This program is free
7  * software; you can redistribute it and/or modify it under the terms of
8  * either the GNU General Public License or the Artistic License, as
9  * specified in the README file.
10  *
11  */
12 
13 #include "sombok_constants.h"
14 #include "sombok.h"
15 
16 /** @defgroup linebreak_utils utils
17  * @brief Callback functions used by linebreak
18  *@{*/
19 
20 /** @name Preprocessing callback
21  * gcstring_t *callback(linebreak_t *lbobj, void *data, unistr_t *str, unistr_t *text)
22  *
23  * Preprocessing behaviors specified by item of ``prep_func'' member of
24  * linebreak_t.  Corresponding item of ``prep_data'' member can be used to
25  * modify behavior.
26  * @param[in] obj linebreak object.
 * @param[in] data an item of prep_data corresponding to callback.
28  * @param[in,out] substr pointer to Unicode string.
29  * @param[in] text whole text to be broken, or NULL.
 * @return This callback is called twice for each substring of text:
 *
 * On the first pass, when text is not NULL, it should find the first
 * occurrence in substr matching its criteria, then update substr->str to
 * point to the matching position and substr->len to be the matched length.
 * If nothing matches, it should set substr->str to NULL.
36  * Return value shall be discarded.
37  *
38  * On the second pass, when text is NULL, it should return new grapheme
39  * cluster string created from substr. Return value should not share
40  * Unicode buffer with substr (i.e. use gcstring_newcopy()).
41  *
42  * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
43  */
44 /*@{*/
45 
46 static
startswith(unistr_t * unistr,size_t idx,char * str,size_t len,int cs)47 int startswith(unistr_t * unistr, size_t idx, char *str, size_t len,
48 	       int cs)
49 {
50     size_t i;
51     unichar_t uc, c;
52 
53     if (unistr->str == NULL)
54 	return 0;
55     if (unistr->len - idx < len)
56 	return 0;
57     for (i = 0; i < len; i++) {
58 	uc = unistr->str[idx + i];
59 	c = (unichar_t) str[i];
60 	if (!cs) {
61 	    if ((unichar_t) 'A' <= uc && uc <= (unichar_t) 'Z')
62 		uc += (unichar_t) ('a' - 'A');
63 	    if ((unichar_t) 'A' <= c && c <= (unichar_t) 'Z')
64 		c += (unichar_t) ('a' - 'A');
65 	}
66 	if (uc != c)
67 	    return 0;
68     }
69     return 1;
70 }
71 
/*
 * Character-class helpers for the URI scanner.
 *
 * Each class comes in two forms:
 *   _is_xxx(s)      tests a single character value s;
 *   is_xxx(u, pos)  tests the character at index pos of the string object
 *                   pointed to by u (anything with "str" and "len"
 *                   members), yielding 0 when pos is out of range.
 *
 * The string parameter is deliberately NOT named "str": a macro parameter
 * with that name would also replace the member name in "->str", so the
 * macro would only work when the argument happened to be spelled "str".
 *
 * Arguments are evaluated more than once; pass side-effect-free
 * expressions only.
 */

/* Character at index pos exists and equals ch. */
#define is(u, pos, ch)                                                  \
    ((pos) < (u)->len && (u)->str[(pos)] == (ch))

/* ASCII letter. */
#define _is_alpha(s)                                                    \
    (('a' <= (s) && (s) <= 'z') || ('A' <= (s) && (s) <= 'Z'))
#define is_alpha(u, pos)                                                \
    ((pos) < (u)->len && _is_alpha((u)->str[(pos)]))

/* ASCII decimal digit. */
#define _is_digit(s)                                                    \
    ('0' <= (s) && (s) <= '9')
#define is_digit(u, pos)                                                \
    ((pos) < (u)->len && _is_digit((u)->str[(pos)]))

/* ASCII hexadecimal digit, either case. */
#define _is_hexdig(s)                                                   \
    (_is_digit(s) || ('a' <= (s) && (s) <= 'f') ||                      \
     ('A' <= (s) && (s) <= 'F'))
#define is_hexdig(u, pos)                                               \
    ((pos) < (u)->len && _is_hexdig((u)->str[(pos)]))

/* One of ! $ & ' ( ) * + , ; = */
#define _is_sub_delim(s)                                                \
    ((s) == '!' || (s) == '$' || (s) == '&' || (s) == '\'' ||           \
     (s) == '(' || (s) == ')' || (s) == '*' || (s) == '+' ||            \
     (s) == ',' || (s) == ';' || (s) == '=')
#define is_sub_delim(u, pos)                                            \
    ((pos) < (u)->len && _is_sub_delim((u)->str[(pos)]))

/* Letter, digit, or one of - . _ ~ */
#define _is_unreserved(s)                                               \
    (_is_alpha(s) || _is_digit(s) ||                                    \
     (s) == '-' || (s) == '.' || (s) == '_' || (s) == '~')
#define is_unreserved(u, pos)                                           \
    ((pos) < (u)->len && _is_unreserved((u)->str[(pos)]))

/*
 * A character that may appear in a percent-encoded sequence: '%' or a hex
 * digit.  This is a per-character test, so malformed sequences (such as a
 * lone '%') are accepted.
 */
#define _is_pct_encoded(s)                                              \
    ((s) == '%' || _is_hexdig(s))
#define is_pct_encoded(u, pos)                                          \
    ((pos) < (u)->len && _is_pct_encoded((u)->str[(pos)]))

/* Path character: unreserved, pct-encoded, sub-delim, ':' or '@'. */
#define _is_pchar(s)                                                    \
    (_is_unreserved(s) || _is_pct_encoded(s) || _is_sub_delim(s) ||     \
     (s) == ':' || (s) == '@')
#define is_pchar(u, pos)                                                \
    ((pos) < (u)->len && _is_pchar((u)->str[(pos)]))
113 
114 /** Built-in preprocessing callback
115  *
116  * Built-in preprocessing callback to break or not to break URLs according to
117  * some rules by Chicago Manual of Style 15th ed.
118  * If data is NULL, prohibit break.
119  * Otherwise, allow break by rule above.
120  */
linebreak_prep_URIBREAK(linebreak_t * lbobj,void * data,unistr_t * str,unistr_t * text)121 gcstring_t *linebreak_prep_URIBREAK(linebreak_t * lbobj, void *data,
122 				    unistr_t * str, unistr_t * text)
123 {
124     gcstring_t *gcstr;
125     size_t i;
126     unichar_t *ptr;
127 
128     /* Pass I */
129 
130     if (text != NULL) {
131 	/*
132 	 * Search URL in str.
133 	 * Following code loosely refers RFC3986 but some practical
134 	 * assumptions are put:
135 	 *
136 	 * o Broken pct-encoded sequences (e.g. single "%") are allowed.
137 	 * o scheme names must end with alphanumeric, must be longer than
138 	 *   or equal to two octets, and must not contain more than one
139 	 *   non-alphanumeric ("+", "-" or ".").
140 	 * o URLs containing neither non-empty path, query part nor fragment
141 	 *   (e.g. "about:") are omitted: they are treated as ordinal words.
142 	 */
143 	for (ptr = NULL, i = 0; i < str->len; ptr = NULL, i++) {
144 	    int has_double_slash, has_authority, has_empty_path,
145 		has_no_query, has_no_fragment;
146 	    size_t alphadigit, nonalphadigit;
147 
148 	    /* skip non-alpha. */
149 	    if (!is_alpha(str, i))
150 		continue;
151 
152 	    ptr = str->str + i;
153 
154 	    /* "url:" - case insensitive */
155 	    if (startswith(str, i, "url:", 4, 0))
156 		i += 4;
157 
158 	    /* scheme */
159 	    if (is_alpha(str, i))
160 		i++;
161 	    else
162 		continue;
163 
164 	    nonalphadigit = 0;
165 	    alphadigit = 1;
166 	    while (1) {
167 		if (is_alpha(str, i) || is_digit(str, i))
168 		    alphadigit++;
169 		else if (is(str, i, '+') || is(str, i, '-') || is(str, i, '.'))
170 		    nonalphadigit++;
171 		else
172 		    break;
173 		i++;
174 	    }
175 	    if (alphadigit < 2 || 1 < nonalphadigit ||
176 	        ! (is_digit(str, i - 1) || is_alpha(str, i - 1)))
177 		continue;
178 
179 	    /* ":" */
180 	    if (is(str, i, ':'))
181 		i++;
182 	    else
183 		continue;
184 
185 	    /* hier-part */
186 	    has_double_slash = 0;
187 	    has_authority = 0;
188 	    has_empty_path = 0;
189 	    has_no_query = 0;
190 	    has_no_fragment = 0;
191 	    if (startswith(str, i, "//", 2, 0)) {
192 		/* "//" */
193 		has_double_slash = 1;
194 		i += 2;
195 
196 		/* authority - FIXME:syntax relaxed */
197 		if (is(str, i, '[') || is(str, i, ':') || is(str, i, '@') ||
198 		    is_unreserved(str, i) || is_pct_encoded(str, i) ||
199 		    is_sub_delim(str, i)) {
200 		    has_authority = 1;
201 		    i++;
202 		    while (is(str, i, '[') || is(str, i, ']') ||
203 			   is(str, i, ':') || is(str, i, '@') ||
204 			   is_unreserved(str, i) || is_pct_encoded(str, i) ||
205 			   is_sub_delim(str, i))
206 			i++;
207 		}
208 	    }
209 
210 	    /* path */
211 	    if (has_double_slash) {
212 		if (has_authority)
213 		    goto path_abempty;
214 		else
215 		    goto path_absolute;
216 	    } /* else goto path_rootless; */
217 
218 	    /* path_rootless: */
219 	    if (is_pchar(str, i)) { /* FIXME:path-noscheme not concerned */
220 		i++;
221 		while (is_pchar(str, i))
222 		    i++;
223 		goto path_abempty;
224 	    } else {
225 		has_empty_path = 1;
226 		goto path_empty;
227 	    }
228 
229 	  path_absolute:
230 	    if (startswith(str, i, "//", 2, 0))
231 		continue;
232 	    else if (is(str, i, '/')) {
233 		i++;
234 		if (is_pchar(str, i)) {
235 		    i++;
236 		    while (is_pchar(str, i))
237 			i++;
238 		}
239 		goto path_abempty;
240 	    } else
241 		continue;
242 
243 	  path_abempty:
244 	    if (is(str, i, '/')) {
245 		i++;
246 		while (is(str, i, '/') || is_pchar(str, i))
247 		    i++;
248 	    } /* else goto path_empty; */
249 
250 	  path_empty:
251 	    ;
252 
253 	    /* query */
254 	    if (is(str, i, '?')) {
255 		i++;
256 		while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
257 		    i++;
258 	    } else
259 		has_no_query = 1;
260 
261 	    /* fragment */
262 	    if (is(str, i, '#')) {
263 		i++;
264 		while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
265 		    i++;
266 	    } else
267 		has_no_fragment = 1;
268 
269 	    if (has_empty_path && has_no_query && has_no_fragment)
270 		continue;
271 
272 	    break;
273 	}
274 
275 	if (ptr != NULL)
276 	    str->len = i - (ptr - str->str);
277 	str->str = ptr;
278 	return NULL;
279     }
280 
281     /* Pass II */
282 
283     if ((gcstr = gcstring_newcopy(str, lbobj)) == NULL) {
284 	lbobj->errnum = errno ? errno : ENOMEM;
285 	return NULL;
286     }
287 
288     /* non-break URI. */
289     if (data == NULL) {
290 	for (i = 1; i < gcstr->gclen; i++)
291 	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
292 	return gcstr;
293     }
294 
295     /* break URI. */
296     if (startswith((unistr_t *) gcstr, 0, "url:", 4, 0)) {
297 	gcstr->gcstr[4].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
298 	i = 5;
299     } else
300 	i = 1;
301     for (; i < gcstr->gclen; i++) {
302 	unichar_t u, v;
303 	u = gcstr->str[gcstr->gcstr[i - 1].idx];
304 	v = gcstr->str[gcstr->gcstr[i].idx];
305 
306 	/*
307 	 * Some rules based on CMoS 15th ed.
308 	 * 17.11 1.1: [/] ÷ [^/]
309 	 * 17.11 2:   [-] ×
310 	 * 6.17 2:   [.] ×
311 	 * 17.11 1.2: ÷ [-~.,_?#%]
312 	 * 17.11 1.3: ÷ [=&]
313 	 * 17.11 1.3: [=&] ÷
314 	 * Default:  ALL × ALL
315 	 */
316 	if (u == '/' && v != '/')
317 	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
318 	else if (u == '-' || u == '.')
319 	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
320 	else if (v == '-' || v == '~' || v == '.' || v == ',' ||
321 		 v == '_' || v == '?' || v == '#' || v == '%' ||
322 		 u == '=' || v == '=' || u == '&' || v == '&')
323 	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
324 	else
325 	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
326     }
327 
328     /* Won't break punctuations at end of matches. */
329     for (i = gcstr->gclen - 1; 1 <= i; i--) {
330 	unichar_t u = gcstr->str[gcstr->gcstr[i].idx];
331 	if (gcstr->gcstr[i].flag == LINEBREAK_FLAG_ALLOW_BEFORE &&
332 	    (u == '"' || u == '.' || u == ':' || u == ';' || u == ',' ||
333 	     u == '>'))
334 	    gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
335 	else
336 	    break;
337     }
338     return gcstr;
339 }
340 
341 /*@}*/
342 
343 /** @name Sizing callback
344  * double callback(linebreak_t *obj, double len, gcstring_t *pre, gcstring_t *spc, gcstring_t *str)
345  *
346  * Sizing behavior specified by ``sizing_func'' member of linebreak_t.
347  * ``sizing_data'' member can be used to modify behavior.
348  * @param[in] obj linebreak object.
349  * @param[in] len Number of columns of preceding grapheme cluster string.
350  * @param[in] pre Preceding grapheme cluster string.
351  * @param[in] spc Trailing spaces of preceding string.
352  * @param[in] str Appended grapheme cluster string.
353  * @return number of columns of pre+spc+str.
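 * If error occurred, callback must set lbobj->errnum nonzero then return a
 * negative value such as -1.0 (as the built-in callback does); the return
 * type is double, so NULL cannot be returned.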
355  */
356 
357 /*@{*/
358 
359 /** Built-in Sizing callback
360  *
361  * Built-in Sizing callback based on UAX #11.
362  */
linebreak_sizing_UAX11(linebreak_t * obj,double len,gcstring_t * pre,gcstring_t * spc,gcstring_t * str)363 double linebreak_sizing_UAX11(linebreak_t * obj, double len,
364 			      gcstring_t * pre, gcstring_t * spc,
365 			      gcstring_t * str)
366 {
367     gcstring_t *spcstr;
368 
369     if ((!spc || !spc->str || !spc->len) &&
370 	(!str || !str->str || !str->len))
371 	return len;
372 
373     if (!spc || !spc->str)
374 	spcstr = gcstring_copy(str);
375     else if ((spcstr = gcstring_concat(spc, str)) == NULL)
376 	return -1.0;
377     len += (double) gcstring_columns(spcstr);
378     gcstring_destroy(spcstr);
379     return len;
380 }
381 
382 /*@}*/
383 
384 /** @name Formatting callback
385  * gcstring_t *callback(linebreak_t *lbobj, linebreak_state_t state, gcstring_t *gcstr)
386  *
387  * Formatting behaviors specified by ``format_func'' member of linebreak_t.
 * ``format_data'' member can be used to modify behavior.
389  * @param[in] obj linebreak object.
390  * @param[in] state state.
391  * @param[in] gcstr text fragment.
392  * @return new text fragment or, if no modification needed, NULL.
393  * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
394  *
395  * Following table describes behavior of built-in format callbacks.
396  *
397  * @verbatim
398  * state| SIMPLE          | NEWLINE           | TRIM
399  * -----+-----------------+-------------------+-------------------
400  * SOT  |
401  * SOP  |                       not modify
402  * SOL  |
403  * LINE |
404  * EOL  | append newline  | replace by newline| replace by newline
405  * EOP  | not modify      | replace by newline| remove SPACEs
406  * EOT  | not modify      | replace by newline| remove SPACEs
407  * ----------------------------------------------------------------
408  * @endverbatim
409  */
410 
411 /*@{*/
412 
413 /** Built-in formatting callback
414  *
415  */
linebreak_format_SIMPLE(linebreak_t * lbobj,linebreak_state_t state,gcstring_t * gcstr)416 gcstring_t *linebreak_format_SIMPLE(linebreak_t * lbobj,
417 				    linebreak_state_t state,
418 				    gcstring_t * gcstr)
419 {
420     gcstring_t *t, *result;
421     unistr_t unistr;
422 
423     switch (state) {
424     case LINEBREAK_STATE_EOL:
425 	if ((result = gcstring_copy(gcstr)) == NULL)
426 	    return NULL;
427 	unistr.str = lbobj->newline.str;
428 	unistr.len = lbobj->newline.len;
429 	if ((t = gcstring_new(&unistr, lbobj)) == NULL)
430 	    return NULL;
431 	if (gcstring_append(result, t) == NULL) {
432 	    t->str = NULL;
433 	    gcstring_destroy(t);
434 	    return NULL;
435 	}
436 	t->str = NULL;
437 	gcstring_destroy(t);
438 	return result;
439 
440     default:
441 	errno = 0;
442 	return NULL;
443     }
444 }
445 
446 /** Built-in formatting callback
447  *
448  */
linebreak_format_NEWLINE(linebreak_t * lbobj,linebreak_state_t state,gcstring_t * gcstr)449 gcstring_t *linebreak_format_NEWLINE(linebreak_t * lbobj,
450 				     linebreak_state_t state,
451 				     gcstring_t * gcstr)
452 {
453     gcstring_t *result;
454     unistr_t unistr;
455 
456     switch (state) {
457     case LINEBREAK_STATE_EOL:
458     case LINEBREAK_STATE_EOP:
459     case LINEBREAK_STATE_EOT:
460 	unistr.str = lbobj->newline.str;
461 	unistr.len = lbobj->newline.len;
462 	if ((result = gcstring_newcopy(&unistr, lbobj)) == NULL)
463 	    return NULL;
464 	return result;
465 
466     default:
467 	errno = 0;
468 	return NULL;
469     }
470 }
471 
472 /** Built-in formatting callback
473  *
474  */
linebreak_format_TRIM(linebreak_t * lbobj,linebreak_state_t state,gcstring_t * gcstr)475 gcstring_t *linebreak_format_TRIM(linebreak_t * lbobj,
476 				  linebreak_state_t state,
477 				  gcstring_t * gcstr)
478 {
479     gcstring_t *result;
480     unistr_t unistr = { NULL, 0 };
481     size_t i;
482 
483     switch (state) {
484     case LINEBREAK_STATE_EOL:
485 	unistr.str = lbobj->newline.str;
486 	unistr.len = lbobj->newline.len;
487 	if ((result = gcstring_newcopy(&unistr, lbobj)) == NULL)
488 	    return NULL;
489 	return result;
490 
491     case LINEBREAK_STATE_EOP:
492     case LINEBREAK_STATE_EOT:
493 	if (gcstr->str == NULL || gcstr->len == 0) {
494 	    if ((result = gcstring_newcopy(&unistr, lbobj)) == NULL)
495 		return NULL;
496 	    return result;
497 	}
498 	for (i = 0; i < gcstr->gclen && gcstr->gcstr[i].lbc == LB_SP; i++);
499 	if ((result = gcstring_substr(gcstr, i, gcstr->gclen)) == NULL)
500 	    return NULL;
501 	return result;
502 
503     default:
504 	errno = 0;
505 	return NULL;
506     }
507 }
508 
509 /*@}*/
510 
511 /** @name Urgent breaking callbacks
512  * gcstring_t *callback(linebreak_t *lbobj, gcstring_t *str)
513  *
514  * Urgent breaking behaviors specified by ``urgent_func'' member of
515  * linebreak_t. ``urgent_data'' member can be used to modify behavior.
516  * @param[in] obj linebreak object.
517  * @param[in] str text to be broken.
518  * @return new text or, if no modification needed, NULL.
519  * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
520  *
521  * There are two built-in urgent breaking callbacks.
522  */
523 
524 /*@{*/
525 
526 /** Built-in urgent brealing callback
527  *
528  * Abort processing.  lbobj->errnum is set to LINEBREAK_ELONG.
529  */
linebreak_urgent_ABORT(linebreak_t * lbobj,gcstring_t * str)530 gcstring_t *linebreak_urgent_ABORT(linebreak_t * lbobj, gcstring_t * str)
531 {
532     lbobj->errnum = LINEBREAK_ELONG;
533     return NULL;
534 }
535 
536 /** Built-in urgent brealing callback
537  *
538  * Force breaking lines.
539  */
linebreak_urgent_FORCE(linebreak_t * lbobj,gcstring_t * str)540 gcstring_t *linebreak_urgent_FORCE(linebreak_t * lbobj, gcstring_t * str)
541 {
542     gcstring_t *result, *s, empty = { NULL, 0, NULL, 0, 0, lbobj };
543 
544     if (!str || !str->len)
545 	return gcstring_new(NULL, lbobj);
546 
547     result = gcstring_new(NULL, lbobj);
548     s = gcstring_copy(str);
549     while (1) {
550 	size_t i;
551 	gcstring_t *t;
552 	double cols;
553 
554 	for (i = 0; i < s->gclen; i++) {
555 	    t = gcstring_substr(s, 0, i + 1);
556 	    if (lbobj->sizing_func != NULL)
557 		cols =
558 		    (*(lbobj->sizing_func)) (lbobj, 0.0, &empty, &empty,
559 					     t);
560 	    else
561 		cols = (double) t->gclen;
562 	    gcstring_destroy(t);
563 
564 	    if (lbobj->colmax < cols)
565 		break;
566 	}
567 	if (0 < i) {
568 	    t = gcstring_substr(s, 0, i);
569 	    if (t->gclen) {
570 		t->gcstr[0].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
571 		gcstring_append(result, t);
572 	    }
573 	    gcstring_destroy(t);
574 	    t = gcstring_substr(s, i, s->gclen - i);
575 	    gcstring_destroy(s);
576 	    s = t;
577 
578 	    if (!s->gclen)
579 		break;
580 	} else {
581 	    if (s->gclen) {
582 		s->gcstr[0].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
583 		gcstring_append(result, s);
584 	    }
585 	    break;
586 	}
587     }
588     gcstring_destroy(s);
589     return result;
590 }
591 
592 /*@}*/
593 
594 /** @name Preprocessing callbacks - obsoleted form
595  * gcstring_t *callback(linebreak_t *lbobj, unistr_t *str)
596 
597  * Preprocessing behaviors specified by ``user_func'' member of linebreak_t.
598  * ``user_data'' member can be used to modify behavior.
599  * @param[in] obj linebreak object.
600  * @param[in] str Unicode string (not grapheme cluster string) to be processed.
601  * @return new grapheme cluster string.  NULL means no data.
602  * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
603  *
604  * Currently no built-in preprocessing callbacks are defined.
605  * NOTE: Feature of this callback described here is planned to be changed
606  * by next release.
607  */
608