1 /*
2 * utls.c - Utility functions.
3 *
4 * Copyright (C) 2009-2011 by Hatuka*nezumi - IKEDA Soji.
5 *
6 * This file is part of the Sombok Package. This program is free
7 * software; you can redistribute it and/or modify it under the terms of
8 * either the GNU General Public License or the Artistic License, as
9 * specified in the README file.
10 *
11 */
12
13 #include "sombok_constants.h"
14 #include "sombok.h"
15
16 /** @defgroup linebreak_utils utils
17 * @brief Callback functions used by linebreak
18 *@{*/
19
20 /** @name Preprocessing callback
21 * gcstring_t *callback(linebreak_t *lbobj, void *data, unistr_t *str, unistr_t *text)
22 *
23 * Preprocessing behaviors specified by item of ``prep_func'' member of
24 * linebreak_t. Corresponding item of ``prep_data'' member can be used to
25 * modify behavior.
26 * @param[in] obj linebreak object.
 * @param[in] data an item of prep_data corresponding to callback.
28 * @param[in,out] substr pointer to Unicode string.
29 * @param[in] text whole text to be broken, or NULL.
 * @return This callback is called twice for each substring of text:
31 *
 * On the first pass, when text is not NULL, it should find the first
 * occurrence in substr matching its criteria, then update substr->str to
 * the matching position and substr->len to the matched length. If nothing
 * matches, it should set substr->str to NULL.
 * The return value is discarded on this pass.
37 *
38 * On the second pass, when text is NULL, it should return new grapheme
39 * cluster string created from substr. Return value should not share
40 * Unicode buffer with substr (i.e. use gcstring_newcopy()).
41 *
42 * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
43 */
44 /*@{*/
45
46 static
startswith(unistr_t * unistr,size_t idx,char * str,size_t len,int cs)47 int startswith(unistr_t * unistr, size_t idx, char *str, size_t len,
48 int cs)
49 {
50 size_t i;
51 unichar_t uc, c;
52
53 if (unistr->str == NULL)
54 return 0;
55 if (unistr->len - idx < len)
56 return 0;
57 for (i = 0; i < len; i++) {
58 uc = unistr->str[idx + i];
59 c = (unichar_t) str[i];
60 if (!cs) {
61 if ((unichar_t) 'A' <= uc && uc <= (unichar_t) 'Z')
62 uc += (unichar_t) ('a' - 'A');
63 if ((unichar_t) 'A' <= c && c <= (unichar_t) 'Z')
64 c += (unichar_t) ('a' - 'A');
65 }
66 if (uc != c)
67 return 0;
68 }
69 return 1;
70 }
71
72 #define is(str, i, c) \
73 ((i) < (str)->len && (str)->str[i] == (c))
74
75 #define _is_alpha(s) \
76 (('a' <= (s) && (s) <= 'z') || ('A' <= (s) && (s) <= 'Z'))
77 #define is_alpha(str, i) \
78 ((i) < (str)->len && _is_alpha((str)->str[i]))
79
80 #define _is_digit(s) \
81 ('0' <= (s) && (s) <= '9')
82 #define is_digit(str, i) \
83 ((i) < (str)->len && _is_digit((str)->str[i]))
84
85 #define _is_hexdig(s) \
86 (_is_digit(s) || ('a' <= (s) && (s) <= 'f') || ('A' <= (s) && (s) <= 'F'))
87 #define is_hexdig(str, i) \
88 ((i) < (str)->len && _is_hexdig((str)->str[i]))
89
90 #define _is_sub_delim(s) \
91 ((s) == '!' || (s) == '$' || (s) == '&' || (s) == '\'' || (s) == '(' || \
92 (s) == ')' || (s) == '*' || (s) == '+' || (s) == ',' || (s) == ';' || \
93 (s) == '=')
94 #define is_sub_delim(str, i) \
95 ((i) < (str)->len && _is_sub_delim((str)->str[i]))
96
97 #define _is_unreserved(s) \
98 (_is_alpha(s) || _is_digit(s) || \
99 (s) == '-' || (s) == '.' || (s) == '_' || (s) == '~')
100 #define is_unreserved(str, i) \
101 ((i) < (str)->len && _is_unreserved((str)->str[i]))
102
103 #define _is_pct_encoded(s) \
104 ((s) == '%' || _is_hexdig(s))
105 #define is_pct_encoded(str, i) \
106 ((i) < (str)->len && _is_pct_encoded((str)->str[i]))
107
108 #define _is_pchar(s) \
109 (_is_unreserved(s) || _is_pct_encoded(s) || _is_sub_delim(s) || \
110 (s) == ':' || (s) == '@')
111 #define is_pchar(str, i) \
112 ((i) < (str)->len && _is_pchar((str)->str[i]))
113
114 /** Built-in preprocessing callback
115 *
116 * Built-in preprocessing callback to break or not to break URLs according to
117 * some rules by Chicago Manual of Style 15th ed.
118 * If data is NULL, prohibit break.
119 * Otherwise, allow break by rule above.
120 */
linebreak_prep_URIBREAK(linebreak_t * lbobj,void * data,unistr_t * str,unistr_t * text)121 gcstring_t *linebreak_prep_URIBREAK(linebreak_t * lbobj, void *data,
122 unistr_t * str, unistr_t * text)
123 {
124 gcstring_t *gcstr;
125 size_t i;
126 unichar_t *ptr;
127
128 /* Pass I */
129
130 if (text != NULL) {
131 /*
132 * Search URL in str.
133 * Following code loosely refers RFC3986 but some practical
134 * assumptions are put:
135 *
136 * o Broken pct-encoded sequences (e.g. single "%") are allowed.
137 * o scheme names must end with alphanumeric, must be longer than
138 * or equal to two octets, and must not contain more than one
139 * non-alphanumeric ("+", "-" or ".").
140 * o URLs containing neither non-empty path, query part nor fragment
141 * (e.g. "about:") are omitted: they are treated as ordinal words.
142 */
143 for (ptr = NULL, i = 0; i < str->len; ptr = NULL, i++) {
144 int has_double_slash, has_authority, has_empty_path,
145 has_no_query, has_no_fragment;
146 size_t alphadigit, nonalphadigit;
147
148 /* skip non-alpha. */
149 if (!is_alpha(str, i))
150 continue;
151
152 ptr = str->str + i;
153
154 /* "url:" - case insensitive */
155 if (startswith(str, i, "url:", 4, 0))
156 i += 4;
157
158 /* scheme */
159 if (is_alpha(str, i))
160 i++;
161 else
162 continue;
163
164 nonalphadigit = 0;
165 alphadigit = 1;
166 while (1) {
167 if (is_alpha(str, i) || is_digit(str, i))
168 alphadigit++;
169 else if (is(str, i, '+') || is(str, i, '-') || is(str, i, '.'))
170 nonalphadigit++;
171 else
172 break;
173 i++;
174 }
175 if (alphadigit < 2 || 1 < nonalphadigit ||
176 ! (is_digit(str, i - 1) || is_alpha(str, i - 1)))
177 continue;
178
179 /* ":" */
180 if (is(str, i, ':'))
181 i++;
182 else
183 continue;
184
185 /* hier-part */
186 has_double_slash = 0;
187 has_authority = 0;
188 has_empty_path = 0;
189 has_no_query = 0;
190 has_no_fragment = 0;
191 if (startswith(str, i, "//", 2, 0)) {
192 /* "//" */
193 has_double_slash = 1;
194 i += 2;
195
196 /* authority - FIXME:syntax relaxed */
197 if (is(str, i, '[') || is(str, i, ':') || is(str, i, '@') ||
198 is_unreserved(str, i) || is_pct_encoded(str, i) ||
199 is_sub_delim(str, i)) {
200 has_authority = 1;
201 i++;
202 while (is(str, i, '[') || is(str, i, ']') ||
203 is(str, i, ':') || is(str, i, '@') ||
204 is_unreserved(str, i) || is_pct_encoded(str, i) ||
205 is_sub_delim(str, i))
206 i++;
207 }
208 }
209
210 /* path */
211 if (has_double_slash) {
212 if (has_authority)
213 goto path_abempty;
214 else
215 goto path_absolute;
216 } /* else goto path_rootless; */
217
218 /* path_rootless: */
219 if (is_pchar(str, i)) { /* FIXME:path-noscheme not concerned */
220 i++;
221 while (is_pchar(str, i))
222 i++;
223 goto path_abempty;
224 } else {
225 has_empty_path = 1;
226 goto path_empty;
227 }
228
229 path_absolute:
230 if (startswith(str, i, "//", 2, 0))
231 continue;
232 else if (is(str, i, '/')) {
233 i++;
234 if (is_pchar(str, i)) {
235 i++;
236 while (is_pchar(str, i))
237 i++;
238 }
239 goto path_abempty;
240 } else
241 continue;
242
243 path_abempty:
244 if (is(str, i, '/')) {
245 i++;
246 while (is(str, i, '/') || is_pchar(str, i))
247 i++;
248 } /* else goto path_empty; */
249
250 path_empty:
251 ;
252
253 /* query */
254 if (is(str, i, '?')) {
255 i++;
256 while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
257 i++;
258 } else
259 has_no_query = 1;
260
261 /* fragment */
262 if (is(str, i, '#')) {
263 i++;
264 while (is(str, i, '/') || is(str, i, '?') || is_pchar(str, i))
265 i++;
266 } else
267 has_no_fragment = 1;
268
269 if (has_empty_path && has_no_query && has_no_fragment)
270 continue;
271
272 break;
273 }
274
275 if (ptr != NULL)
276 str->len = i - (ptr - str->str);
277 str->str = ptr;
278 return NULL;
279 }
280
281 /* Pass II */
282
283 if ((gcstr = gcstring_newcopy(str, lbobj)) == NULL) {
284 lbobj->errnum = errno ? errno : ENOMEM;
285 return NULL;
286 }
287
288 /* non-break URI. */
289 if (data == NULL) {
290 for (i = 1; i < gcstr->gclen; i++)
291 gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
292 return gcstr;
293 }
294
295 /* break URI. */
296 if (startswith((unistr_t *) gcstr, 0, "url:", 4, 0)) {
297 gcstr->gcstr[4].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
298 i = 5;
299 } else
300 i = 1;
301 for (; i < gcstr->gclen; i++) {
302 unichar_t u, v;
303 u = gcstr->str[gcstr->gcstr[i - 1].idx];
304 v = gcstr->str[gcstr->gcstr[i].idx];
305
306 /*
307 * Some rules based on CMoS 15th ed.
308 * 17.11 1.1: [/] ÷ [^/]
309 * 17.11 2: [-] ×
310 * 6.17 2: [.] ×
311 * 17.11 1.2: ÷ [-~.,_?#%]
312 * 17.11 1.3: ÷ [=&]
313 * 17.11 1.3: [=&] ÷
314 * Default: ALL × ALL
315 */
316 if (u == '/' && v != '/')
317 gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
318 else if (u == '-' || u == '.')
319 gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
320 else if (v == '-' || v == '~' || v == '.' || v == ',' ||
321 v == '_' || v == '?' || v == '#' || v == '%' ||
322 u == '=' || v == '=' || u == '&' || v == '&')
323 gcstr->gcstr[i].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
324 else
325 gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
326 }
327
328 /* Won't break punctuations at end of matches. */
329 for (i = gcstr->gclen - 1; 1 <= i; i--) {
330 unichar_t u = gcstr->str[gcstr->gcstr[i].idx];
331 if (gcstr->gcstr[i].flag == LINEBREAK_FLAG_ALLOW_BEFORE &&
332 (u == '"' || u == '.' || u == ':' || u == ';' || u == ',' ||
333 u == '>'))
334 gcstr->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
335 else
336 break;
337 }
338 return gcstr;
339 }
340
341 /*@}*/
342
343 /** @name Sizing callback
344 * double callback(linebreak_t *obj, double len, gcstring_t *pre, gcstring_t *spc, gcstring_t *str)
345 *
346 * Sizing behavior specified by ``sizing_func'' member of linebreak_t.
347 * ``sizing_data'' member can be used to modify behavior.
348 * @param[in] obj linebreak object.
349 * @param[in] len Number of columns of preceding grapheme cluster string.
350 * @param[in] pre Preceding grapheme cluster string.
351 * @param[in] spc Trailing spaces of preceding string.
352 * @param[in] str Appended grapheme cluster string.
 * @return number of columns of pre+spc+str.
 * If error occurred, callback must set obj->errnum nonzero then return an
 * error value instead of a column count (the built-in callback returns -1.0).
355 */
356
357 /*@{*/
358
359 /** Built-in Sizing callback
360 *
361 * Built-in Sizing callback based on UAX #11.
362 */
linebreak_sizing_UAX11(linebreak_t * obj,double len,gcstring_t * pre,gcstring_t * spc,gcstring_t * str)363 double linebreak_sizing_UAX11(linebreak_t * obj, double len,
364 gcstring_t * pre, gcstring_t * spc,
365 gcstring_t * str)
366 {
367 gcstring_t *spcstr;
368
369 if ((!spc || !spc->str || !spc->len) &&
370 (!str || !str->str || !str->len))
371 return len;
372
373 if (!spc || !spc->str)
374 spcstr = gcstring_copy(str);
375 else if ((spcstr = gcstring_concat(spc, str)) == NULL)
376 return -1.0;
377 len += (double) gcstring_columns(spcstr);
378 gcstring_destroy(spcstr);
379 return len;
380 }
381
382 /*@}*/
383
384 /** @name Formatting callback
385 * gcstring_t *callback(linebreak_t *lbobj, linebreak_state_t state, gcstring_t *gcstr)
386 *
387 * Formatting behaviors specified by ``format_func'' member of linebreak_t.
 * ``format_data'' member can be used to modify behavior.
389 * @param[in] obj linebreak object.
390 * @param[in] state state.
391 * @param[in] gcstr text fragment.
392 * @return new text fragment or, if no modification needed, NULL.
393 * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
394 *
395 * Following table describes behavior of built-in format callbacks.
396 *
397 * @verbatim
398 * state| SIMPLE | NEWLINE | TRIM
399 * -----+-----------------+-------------------+-------------------
400 * SOT |
401 * SOP | not modify
402 * SOL |
403 * LINE |
404 * EOL | append newline | replace by newline| replace by newline
405 * EOP | not modify | replace by newline| remove SPACEs
406 * EOT | not modify | replace by newline| remove SPACEs
407 * ----------------------------------------------------------------
408 * @endverbatim
409 */
410
411 /*@{*/
412
413 /** Built-in formatting callback
414 *
415 */
linebreak_format_SIMPLE(linebreak_t * lbobj,linebreak_state_t state,gcstring_t * gcstr)416 gcstring_t *linebreak_format_SIMPLE(linebreak_t * lbobj,
417 linebreak_state_t state,
418 gcstring_t * gcstr)
419 {
420 gcstring_t *t, *result;
421 unistr_t unistr;
422
423 switch (state) {
424 case LINEBREAK_STATE_EOL:
425 if ((result = gcstring_copy(gcstr)) == NULL)
426 return NULL;
427 unistr.str = lbobj->newline.str;
428 unistr.len = lbobj->newline.len;
429 if ((t = gcstring_new(&unistr, lbobj)) == NULL)
430 return NULL;
431 if (gcstring_append(result, t) == NULL) {
432 t->str = NULL;
433 gcstring_destroy(t);
434 return NULL;
435 }
436 t->str = NULL;
437 gcstring_destroy(t);
438 return result;
439
440 default:
441 errno = 0;
442 return NULL;
443 }
444 }
445
446 /** Built-in formatting callback
447 *
448 */
linebreak_format_NEWLINE(linebreak_t * lbobj,linebreak_state_t state,gcstring_t * gcstr)449 gcstring_t *linebreak_format_NEWLINE(linebreak_t * lbobj,
450 linebreak_state_t state,
451 gcstring_t * gcstr)
452 {
453 gcstring_t *result;
454 unistr_t unistr;
455
456 switch (state) {
457 case LINEBREAK_STATE_EOL:
458 case LINEBREAK_STATE_EOP:
459 case LINEBREAK_STATE_EOT:
460 unistr.str = lbobj->newline.str;
461 unistr.len = lbobj->newline.len;
462 if ((result = gcstring_newcopy(&unistr, lbobj)) == NULL)
463 return NULL;
464 return result;
465
466 default:
467 errno = 0;
468 return NULL;
469 }
470 }
471
472 /** Built-in formatting callback
473 *
474 */
linebreak_format_TRIM(linebreak_t * lbobj,linebreak_state_t state,gcstring_t * gcstr)475 gcstring_t *linebreak_format_TRIM(linebreak_t * lbobj,
476 linebreak_state_t state,
477 gcstring_t * gcstr)
478 {
479 gcstring_t *result;
480 unistr_t unistr = { NULL, 0 };
481 size_t i;
482
483 switch (state) {
484 case LINEBREAK_STATE_EOL:
485 unistr.str = lbobj->newline.str;
486 unistr.len = lbobj->newline.len;
487 if ((result = gcstring_newcopy(&unistr, lbobj)) == NULL)
488 return NULL;
489 return result;
490
491 case LINEBREAK_STATE_EOP:
492 case LINEBREAK_STATE_EOT:
493 if (gcstr->str == NULL || gcstr->len == 0) {
494 if ((result = gcstring_newcopy(&unistr, lbobj)) == NULL)
495 return NULL;
496 return result;
497 }
498 for (i = 0; i < gcstr->gclen && gcstr->gcstr[i].lbc == LB_SP; i++);
499 if ((result = gcstring_substr(gcstr, i, gcstr->gclen)) == NULL)
500 return NULL;
501 return result;
502
503 default:
504 errno = 0;
505 return NULL;
506 }
507 }
508
509 /*@}*/
510
511 /** @name Urgent breaking callbacks
512 * gcstring_t *callback(linebreak_t *lbobj, gcstring_t *str)
513 *
514 * Urgent breaking behaviors specified by ``urgent_func'' member of
515 * linebreak_t. ``urgent_data'' member can be used to modify behavior.
516 * @param[in] obj linebreak object.
517 * @param[in] str text to be broken.
518 * @return new text or, if no modification needed, NULL.
519 * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
520 *
521 * There are two built-in urgent breaking callbacks.
522 */
523
524 /*@{*/
525
526 /** Built-in urgent brealing callback
527 *
528 * Abort processing. lbobj->errnum is set to LINEBREAK_ELONG.
529 */
linebreak_urgent_ABORT(linebreak_t * lbobj,gcstring_t * str)530 gcstring_t *linebreak_urgent_ABORT(linebreak_t * lbobj, gcstring_t * str)
531 {
532 lbobj->errnum = LINEBREAK_ELONG;
533 return NULL;
534 }
535
536 /** Built-in urgent brealing callback
537 *
538 * Force breaking lines.
539 */
linebreak_urgent_FORCE(linebreak_t * lbobj,gcstring_t * str)540 gcstring_t *linebreak_urgent_FORCE(linebreak_t * lbobj, gcstring_t * str)
541 {
542 gcstring_t *result, *s, empty = { NULL, 0, NULL, 0, 0, lbobj };
543
544 if (!str || !str->len)
545 return gcstring_new(NULL, lbobj);
546
547 result = gcstring_new(NULL, lbobj);
548 s = gcstring_copy(str);
549 while (1) {
550 size_t i;
551 gcstring_t *t;
552 double cols;
553
554 for (i = 0; i < s->gclen; i++) {
555 t = gcstring_substr(s, 0, i + 1);
556 if (lbobj->sizing_func != NULL)
557 cols =
558 (*(lbobj->sizing_func)) (lbobj, 0.0, &empty, &empty,
559 t);
560 else
561 cols = (double) t->gclen;
562 gcstring_destroy(t);
563
564 if (lbobj->colmax < cols)
565 break;
566 }
567 if (0 < i) {
568 t = gcstring_substr(s, 0, i);
569 if (t->gclen) {
570 t->gcstr[0].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
571 gcstring_append(result, t);
572 }
573 gcstring_destroy(t);
574 t = gcstring_substr(s, i, s->gclen - i);
575 gcstring_destroy(s);
576 s = t;
577
578 if (!s->gclen)
579 break;
580 } else {
581 if (s->gclen) {
582 s->gcstr[0].flag = LINEBREAK_FLAG_ALLOW_BEFORE;
583 gcstring_append(result, s);
584 }
585 break;
586 }
587 }
588 gcstring_destroy(s);
589 return result;
590 }
591
592 /*@}*/
593
594 /** @name Preprocessing callbacks - obsoleted form
595 * gcstring_t *callback(linebreak_t *lbobj, unistr_t *str)
596
597 * Preprocessing behaviors specified by ``user_func'' member of linebreak_t.
598 * ``user_data'' member can be used to modify behavior.
599 * @param[in] obj linebreak object.
600 * @param[in] str Unicode string (not grapheme cluster string) to be processed.
601 * @return new grapheme cluster string. NULL means no data.
602 * If error occurred, callback must set lbobj->errnum nonzero then return NULL.
603 *
604 * Currently no built-in preprocessing callbacks are defined.
605 * NOTE: Feature of this callback described here is planned to be changed
606 * by next release.
607 */
608