1 /* Copyright 2010-2021 Free Software Foundation, Inc.
2
3 This program is free software: you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation, either version 3 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program. If not, see <http://www.gnu.org/licenses/>. */
15
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <locale.h>
23 #ifndef _WIN32
24 #include <langinfo.h>
25 #else /* _WIN32 */
26 /* Workaround for problems caused in mingw.org's MinGW build by
27 Gnulib's wchar.h overriding the wint_t type definition, which
28 causes compilation errors when perl.h is included below, because
29 perl.h includes ctype.h. */
30 #include <ctype.h>
31 #endif
32 #include <wchar.h>
33 #include <wctype.h>
34
35 /* See "How do I use all this in extensions" in 'man perlguts'. */
36 #define PERL_NO_GET_CONTEXT
37
38 #include "EXTERN.h"
39 #include "perl.h"
40 #include "XSUB.h"
41
42 #include "ppport.h"
43
44 #include "xspara.h"
45
46 #include "text.h"
47
48 typedef struct {
49 TEXT space; /* Pending space, to be output before the pending word. */
50 TEXT word; /* Pending word. If outputting this would have led to
51 the line to be too long, the line should have been cut before
52 saving it. */
53
54 /* When word.end == 0, this indicates a word of length 0. */
55 int invisible_pending_word;
56
57 /* Length of space in multibyte characters. */
58 int space_counter;
59
60 /* Characters added so far in current word. */
61 int word_counter;
62
63 /* -2 means we are not at the end of a sentence (undefined in Perl),
64 1 means we are at the end of a sentence and French spacing is off,
65 -1 means we are at the end of a sentence and French spacing is on.
66 0 means it is "inhibited". */
67 int end_sentence;
68
69 int max; /* Maximum length of line. */
70 int indent_length; /* Columns to indent this line. */
71 int indent_length_next; /* Columns to indent the rest of the lines. */
72 int counter; /* Columns so far on this line. */
73
74 int lines_counter; /* Lines so far added in paragraph. */
75 int end_line_count; /* Number of newlines so far in an output unit, i.e.
76 with add_text or add_next. */
77
78 wint_t last_letter; /* Last letter in word, used to decide if we're
79 at the end of a sentence. */
80
81 /* Options set with set_space_protection. */
82 int protect_spaces; /* Line break forbidden, as in @w. */
83 int ignore_columns; /* Don't cut line at right margin. Used by
84 @flushleft and @flushright. */
85 int keep_end_lines; /* A newline in the input ends a line in the output.
86 Used by @flushleft and @flushright. */
87 int french_spacing; /* Only one space, not two, after a full stop. */
88 int double_width_no_break; /* No line break between double width chars. */
89
90 /* No wrapping of lines and spaces are kept as-is. */
91 int unfilled;
92
93 /* Do not terminate with a final newline. */
94 int no_final_newline;
95
96 /* Terminate with any trailing space. */
97 int add_final_space;
98
99 int in_use;
100 } PARAGRAPH;
101
102 static PARAGRAPH state;
103
104 #ifdef _WIN32
105
106 #define WIN32_LEAN_AND_MEAN
107 #include <windows.h>
108 #include <errno.h>
109
110 /* If Gnulib overrides wint_t with a wider type, we cannot use
111 iswspace etc. names, whose prototypes were seen with the original
112 wint_t in effect. */
113 #ifdef GNULIB_defined_wint_t
114 # undef iswspace
115 # define iswspace(w) w32_iswspace(w)
116 # undef iswupper
117 # define iswupper(w) w32_iswupper(w)
118 #endif
119
/* Replacement for setlocale on Windows.

   A null VALUE is a query for the current locale and is handed straight
   to the real setlocale.  Otherwise only "en_US.UTF-8" (compared without
   regard to case) is accepted; any other name yields NULL.

   Returns whatever the underlying setlocale call returns, or NULL for an
   unsupported locale name. */
char *
w32_setlocale (int category, const char *value)
{
  /* _stricmp must not be given a null pointer. */
  if (value == NULL)
    return setlocale (category, NULL);

  if (_stricmp (value, "en_us.utf-8") != 0)
    return NULL;

  /* Switch to the Windows U.S. English locale with its default
     codeset.  We will handle the non-ASCII text ourselves, so the
     codeset is unimportant, and Windows doesn't support UTF-8 as the
     codeset anyway.  */
  return setlocale (category, "ENU");
}
/* From here on, every setlocale call in this file goes through the
   wrapper above. */
#define setlocale(c,v) w32_setlocale(c,v)
133
/* UTF-8-only replacement for mbrlen.

   Returns the length in bytes that the byte at MBS announces for its
   character (1 to 4), looking at that byte only.  N is not consulted.
   A null MBS gives 0.  Conversion state is not supported: if PS is not
   null, errno is set to ENOSYS and (size_t) -1 is returned. */
size_t
mbrlen (const char * __restrict__ mbs, size_t n, mbstate_t * __restrict__ ps)
{
  unsigned char lead;

  (void) n;

  if (ps != NULL)
    {
      errno = ENOSYS;
      return (size_t) -1;
    }

  /* Never dereference a null string. */
  if (mbs == NULL)
    return 0;

  lead = (unsigned char) *mbs;

  if ((lead & 0x80) == 0)
    return 1;
  if ((lead & 0x20) == 0)
    return 2;
  if ((lead & 0x10) == 0)
    return 3;
  return 4;
}
149
150 /* Convert a UTF-8 encoded multibyte string to a wide character. */
151 size_t
mbrtowc(wchar_t * __restrict__ pwc,const char * __restrict__ mbs,size_t n,mbstate_t * __restrict__ ps)152 mbrtowc (wchar_t * __restrict__ pwc, const char * __restrict__ mbs, size_t n,
153 mbstate_t * __restrict__ ps)
154 {
155 int len = mbrlen (mbs, n, ps);
156
157 if (mbs == NULL)
158 return 0;
159 else
160 {
161 wchar_t wc[2];
162 size_t n_utf16 = MultiByteToWideChar (CP_UTF8, MB_ERR_INVALID_CHARS,
163 mbs, len, wc, 2);
164 if (n_utf16 == 0)
165 {
166 errno = EILSEQ;
167 return (size_t)-1;
168 }
169 if (ps != NULL)
170 {
171 errno = ENOSYS;
172 return (size_t)-1;
173 }
174 /* We don't support UTF-16 surrogates, because the calling code
175 doesn't, and because character classification functions on
176 Windows don't support anything beyond the BMP anyway. So we
177 return the first character of the surrogate pair and set
178 errno. */
179 if (n_utf16 > 1)
180 errno = ENOSYS;
181 if (pwc != NULL)
182 *pwc = wc[0];
183
184 return len;
185 }
186 }
187
/* Return 1 if WC is one of the white space code points listed below
   (see Unicode's Proplist.txt), and 0 otherwise. */
int
iswspace (wint_t wc)
{
  /* The two contiguous ranges. */
  if (wc >= 0x09 && wc <= 0x0D)
    return 1;
  if (wc >= 0x2000 && wc <= 0x200A)
    return 1;

  /* The isolated code points. */
  switch (wc)
    {
    case 0x20:
    case 0x85:
    case 0xA0:
    case 0x1680:
    case 0x2028:
    case 0x2029:
    case 0x202F:
    case 0x205F:
    case 0x3000:
      return 1;
    default:
      return 0;
    }
}
207
/* FIXME: Provide a real implementation.
   For now the null character has width 0 and every other character has
   width 1. */
int
wcwidth (const wchar_t wc)
{
  if (wc == 0)
    return 0;

  return 1;
}
214
215 int
iswupper(wint_t wi)216 iswupper (wint_t wi)
217 {
218 WORD char_type;
219 wchar_t wc = wi;
220 BOOL status = GetStringTypeW (CT_CTYPE1, &wc, 1, &char_type);
221
222 if (!status || (char_type & C1_UPPER) == 0)
223 return 0;
224
225 return 1;
226 }
227
228 /* Avoid warnings due to redefinition of popen/pclose in Perl headers. */
229 #ifdef popen
230 # undef popen
231 # define popen(c,m) _popen(c,m)
232 #endif
233 #ifdef pclose
234 # undef pclose
235 # define pclose(f) _pclose(f)
236 #endif
237
238 #endif
239
240 int
xspara_init(int unused,char * unused2)241 xspara_init (int unused, char *unused2)
242 {
243 char *utf8_locale = 0;
244 int len;
245 char *cur;
246 char *dot;
247
248 dTHX;
249
250 #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
251 /* needed due to thread-safe locale handling in newer perls */
252 switch_to_global_locale();
253 #endif
254
255 if (setlocale (LC_CTYPE, "en_US.UTF-8")
256 || setlocale (LC_CTYPE, "en_US.utf8"))
257 goto success;
258
259 cur = setlocale (LC_CTYPE, 0); /* Name of current locale. */
260 if (!cur)
261 goto failure;
262 len = strlen (cur);
263 if (len >= 6 && !memcmp (".UTF-8", cur + len - 6, 6)
264 || len >= 5 && !memcmp (".utf8", cur + len - 5, 5)
265 || len >= 6 && !memcmp (".utf-8", cur + len - 6, 6)
266 || len >= 5 && !memcmp (".UTF8", cur + len - 5, 5))
267 {
268 setlocale (LC_CTYPE, ""); /* Use the locale from the environment. */
269 goto success;
270 }
271
272 /* Otherwise try altering the current locale name. */
273 dot = strchr (cur, '.');
274 if (!dot)
275 dot = cur + len;
276 utf8_locale = malloc (len + 6 + 1); /* enough to add ".UTF-8" to end */
277 memcpy (utf8_locale, cur, dot - cur);
278 dot = utf8_locale + (dot - cur);
279 memcpy (dot, ".UTF-8", 7);
280 if (setlocale (LC_CTYPE, utf8_locale))
281 goto success;
282
283 memcpy (dot, ".utf8", 6);
284 if (setlocale (LC_CTYPE, utf8_locale))
285 goto success;
286
287 /* Otherwise, look for any UTF-8 locale in the output of "locale -a". */
288 {
289 FILE *p;
290 char *line = 0;
291 size_t n = 0;
292 ssize_t ret;
293 p = popen ("locale -a", "r");
294 if (!p)
295 goto failure;
296 while (1)
297 {
298 ret = getline (&line, &n, p);
299 if (ret == (ssize_t) -1)
300 {
301 free (line);
302 pclose (p);
303 goto failure;
304 }
305 if (strstr (line, "UTF-8") || strstr (line, "utf8"))
306 {
307 line[ret - 1] = '\0'; /* Remove trailing newline. */
308 if (setlocale (LC_CTYPE, line))
309 {
310 free (line);
311 pclose (p);
312 goto success;
313 }
314 }
315 }
316 }
317
318 if (1)
319 {
320 failure:
321 return 0; /* failure */
322 }
323 else
324 {
325 success: ;
326 free (utf8_locale);
327 #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
328 /* needed due to thread-safe locale handling in newer perls */
329 sync_locale();
330 #endif
331 /*
332 fprintf (stderr, "tried to set LC_CTYPE to UTF-8.\n");
333 fprintf (stderr, "character encoding is: %s\n",
334 nl_langinfo (CODESET));
335 */
336 return 1; /* success */
337 }
338 }
339
340 /* Array for storing paragraph states which aren't in use. */
341 static PARAGRAPH *state_array;
342 static int state_array_size;
343
344 /* The slot in state_array for saving the current state. */
345 static int current_state;
346
347 static void
xspara__switch_state(int id)348 xspara__switch_state (int id)
349 {
350 if (current_state == id)
351 return;
352 if (current_state != -1)
353 memcpy (&state_array[current_state], &state, sizeof (PARAGRAPH));
354
355 memcpy (&state, &state_array[id], sizeof (PARAGRAPH));
356 current_state = id;
357 }
358
359 int
xspara_new(HV * conf)360 xspara_new (HV *conf)
361 {
362 int i;
363
364 dTHX; /* Perl boiler plate */
365
366 TEXT saved_space, saved_word;
367
368 /* Find an unused slot in state_array */
369 for (i = 0; i < state_array_size; i++)
370 {
371 if (!state_array[i].in_use)
372 break;
373 }
374 if (i == state_array_size)
375 {
376 state_array = realloc (state_array,
377 (state_array_size += 10) * sizeof (PARAGRAPH));
378 memset (state_array + i, 0, 10 * sizeof (PARAGRAPH));
379 }
380
381 state_array[i].in_use = 1;
382 xspara__switch_state (i);
383
384 /* Zero formatter, reusing storage. */
385 saved_space = state.space;
386 saved_word = state.word;
387 memset (&state, 0, sizeof (state));
388 state.space = saved_space;
389 state.word = saved_word;
390 state.space.end = state.word.end = 0;
391 state.in_use = 1;
392
393 /* Default values. */
394 state.max = 72;
395 state.indent_length_next = -1; /* Special value meaning undefined. */
396 state.end_sentence = -2; /* Special value meaning undefined. */
397 state.last_letter = L'\0';
398
399 if (conf)
400 xspara_init_state (conf);
401
402 /* The paragraph ID. */
403 return i;
404 }
405
406
407 /* SV is a blessed reference to an integer containing the paragraph ID. */
408 void
xspara_set_state(SV * sv)409 xspara_set_state (SV *sv)
410 {
411 dTHX;
412
413 xspara__switch_state (SvIV (sv));
414 }
415
416 /* Set the state internal to this C module from the Perl hash. */
417 void
xspara_init_state(HV * hash)418 xspara_init_state (HV *hash)
419 {
420 #define FETCH(key) hv_fetch (hash, key, strlen (key), 0)
421 #define FETCH_INT(key,where) { val = FETCH(key); \
422 if (val) { where = SvIV (*val); } }
423
424 SV **val;
425
426 dTHX; /* This is boilerplate for interacting with Perl. */
427
428 /* Fetch all these so they are set, and reset for each paragraph. */
429 FETCH_INT("end_sentence", state.end_sentence);
430 FETCH_INT("max", state.max);
431
432 FETCH_INT("indent_length", state.indent_length);
433 FETCH_INT("indent_length_next", state.indent_length_next);
434 FETCH_INT("counter", state.counter);
435
436 FETCH_INT("word_counter", state.word_counter);
437
438 FETCH_INT("lines_counter", state.lines_counter);
439 FETCH_INT("end_line_count", state.end_line_count);
440
441 FETCH_INT("protect_spaces", state.protect_spaces);
442 FETCH_INT("ignore_columns", state.ignore_columns);
443 FETCH_INT("keep_end_lines", state.keep_end_lines);
444 FETCH_INT("frenchspacing", state.french_spacing);
445
446 FETCH_INT("unfilled", state.unfilled);
447 FETCH_INT("no_final_newline", state.no_final_newline);
448 FETCH_INT("add_final_space", state.add_final_space);
449
450 val = FETCH("word");
451 if (val)
452 {
453 fprintf (stderr, "Bug: setting 'word' is not supported.\n");
454 abort ();
455 }
456 val = FETCH("space");
457 if (val)
458 {
459 fprintf (stderr, "Bug: setting 'space' is not supported.\n");
460 abort ();
461 }
462 return;
463
464 #undef FETCH
465 #undef FETCH_INT
466 }
467
468 /* Move the state back into the Perl hash. */
469 void
xspara_get_state(HV * hash)470 xspara_get_state (HV *hash)
471 {
472 /* TODO: The last argument of hv_store would be a precomputed hash, which
473 would save the time of calculating it. */
474 #define STORE(key) hv_store (hash, key, strlen (key), val, 0)
475
476 SV *val;
477
478 /* Don't do anything. */
479 return;
480
481 dTHX; /* Perl boilerplate. */
482
483 val = newSViv (state.end_sentence);
484 STORE("end_sentence");
485
486 val = newSViv (state.counter);
487 STORE("counter");
488
489 val = newSViv (state.word_counter);
490 STORE("word_counter");
491
492 val = newSViv (state.lines_counter);
493 STORE("lines_counter");
494
495 return;
496
497
498 #undef STORE
499 }
500
501
502 /************************************************************************/
503
504
505 /* Append a newline character to RESULT. */
506 void
xspara__cut_line(TEXT * result)507 xspara__cut_line (TEXT *result)
508 {
509 if (!state.ignore_columns)
510 {
511 xspara__end_line ();
512
513 text_append (result, "\n");
514 }
515 }
516
517 int
xspara_end_line_count(void)518 xspara_end_line_count (void)
519 {
520 return state.end_line_count;
521 }
522
523 /* End a line (throwing away a pending space, which we don't need)
524 Note _end_line in Paragraph.pm returned "\n". */
525 void
xspara__end_line(void)526 xspara__end_line (void)
527 {
528 state.counter = 0;
529 state.space.end = 0;
530 state.space_counter = 0;
531
532 /* This will only be true for the first line of output. */
533 if (state.indent_length_next != -1)
534 {
535 state.indent_length = state.indent_length_next;
536 state.indent_length_next = -1;
537 }
538
539 state.lines_counter++;
540 state.end_line_count++;
541 }
542
543 char *
xspara_end_line(void)544 xspara_end_line (void)
545 {
546 state.end_line_count = 0;
547 xspara__end_line ();
548 return "\n";
549 }
550
551 /* Return concatenation of SPACE and WORD. */
552 char *
xspara_get_pending(void)553 xspara_get_pending (void)
554 {
555 static TEXT t;
556 text_reset (&t);
557 text_append_n (&t, state.space.text, state.space.end);
558 text_append_n (&t, state.word.text, state.word.end);
559 return t.text;
560 }
561
562 /* Append to RESULT pending space followed by pending word, clearing them
563 afterwards. Assume we don't need to wrap a line. Only add spaces without a
564 word if ADD_SPACES. */
565 void
xspara__add_pending_word(TEXT * result,int add_spaces)566 xspara__add_pending_word (TEXT *result, int add_spaces)
567 {
568 if (state.word.end == 0 && !state.invisible_pending_word && !add_spaces)
569 return;
570
571 if (state.indent_length > state.counter)
572 {
573 int i;
574 /* If we are not up to the left margin yet, output spaces to get there,
575 and ignore 'state.space', the pending space string. In this case
576 state.counter is probably 0. */
577
578 for (i = 0; i < state.indent_length - state.counter; i++)
579 text_append (result, " ");
580 state.counter = state.indent_length;
581
582 /* Do not output leading spaces after the indent, unless 'unfilled'
583 is on. */
584 if (!state.unfilled)
585 state.space.end = 0;
586 }
587
588 if (state.space.end > 0)
589 {
590 text_append_n (result, state.space.text, state.space.end);
591
592 state.counter += state.space_counter;
593 state.space.end = 0;
594 state.space_counter = 0;
595 }
596
597 if (state.word.end > 0 || state.invisible_pending_word)
598 {
599 text_append_n (result, state.word.text, state.word.end);
600 state.counter += state.word_counter;
601
602 state.word.end = 0;
603 state.word_counter = 0;
604 state.invisible_pending_word = 0;
605 }
606 }
607
608 /* Function for users of this module. */
609 char *
xspara_add_pending_word(int add_spaces)610 xspara_add_pending_word (int add_spaces)
611 {
612 static TEXT ret;
613
614 text_reset (&ret);
615 state.end_line_count = 0;
616 xspara__add_pending_word (&ret, add_spaces);
617 if (ret.text)
618 return ret.text;
619 else
620 return "";
621 }
622
623 /* End a paragraph. */
624 char *
xspara_end(void)625 xspara_end (void)
626 {
627 static TEXT ret;
628 text_reset (&ret);
629 state.end_line_count = 0;
630 xspara__add_pending_word (&ret, state.add_final_space);
631 if (!state.no_final_newline && state.counter != 0)
632 {
633 text_append (&ret, "\n");
634 state.lines_counter++;
635 state.end_line_count++;
636 }
637
638 /* Now it's time to forget about the state. */
639 state_array[current_state].in_use = 0;
640 state.in_use = 0;
641
642 /* Don't do this so we can get the closing line counts. */
643 /* current_state = -1; */
644
645 if (ret.text)
646 return ret.text;
647 else
648 return "";
649 }
650
651 /* Add WORD to paragraph in RESULT, not refilling WORD. If we go past the end
652 of the line start a new one. TRANSPARENT means that the letters in WORD
653 are ignored for the purpose of deciding whether a full stop ends a sentence
654 or not. */
655 void
xspara__add_next(TEXT * result,char * word,int word_len,int transparent)656 xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
657 {
658 int disinhibit = 0;
659 if (!word)
660 return;
661
662 if (word_len >= 1 && word[word_len - 1] == '\b')
663 {
664 word[--word_len] = '\0';
665 disinhibit = 1;
666 }
667
668 text_append_n (&state.word, word, word_len);
669 if (word_len == 0 && word)
670 state.invisible_pending_word = 1;
671
672 if (!transparent)
673 {
674 if (disinhibit)
675 state.last_letter = L'a'; /* a lower-case letter */
676 else
677 {
678 /* Save last character in WORD */
679 char *p = word + word_len;
680 int len = 0;
681 while (p > word)
682 {
683 p--; len++;
684 if ((long) mbrlen(p, len, NULL) > 0)
685 {
686 wchar_t wc = L'\0';
687 mbrtowc (&wc, p, len, NULL);
688 if (!wcschr (L".?!\"')]", wc))
689 {
690 state.last_letter = wc;
691 break;
692 }
693 }
694 }
695
696 }
697 }
698
699 if (strchr (word, '\n'))
700 {
701 /* If there was a newline in the word we just added, put the entire
702 pending ouput in the results string, and start a new line. */
703 xspara__add_pending_word (result, 0);
704 xspara__end_line ();
705 }
706 else
707 {
708 /* The possibility of two-column characters is ignored here. */
709
710 /* Calculate length of multibyte string in characters. */
711 int len = 0;
712 int left = word_len;
713 wchar_t w;
714 char *p = word;
715
716 while (left > 0)
717 {
718 int char_len = mbrtowc (&w, p, left, NULL);
719 left -= char_len;
720 p += char_len;
721 len++;
722 }
723
724 state.word_counter += len;
725 }
726
727 /* TODO: Shift this into the "else" clause above, because
728 xspara__end_line would have set state.counter to 0. */
729 if (state.counter != 0
730 && state.counter + state.word_counter + state.space_counter
731 > state.max)
732 {
733 xspara__cut_line (result);
734 }
735 }
736
737 /* Like _add_next but zero end_line_count at beginning. */
738 char *
xspara_add_next(char * text,int text_len,int transparent)739 xspara_add_next (char *text, int text_len, int transparent)
740 {
741 static TEXT t;
742
743 text_reset (&t);
744 state.end_line_count = 0;
745 xspara__add_next (&t, text, text_len, transparent);
746
747 if (t.space > 0)
748 return t.text;
749 else
750 return "";
751 }
752
753 void
xspara_remove_end_sentence(void)754 xspara_remove_end_sentence (void)
755 {
756 state.end_sentence = 0;
757 }
758
759 void
xspara_add_end_sentence(int value)760 xspara_add_end_sentence (int value)
761 {
762 state.end_sentence = value;
763 }
764
765 void
xspara_allow_end_sentence(void)766 xspara_allow_end_sentence (void)
767 {
768 state.last_letter = L'a'; /* A lower-case letter. */
769 }
770
771 /* -1 in a parameter means leave that value as it is. */
772 void
xspara_set_space_protection(int protect_spaces,int ignore_columns,int keep_end_lines,int french_spacing,int double_width_no_break)773 xspara_set_space_protection (int protect_spaces,
774 int ignore_columns,
775 int keep_end_lines,
776 int french_spacing,
777 int double_width_no_break)
778 {
779 if (protect_spaces != -1)
780 state.protect_spaces = protect_spaces;
781 if (ignore_columns != -1)
782 state.ignore_columns = ignore_columns;
783 if (keep_end_lines != -1)
784 state.keep_end_lines = keep_end_lines;
785 if (double_width_no_break != -1)
786 state.double_width_no_break = double_width_no_break;
787
788 /*fprintf (stderr, "SETTING SPACE (%d, %d, %d, %d)\n",
789 protect_spaces,
790 ignore_columns,
791 keep_end_lines,
792 french_spacing);*/
793
794 /* If at the end of a sentence, and due to output the end of sentence
795 space, and we switch to French spacing, then make the space up to
796 two spaces.
797
798 FIXME: This seems back-to-front: We want two spaces if we switch FROM
799 French spacing. */
800
801 if (state.french_spacing == 0
802 && french_spacing != -1 && french_spacing != 0
803 && state.end_sentence != -2 && state.end_sentence != 0
804 && state.counter != 0
805 && state.space.end > 0
806 && state.word.end == 0 && !state.invisible_pending_word)
807 {
808 while (state.space_counter < 2)
809 {
810 text_append_n (&state.space, " ", 1);
811 state.space_counter++;
812 }
813
814 /* End of sentence done. */
815 state.end_sentence = -2;
816 }
817
818 if (french_spacing != -1)
819 {
820 state.french_spacing = french_spacing;
821 }
822
823 if (protect_spaces != -1 && state.protect_spaces)
824 {
825 if (state.word.end == 0)
826 {
827 /* In _add_pending_word this meant that an "empty word" would
828 be output. This makes "a @w{} b" -> "a b", not "a b", and
829 "a @w{}" at end of paragraph -> "a ", not "a". */
830
831 state.invisible_pending_word = 1;
832 }
833 }
834
835 return;
836 }
837
838 /*****************************************************************/
839
840 /* Return string to be added to paragraph contents, wrapping text. This
841 function relies on there being a UTF-8 locale in LC_CTYPE for mbrtowc to
842 work correctly. */
843 char *
xspara_add_text(char * text)844 xspara_add_text (char *text)
845 {
846 char *p = text;
847 int len;
848 wchar_t wc;
849 size_t char_len;
850 static TEXT result;
851 dTHX;
852
853 text_reset (&result);
854
855 len = strlen (text); /* FIXME: Get this as an argument */
856 state.end_line_count = 0;
857
858 while (len > 0)
859 {
860 char_len = mbrtowc (&wc, p, len, NULL);
861 if ((long) char_len == 0)
862 break; /* Null character. Shouldn't happen. */
863 else if ((long) char_len < 0)
864 {
865 p++; len--; /* Invalid. Just try to keep going. */
866 continue;
867 }
868
869 /* 00A0 and 202F are non-breaking spaces in Unicode. */
870 if (iswspace (wc) && wc != L'\x00a0' && wc != L'\x202f')
871 {
872 state.last_letter = L'\0';
873
874 /* If protect_spaces is on, ... */
875 if (state.protect_spaces)
876 {
877 /* Append the spaces to the pending word. */
878 text_append_n (&state.word, p, char_len);
879 state.word_counter++;
880
881 if (strchr (state.word.text, '\n'))
882 {
883 /* Replace any '\n' with a ' '. Note that state.word_counter
884 will still be correct after this. */
885 char *ptr = state.word.text;
886 while (*ptr)
887 {
888 if (*ptr == '\n')
889 *ptr = ' ';
890 ptr++;
891 }
892 }
893
894 if (state.counter != 0
895 && state.counter + state.word_counter + state.space_counter
896 > state.max)
897 {
898 xspara__cut_line (&result);
899 }
900 }
901 else /* protect_spaces off */
902 {
903 int pending = state.invisible_pending_word;
904 xspara__add_pending_word (&result, 0);
905
906 if (state.counter != 0 || state.unfilled || pending)
907 {
908 /* If we are at the end of a sentence where two spaces
909 are required. */
910 if (state.end_sentence == 1
911 && !state.french_spacing
912 && !state.unfilled)
913 {
914 state.space.end = 0;
915 text_append_n (&state.space, " ", 2);
916 state.space_counter = 2;
917 }
918 else /* Not at end of sentence. */
919 {
920 /* Only save the first space. */
921 if (state.unfilled || state.space_counter < 1)
922 {
923 if (*p == '\n' || *p == '\r')
924 {
925 if (!state.unfilled)
926 {
927 text_append_n (&state.space, " ", 1);
928 state.space_counter++;
929 }
930 else if (*p == '\n')
931 {
932 xspara__add_pending_word (&result, 0);
933 xspara__end_line ();
934 text_append (&result, "\n");
935 }
936 }
937 else
938 {
939 text_append_n (&state.space, p, char_len);
940 state.space_counter++;
941 }
942 }
943 }
944 }
945 }
946
947 /* If not enough space in the line for the pending space, start
948 a new line. */
949 if (state.counter + state.space_counter > state.max)
950 {
951 xspara__cut_line (&result);
952 }
953
954 if (!state.unfilled && *p == '\n' && state.keep_end_lines)
955 {
956 xspara__end_line ();
957 text_append (&result, "\n");
958 }
959 p += char_len; len -= char_len;
960 }
961 else /************** Not a white space character. *****************/
962 {
963 int width = wcwidth (wc);
964 /*************** Double width character. *********************/
965 if (width == 2)
966 {
967 state.last_letter = L'\0';
968
969 /* We allow a line break in between Chinese characters even if
970 there was no space between them, unlike single-width
971 characters. */
972
973 /* Append wc to state.word. */
974 text_append_n (&state.word, p, char_len);
975
976 state.word_counter += 2;
977
978 if (state.counter != 0
979 && state.counter + state.word_counter > state.max)
980 {
981 xspara__cut_line (&result);
982 }
983 /* If protect_spaces is on, accumulate the characters so that
984 they can be pushed onto the next line if necessary. */
985 if (!state.protect_spaces && !state.double_width_no_break)
986 {
987 xspara__add_pending_word (&result, 0);
988 state.end_sentence = -2;
989 }
990 }
991 else if (wc == L'\b')
992 {
993 /* Code to say that a following full stop (or question or
994 exclamation mark) may be an end of sentence. */
995 xspara_allow_end_sentence ();
996 }
997 /*************** Word character ******************************/
998 /* Note: width == 0 includes accent characters which should not
999 properly increase the column count. This is not what the pure
1000 Perl code does, though. */
1001 else if (width == 1 || width == 0)
1002 {
1003 char *added_word;
1004 added_word = malloc (char_len + 1);
1005 memcpy (added_word, p, char_len);
1006 added_word[char_len] = '\0';
1007
1008 xspara__add_next (&result, added_word, char_len, 0);
1009 free (added_word);
1010
1011 /* Now check if it is considered as an end of sentence, and
1012 set state.end_sentence if it is. */
1013
1014 if (strchr (".?!", *p) && !state.unfilled)
1015 {
1016 /* Doesn't count if preceded by an upper-case letter. */
1017 if (!iswupper (state.last_letter))
1018 {
1019 if (state.french_spacing)
1020 state.end_sentence = -1;
1021 else
1022 state.end_sentence = 1;
1023 }
1024 }
1025 else if (strchr ("\"')]", *p))
1026 {
1027 /* '"', '\'', ']' and ')' are ignored for the purpose
1028 of deciding whether a full stop ends a sentence. */
1029 }
1030 else
1031 {
1032 /* Otherwise reset the end of sentence marker: a full stop in
1033 a string like "aaaa.bbbb" doesn't mark an end of
1034 sentence. */
1035 state.end_sentence = -2;
1036 state.last_letter = wc;
1037 }
1038 }
1039 else
1040 {
1041 /* Not printable, possibly a tab, or a combining character.
1042 Add it to the pending word without increasing the column
1043 count. */
1044 text_append_n (&state.word, p, char_len);
1045 }
1046 p += char_len; len -= char_len;
1047 }
1048 }
1049
1050 if (result.space > 0)
1051 return result.text;
1052 else
1053 return "";
1054 }
1055
1056
1057