1 /* Copyright 2010-2021 Free Software Foundation, Inc.
2 
3    This program is free software: you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation, either version 3 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
15 
16 #ifdef HAVE_CONFIG_H
17   #include <config.h>
18 #endif
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <string.h>
22 #include <locale.h>
23 #ifndef _WIN32
24 #include <langinfo.h>
25 #else  /* _WIN32 */
26 /* Workaround for problems caused in mingw.org's MinGW build by
27    Gnulib's wchar.h overriding the wint_t type definition, which
28    causes compilation errors when perl.h is included below, because
29    perl.h includes ctype.h.  */
30 #include <ctype.h>
31 #endif
32 #include <wchar.h>
33 #include <wctype.h>
34 
35 /* See "How do I use all this in extensions" in 'man perlguts'. */
36 #define PERL_NO_GET_CONTEXT
37 
38 #include "EXTERN.h"
39 #include "perl.h"
40 #include "XSUB.h"
41 
42 #include "ppport.h"
43 
44 #include "xspara.h"
45 
46 #include "text.h"
47 
48 typedef struct {
49     TEXT space; /* Pending space, to be output before the pending word. */
50     TEXT word; /* Pending word.  If outputting this would have led to
51                   the line to be too long, the line should have been cut before
52                   saving it. */
53 
54     /* When word.end == 0, this indicates a word of length 0. */
55     int invisible_pending_word;
56 
57     /* Length of space in multibyte characters. */
58     int space_counter;
59 
60     /* Characters added so far in current word. */
61     int word_counter;
62 
63     /* -2 means we are not at the end of a sentence (undefined in Perl),
64        1 means we are at the end of a sentence and French spacing is off,
65        -1 means we are at the end of a sentence and French spacing is on.
66        0 means it is "inhibited". */
67     int end_sentence;
68 
69     int max; /* Maximum length of line. */
70     int indent_length; /* Columns to indent this line. */
71     int indent_length_next; /* Columns to indent the rest of the lines. */
72     int counter; /* Columns so far on this line. */
73 
74     int lines_counter; /* Lines so far added in paragraph. */
75     int end_line_count; /* Number of newlines so far in an output unit, i.e.
76                            with add_text or add_next. */
77 
78     wint_t last_letter; /* Last letter in word, used to decide if we're
79                             at the end of a sentence. */
80 
81     /* Options set with set_space_protection. */
82     int protect_spaces; /* Line break forbidden, as in @w. */
83     int ignore_columns; /* Don't cut line at right margin.  Used by
84                            @flushleft and @flushright. */
85     int keep_end_lines; /* A newline in the input ends a line in the output.
86                            Used by @flushleft and @flushright. */
87     int french_spacing; /* Only one space, not two, after a full stop. */
88     int double_width_no_break; /* No line break between double width chars. */
89 
90     /* No wrapping of lines and spaces are kept as-is. */
91     int unfilled;
92 
93     /* Do not terminate with a final newline. */
94     int no_final_newline;
95 
96     /* Terminate with any trailing space. */
97     int add_final_space;
98 
99     int in_use;
100 } PARAGRAPH;
101 
102 static PARAGRAPH state;
103 
104 #ifdef _WIN32
105 
106 #define WIN32_LEAN_AND_MEAN
107 #include <windows.h>
108 #include <errno.h>
109 
110 /* If Gnulib overrides wint_t with a wider type, we cannot use
111    iswspace etc. names, whose prototypes were seen with the original
112    wint_t in effect.  */
113 #ifdef GNULIB_defined_wint_t
114 # undef iswspace
115 # define iswspace(w) w32_iswspace(w)
116 # undef iswupper
117 # define iswupper(w) w32_iswupper(w)
118 #endif
119 
/* Replacement for setlocale on MS-Windows.

   CATEGORY is passed through to the C runtime's setlocale.  VALUE is the
   requested locale name:
     - a null VALUE is a query for the current locale and is forwarded
       unchanged to the runtime;
     - "en_US.UTF-8" (compared case-insensitively) selects the Windows
       U.S. English locale;
     - anything else is refused and NULL is returned.

   Returns whatever the runtime's setlocale returns, or NULL on refusal. */
char *
w32_setlocale (int category, const char *value)
{
  /* Callers use a null VALUE to ask for the current locale name; it must
     not reach _stricmp, which requires valid string arguments.  */
  if (value == NULL)
    return setlocale (category, NULL);

  if (_stricmp (value, "en_us.utf-8") != 0)
    return NULL;

  /* Switch to the Windows U.S. English locale with its default
     codeset.  We will handle the non-ASCII text ourselves, so the
     codeset is unimportant, and Windows doesn't support UTF-8 as the
     codeset anyway.  */
  return setlocale (category, "ENU");
}
132 #define setlocale(c,v)  w32_setlocale(c,v)
133 
/* Minimal UTF-8-only replacement for mbrlen.

   Returns the length in bytes of the character starting at MBS, judged
   from its first byte only (1 to 4).  N is not consulted.  A null MBS
   yields 0.  Conversion states are not supported: if PS is not null,
   errno is set to ENOSYS and (size_t) -1 is returned.  */
size_t
mbrlen (const char * __restrict__ mbs, size_t n, mbstate_t * __restrict__ ps)
{
  unsigned char byte1;

  if (ps != NULL)
    {
      errno = ENOSYS;
      return (size_t) -1;
    }

  /* Nothing to measure; do not dereference a null pointer.  */
  if (mbs == NULL)
    return 0;

  byte1 = *mbs;

  /* The high bits of the lead byte encode the sequence length.  */
  return
    ((byte1 & 0x80) == 0) ? 1 : ((byte1 & 0x20) == 0) ? 2 :
    ((byte1 & 0x10) == 0) ? 3 : 4;
}
149 
150 /* Convert a UTF-8 encoded multibyte string to a wide character.  */
151 size_t
mbrtowc(wchar_t * __restrict__ pwc,const char * __restrict__ mbs,size_t n,mbstate_t * __restrict__ ps)152 mbrtowc (wchar_t * __restrict__ pwc, const char * __restrict__ mbs, size_t n,
153 	 mbstate_t * __restrict__ ps)
154 {
155   int len = mbrlen (mbs, n, ps);
156 
157   if (mbs == NULL)
158     return 0;
159   else
160     {
161       wchar_t wc[2];
162       size_t n_utf16 = MultiByteToWideChar (CP_UTF8, MB_ERR_INVALID_CHARS,
163 					    mbs, len, wc, 2);
164       if (n_utf16 == 0)
165 	{
166 	  errno = EILSEQ;
167 	  return (size_t)-1;
168 	}
169       if (ps != NULL)
170 	{
171 	  errno = ENOSYS;
172 	  return (size_t)-1;
173 	}
174       /* We don't support UTF-16 surrogates, because the calling code
175 	 doesn't, and because character classification functions on
176 	 Windows don't support anything beyond the BMP anyway.  So we
177 	 return the first character of the surrogate pair and set
178 	 errno.  */
179       if (n_utf16 > 1)
180 	errno = ENOSYS;
181       if (pwc != NULL)
182 	*pwc = wc[0];
183 
184       return len;
185     }
186 }
187 
/* Return 1 if WC is one of the white space code points listed below,
   0 otherwise.  See Unicode's Proplist.txt.  */
int
iswspace (wint_t wc)
{
  /* Contiguous ranges first.  */
  if (wc >= 0x09 && wc <= 0x0D)
    return 1;
  if (wc >= 0x2000 && wc <= 0x200A)
    return 1;

  /* Isolated code points.  */
  switch (wc)
    {
    case 0x20:
    case 0x85:
    case 0xA0:
    case 0x1680:
    case 0x2028:
    case 0x2029:
    case 0x202F:
    case 0x205F:
    case 0x3000:
      return 1;
    default:
      return 0;
    }
}
207 
/* FIXME: Provide a real implementation.

   Placeholder column width: 0 for the null character and 1 for every
   other character.  */
int
wcwidth (const wchar_t wc)
{
  if (wc == 0)
    return 0;

  return 1;
}
214 
215 int
iswupper(wint_t wi)216 iswupper (wint_t wi)
217 {
218   WORD char_type;
219   wchar_t wc = wi;
220   BOOL status = GetStringTypeW (CT_CTYPE1, &wc, 1, &char_type);
221 
222   if (!status || (char_type & C1_UPPER) == 0)
223     return 0;
224 
225   return 1;
226 }
227 
228 /* Avoid warnings due to redefinition of popen/pclose in Perl headers.  */
229 #ifdef popen
230 # undef popen
231 # define popen(c,m) _popen(c,m)
232 #endif
233 #ifdef pclose
234 # undef pclose
235 # define pclose(f)  _pclose(f)
236 #endif
237 
238 #endif
239 
240 int
xspara_init(int unused,char * unused2)241 xspara_init (int unused, char *unused2)
242 {
243   char *utf8_locale = 0;
244   int len;
245   char *cur;
246   char *dot;
247 
248   dTHX;
249 
250 #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
251   /* needed due to thread-safe locale handling in newer perls */
252   switch_to_global_locale();
253 #endif
254 
255   if (setlocale (LC_CTYPE, "en_US.UTF-8")
256       || setlocale (LC_CTYPE, "en_US.utf8"))
257     goto success;
258 
259   cur = setlocale (LC_CTYPE, 0); /* Name of current locale. */
260   if (!cur)
261     goto failure;
262   len = strlen (cur);
263   if (len >= 6 && !memcmp (".UTF-8", cur + len - 6, 6)
264       || len >= 5 && !memcmp (".utf8", cur + len - 5, 5)
265       || len >= 6 && !memcmp (".utf-8", cur + len - 6, 6)
266       || len >= 5 && !memcmp (".UTF8", cur + len - 5, 5))
267     {
268       setlocale (LC_CTYPE, ""); /* Use the locale from the environment. */
269       goto success;
270     }
271 
272   /* Otherwise try altering the current locale name. */
273   dot = strchr (cur, '.');
274   if (!dot)
275     dot = cur + len;
276   utf8_locale = malloc (len + 6 + 1); /* enough to add ".UTF-8" to end */
277   memcpy (utf8_locale, cur, dot - cur);
278   dot = utf8_locale + (dot - cur);
279   memcpy (dot, ".UTF-8", 7);
280   if (setlocale (LC_CTYPE, utf8_locale))
281     goto success;
282 
283   memcpy (dot, ".utf8", 6);
284   if (setlocale (LC_CTYPE, utf8_locale))
285     goto success;
286 
287   /* Otherwise, look for any UTF-8 locale in the output of "locale -a". */
288   {
289   FILE *p;
290   char *line = 0;
291   size_t n = 0;
292   ssize_t ret;
293   p = popen ("locale -a", "r");
294   if (!p)
295     goto failure;
296   while (1)
297     {
298       ret = getline (&line, &n, p);
299       if (ret == (ssize_t) -1)
300         {
301           free (line);
302           pclose (p);
303           goto failure;
304         }
305       if (strstr (line, "UTF-8") || strstr (line, "utf8"))
306         {
307           line[ret - 1] = '\0';   /* Remove trailing newline. */
308           if (setlocale (LC_CTYPE, line))
309             {
310               free (line);
311               pclose (p);
312               goto success;
313             }
314         }
315     }
316   }
317 
318   if (1)
319     {
320 failure:
321       return 0; /* failure */
322     }
323   else
324     {
325 success: ;
326       free (utf8_locale);
327 #if PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION > 8)
328       /* needed due to thread-safe locale handling in newer perls */
329       sync_locale();
330 #endif
331       /*
332       fprintf (stderr, "tried to set LC_CTYPE to UTF-8.\n");
333       fprintf (stderr, "character encoding is: %s\n",
334                nl_langinfo (CODESET));
335        */
336       return 1; /* success */
337     }
338 }
339 
340 /* Array for storing paragraph states which aren't in use. */
341 static PARAGRAPH *state_array;
342 static int state_array_size;
343 
344 /* The slot in state_array for saving the current state. */
345 static int current_state;
346 
347 static void
xspara__switch_state(int id)348 xspara__switch_state (int id)
349 {
350   if (current_state == id)
351     return;
352   if (current_state != -1)
353     memcpy (&state_array[current_state], &state, sizeof (PARAGRAPH));
354 
355   memcpy (&state, &state_array[id], sizeof (PARAGRAPH));
356   current_state = id;
357 }
358 
359 int
xspara_new(HV * conf)360 xspara_new (HV *conf)
361 {
362   int i;
363 
364   dTHX; /* Perl boiler plate */
365 
366   TEXT saved_space, saved_word;
367 
368   /* Find an unused slot in state_array */
369   for (i = 0; i < state_array_size; i++)
370     {
371       if (!state_array[i].in_use)
372         break;
373     }
374   if (i == state_array_size)
375     {
376       state_array = realloc (state_array,
377                              (state_array_size += 10) * sizeof (PARAGRAPH));
378       memset (state_array + i, 0, 10 * sizeof (PARAGRAPH));
379     }
380 
381   state_array[i].in_use = 1;
382   xspara__switch_state (i);
383 
384   /* Zero formatter, reusing storage. */
385   saved_space = state.space;
386   saved_word = state.word;
387   memset (&state, 0, sizeof (state));
388   state.space = saved_space;
389   state.word = saved_word;
390   state.space.end = state.word.end = 0;
391   state.in_use = 1;
392 
393   /* Default values. */
394   state.max = 72;
395   state.indent_length_next = -1; /* Special value meaning undefined. */
396   state.end_sentence = -2; /* Special value meaning undefined. */
397   state.last_letter = L'\0';
398 
399   if (conf)
400     xspara_init_state (conf);
401 
402   /* The paragraph ID. */
403   return i;
404 }
405 
406 
407 /* SV is a blessed reference to an integer containing the paragraph ID. */
408 void
xspara_set_state(SV * sv)409 xspara_set_state (SV *sv)
410 {
411   dTHX;
412 
413   xspara__switch_state (SvIV (sv));
414 }
415 
416 /* Set the state internal to this C module from the Perl hash. */
417 void
xspara_init_state(HV * hash)418 xspara_init_state (HV *hash)
419 {
420 #define FETCH(key) hv_fetch (hash, key, strlen (key), 0)
421 #define FETCH_INT(key,where) { val = FETCH(key); \
422                                if (val) { where = SvIV (*val); } }
423 
424   SV **val;
425 
426   dTHX; /* This is boilerplate for interacting with Perl. */
427 
428   /* Fetch all these so they are set, and reset for each paragraph. */
429   FETCH_INT("end_sentence", state.end_sentence);
430   FETCH_INT("max", state.max);
431 
432   FETCH_INT("indent_length", state.indent_length);
433   FETCH_INT("indent_length_next", state.indent_length_next);
434   FETCH_INT("counter", state.counter);
435 
436   FETCH_INT("word_counter", state.word_counter);
437 
438   FETCH_INT("lines_counter", state.lines_counter);
439   FETCH_INT("end_line_count", state.end_line_count);
440 
441   FETCH_INT("protect_spaces", state.protect_spaces);
442   FETCH_INT("ignore_columns", state.ignore_columns);
443   FETCH_INT("keep_end_lines", state.keep_end_lines);
444   FETCH_INT("frenchspacing", state.french_spacing);
445 
446   FETCH_INT("unfilled", state.unfilled);
447   FETCH_INT("no_final_newline", state.no_final_newline);
448   FETCH_INT("add_final_space", state.add_final_space);
449 
450   val = FETCH("word");
451   if (val)
452     {
453       fprintf (stderr, "Bug: setting 'word' is not supported.\n");
454       abort ();
455     }
456   val = FETCH("space");
457   if (val)
458     {
459       fprintf (stderr, "Bug: setting 'space' is not supported.\n");
460       abort ();
461     }
462   return;
463 
464 #undef FETCH
465 #undef FETCH_INT
466 }
467 
468 /* Move the state back into the Perl hash. */
469 void
xspara_get_state(HV * hash)470 xspara_get_state (HV *hash)
471 {
472   /* TODO: The last argument of hv_store would be a precomputed hash, which
473      would save the time of calculating it. */
474 #define STORE(key) hv_store (hash, key, strlen (key), val, 0)
475 
476   SV *val;
477 
478   /* Don't do anything. */
479   return;
480 
481   dTHX; /* Perl boilerplate. */
482 
483   val = newSViv (state.end_sentence);
484   STORE("end_sentence");
485 
486   val = newSViv (state.counter);
487   STORE("counter");
488 
489   val = newSViv (state.word_counter);
490   STORE("word_counter");
491 
492   val = newSViv (state.lines_counter);
493   STORE("lines_counter");
494 
495   return;
496 
497 
498 #undef STORE
499 }
500 
501 
502 /************************************************************************/
503 
504 
505 /* Append a newline character to RESULT. */
506 void
xspara__cut_line(TEXT * result)507 xspara__cut_line (TEXT *result)
508 {
509   if (!state.ignore_columns)
510     {
511       xspara__end_line ();
512 
513       text_append (result, "\n");
514     }
515 }
516 
517 int
xspara_end_line_count(void)518 xspara_end_line_count (void)
519 {
520   return state.end_line_count;
521 }
522 
523 /* End a line (throwing away a pending space, which we don't need)
524    Note _end_line in Paragraph.pm returned "\n". */
525 void
xspara__end_line(void)526 xspara__end_line (void)
527 {
528   state.counter = 0;
529   state.space.end = 0;
530   state.space_counter = 0;
531 
532   /* This will only be true for the first line of output. */
533   if (state.indent_length_next != -1)
534     {
535       state.indent_length = state.indent_length_next;
536       state.indent_length_next = -1;
537     }
538 
539   state.lines_counter++;
540   state.end_line_count++;
541 }
542 
543 char *
xspara_end_line(void)544 xspara_end_line (void)
545 {
546   state.end_line_count = 0;
547   xspara__end_line ();
548   return "\n";
549 }
550 
551 /* Return concatenation of SPACE and WORD. */
552 char *
xspara_get_pending(void)553 xspara_get_pending (void)
554 {
555   static TEXT t;
556   text_reset (&t);
557   text_append_n (&t, state.space.text, state.space.end);
558   text_append_n (&t, state.word.text, state.word.end);
559   return t.text;
560 }
561 
562 /* Append to RESULT pending space followed by pending word, clearing them
563    afterwards.  Assume we don't need to wrap a line.  Only add spaces without a
564    word if ADD_SPACES. */
565 void
xspara__add_pending_word(TEXT * result,int add_spaces)566 xspara__add_pending_word (TEXT *result, int add_spaces)
567 {
568   if (state.word.end == 0 && !state.invisible_pending_word && !add_spaces)
569     return;
570 
571   if (state.indent_length > state.counter)
572     {
573       int i;
574       /* If we are not up to the left margin yet, output spaces to get there,
575          and ignore 'state.space', the pending space string.  In this case
576          state.counter is probably 0.  */
577 
578       for (i = 0; i < state.indent_length - state.counter; i++)
579         text_append (result, " ");
580       state.counter = state.indent_length;
581 
582       /* Do not output leading spaces after the indent, unless 'unfilled'
583          is on.  */
584       if (!state.unfilled)
585         state.space.end = 0;
586     }
587 
588   if (state.space.end > 0)
589     {
590       text_append_n (result, state.space.text, state.space.end);
591 
592       state.counter += state.space_counter;
593       state.space.end = 0;
594       state.space_counter = 0;
595     }
596 
597   if (state.word.end > 0 || state.invisible_pending_word)
598     {
599       text_append_n (result, state.word.text, state.word.end);
600       state.counter += state.word_counter;
601 
602       state.word.end = 0;
603       state.word_counter = 0;
604       state.invisible_pending_word = 0;
605     }
606 }
607 
608 /* Function for users of this module. */
609 char *
xspara_add_pending_word(int add_spaces)610 xspara_add_pending_word (int add_spaces)
611 {
612   static TEXT ret;
613 
614   text_reset (&ret);
615   state.end_line_count = 0;
616   xspara__add_pending_word (&ret, add_spaces);
617   if (ret.text)
618     return ret.text;
619   else
620     return "";
621 }
622 
623 /* End a paragraph. */
624 char *
xspara_end(void)625 xspara_end (void)
626 {
627   static TEXT ret;
628   text_reset (&ret);
629   state.end_line_count = 0;
630   xspara__add_pending_word (&ret, state.add_final_space);
631   if (!state.no_final_newline && state.counter != 0)
632     {
633       text_append (&ret, "\n");
634       state.lines_counter++;
635       state.end_line_count++;
636     }
637 
638   /* Now it's time to forget about the state. */
639   state_array[current_state].in_use = 0;
640   state.in_use = 0;
641 
642   /* Don't do this so we can get the closing line counts. */
643   /* current_state = -1; */
644 
645   if (ret.text)
646     return ret.text;
647   else
648     return "";
649 }
650 
651 /* Add WORD to paragraph in RESULT, not refilling WORD.  If we go past the end
652    of the line start a new one.  TRANSPARENT means that the letters in WORD
653    are ignored for the purpose of deciding whether a full stop ends a sentence
654    or not. */
655 void
xspara__add_next(TEXT * result,char * word,int word_len,int transparent)656 xspara__add_next (TEXT *result, char *word, int word_len, int transparent)
657 {
658   int disinhibit = 0;
659   if (!word)
660     return;
661 
662   if (word_len >= 1 && word[word_len - 1] == '\b')
663     {
664       word[--word_len] = '\0';
665       disinhibit = 1;
666     }
667 
668   text_append_n (&state.word, word, word_len);
669   if (word_len == 0 && word)
670     state.invisible_pending_word = 1;
671 
672   if (!transparent)
673     {
674       if (disinhibit)
675         state.last_letter = L'a'; /* a lower-case letter */
676       else
677         {
678           /* Save last character in WORD */
679           char *p = word + word_len;
680           int len = 0;
681           while (p > word)
682             {
683               p--; len++;
684               if ((long) mbrlen(p, len, NULL) > 0)
685                 {
686                   wchar_t wc = L'\0';
687                   mbrtowc (&wc, p, len, NULL);
688                   if (!wcschr (L".?!\"')]", wc))
689                     {
690                       state.last_letter = wc;
691                       break;
692                     }
693                 }
694             }
695 
696         }
697     }
698 
699   if (strchr (word, '\n'))
700     {
701       /* If there was a newline in the word we just added, put the entire
702          pending ouput in the results string, and start a new line. */
703       xspara__add_pending_word (result, 0);
704       xspara__end_line ();
705     }
706   else
707     {
708       /* The possibility of two-column characters is ignored here. */
709 
710       /* Calculate length of multibyte string in characters. */
711       int len = 0;
712       int left = word_len;
713       wchar_t w;
714       char *p = word;
715 
716       while (left > 0)
717         {
718           int char_len = mbrtowc (&w, p, left, NULL);
719           left -= char_len;
720           p += char_len;
721           len++;
722         }
723 
724       state.word_counter += len;
725     }
726 
727   /* TODO: Shift this into the "else" clause above, because
728      xspara__end_line would have set state.counter to 0. */
729   if (state.counter != 0
730       && state.counter + state.word_counter + state.space_counter
731           > state.max)
732     {
733       xspara__cut_line (result);
734     }
735 }
736 
737 /* Like _add_next but zero end_line_count at beginning. */
738 char *
xspara_add_next(char * text,int text_len,int transparent)739 xspara_add_next (char *text, int text_len, int transparent)
740 {
741   static TEXT t;
742 
743   text_reset (&t);
744   state.end_line_count = 0;
745   xspara__add_next (&t, text, text_len, transparent);
746 
747   if (t.space > 0)
748     return t.text;
749   else
750     return "";
751 }
752 
753 void
xspara_remove_end_sentence(void)754 xspara_remove_end_sentence (void)
755 {
756   state.end_sentence = 0;
757 }
758 
759 void
xspara_add_end_sentence(int value)760 xspara_add_end_sentence (int value)
761 {
762   state.end_sentence = value;
763 }
764 
765 void
xspara_allow_end_sentence(void)766 xspara_allow_end_sentence (void)
767 {
768   state.last_letter = L'a'; /* A lower-case letter. */
769 }
770 
771 /* -1 in a parameter means leave that value as it is. */
772 void
xspara_set_space_protection(int protect_spaces,int ignore_columns,int keep_end_lines,int french_spacing,int double_width_no_break)773 xspara_set_space_protection (int protect_spaces,
774                              int ignore_columns,
775                              int keep_end_lines,
776                              int french_spacing,
777                              int double_width_no_break)
778 {
779   if (protect_spaces != -1)
780     state.protect_spaces = protect_spaces;
781   if (ignore_columns != -1)
782     state.ignore_columns = ignore_columns;
783   if (keep_end_lines != -1)
784     state.keep_end_lines = keep_end_lines;
785   if (double_width_no_break != -1)
786     state.double_width_no_break = double_width_no_break;
787 
788   /*fprintf (stderr, "SETTING SPACE (%d, %d, %d, %d)\n",
789                                    protect_spaces,
790                                    ignore_columns,
791                                    keep_end_lines,
792                                    french_spacing);*/
793 
794   /* If at the end of a sentence, and due to output the end of sentence
795      space, and we switch to French spacing, then make the space up to
796      two spaces.
797 
798      FIXME: This seems back-to-front: We want two spaces if we switch FROM
799      French spacing. */
800 
801   if (state.french_spacing == 0
802       && french_spacing != -1 && french_spacing != 0
803       && state.end_sentence != -2 && state.end_sentence != 0
804       && state.counter != 0
805       && state.space.end > 0
806       && state.word.end == 0 && !state.invisible_pending_word)
807     {
808       while (state.space_counter < 2)
809         {
810           text_append_n (&state.space, " ", 1);
811           state.space_counter++;
812         }
813 
814       /* End of sentence done. */
815       state.end_sentence = -2;
816     }
817 
818   if (french_spacing != -1)
819     {
820       state.french_spacing = french_spacing;
821     }
822 
823  if (protect_spaces != -1 && state.protect_spaces)
824    {
825      if (state.word.end == 0)
826        {
827          /* In _add_pending_word this meant that an "empty word" would
828             be output.  This makes "a @w{} b" -> "a  b", not "a b", and
829             "a @w{}" at end of paragraph -> "a ", not "a". */
830 
831          state.invisible_pending_word = 1;
832        }
833    }
834 
835  return;
836 }
837 
838 /*****************************************************************/
839 
840 /* Return string to be added to paragraph contents, wrapping text. This
841    function relies on there being a UTF-8 locale in LC_CTYPE for mbrtowc to
842    work correctly. */
843 char *
xspara_add_text(char * text)844 xspara_add_text (char *text)
845 {
846   char *p = text;
847   int len;
848   wchar_t wc;
849   size_t char_len;
850   static TEXT result;
851   dTHX;
852 
853   text_reset (&result);
854 
855   len = strlen (text); /* FIXME: Get this as an argument */
856   state.end_line_count = 0;
857 
858   while (len > 0)
859     {
860       char_len = mbrtowc (&wc, p, len, NULL);
861       if ((long) char_len == 0)
862         break; /* Null character. Shouldn't happen. */
863       else if ((long) char_len < 0)
864         {
865           p++; len--; /* Invalid.  Just try to keep going. */
866           continue;
867         }
868 
869       /* 00A0 and 202F are non-breaking spaces in Unicode. */
870       if (iswspace (wc) && wc != L'\x00a0' && wc != L'\x202f')
871         {
872           state.last_letter = L'\0';
873 
874           /* If protect_spaces is on, ... */
875           if (state.protect_spaces)
876             {
877               /* Append the spaces to the pending word. */
878               text_append_n (&state.word, p, char_len);
879               state.word_counter++;
880 
881               if (strchr (state.word.text, '\n'))
882                 {
883                   /* Replace any '\n' with a ' '. Note that state.word_counter
884                      will still be correct after this. */
885                   char *ptr = state.word.text;
886                   while (*ptr)
887                     {
888                       if (*ptr == '\n')
889                         *ptr = ' ';
890                       ptr++;
891                     }
892                 }
893 
894               if (state.counter != 0
895                   && state.counter + state.word_counter + state.space_counter
896                      > state.max)
897                 {
898                   xspara__cut_line (&result);
899                 }
900             }
901           else /* protect_spaces off */
902             {
903               int pending = state.invisible_pending_word;
904               xspara__add_pending_word (&result, 0);
905 
906               if (state.counter != 0 || state.unfilled || pending)
907                 {
908                   /* If we are at the end of a sentence where two spaces
909                      are required. */
910                   if (state.end_sentence == 1
911                       && !state.french_spacing
912                       && !state.unfilled)
913                     {
914                       state.space.end = 0;
915                       text_append_n (&state.space, "  ", 2);
916                       state.space_counter = 2;
917                     }
918                   else /* Not at end of sentence. */
919                     {
920                       /* Only save the first space. */
921                       if (state.unfilled || state.space_counter < 1)
922                         {
923                           if (*p == '\n' || *p == '\r')
924                             {
925                               if (!state.unfilled)
926                                 {
927                                   text_append_n (&state.space, " ", 1);
928                                   state.space_counter++;
929                                 }
930                               else if (*p == '\n')
931                                 {
932                                   xspara__add_pending_word (&result, 0);
933                                   xspara__end_line ();
934                                   text_append (&result, "\n");
935                                 }
936                             }
937                           else
938                             {
939                               text_append_n (&state.space, p, char_len);
940                               state.space_counter++;
941                             }
942                         }
943                     }
944                 }
945             }
946 
947           /* If not enough space in the line for the pending space, start
948              a new line. */
949           if (state.counter + state.space_counter > state.max)
950             {
951               xspara__cut_line (&result);
952             }
953 
954           if (!state.unfilled && *p == '\n' && state.keep_end_lines)
955             {
956               xspara__end_line ();
957               text_append (&result, "\n");
958             }
959           p += char_len; len -= char_len;
960         }
961       else /************** Not a white space character. *****************/
962         {
963           int width = wcwidth (wc);
964           /*************** Double width character. *********************/
965           if (width == 2)
966             {
967               state.last_letter = L'\0';
968 
969               /* We allow a line break in between Chinese characters even if
970                  there was no space between them, unlike single-width
971                  characters. */
972 
973               /* Append wc to state.word. */
974               text_append_n (&state.word, p, char_len);
975 
976               state.word_counter += 2;
977 
978               if (state.counter != 0
979                   && state.counter + state.word_counter > state.max)
980                 {
981                   xspara__cut_line (&result);
982                 }
983               /* If protect_spaces is on, accumulate the characters so that
984                  they can be pushed onto the next line if necessary. */
985               if (!state.protect_spaces && !state.double_width_no_break)
986                 {
987                   xspara__add_pending_word (&result, 0);
988                   state.end_sentence = -2;
989                 }
990             }
991           else if (wc == L'\b')
992             {
993               /* Code to say that a following full stop (or question or
994                  exclamation mark) may be an end of sentence. */
995               xspara_allow_end_sentence ();
996             }
997           /*************** Word character ******************************/
998           /* Note: width == 0 includes accent characters which should not
999              properly increase the column count.  This is not what the pure
1000              Perl code does, though. */
1001           else if (width == 1 || width == 0)
1002             {
1003               char *added_word;
1004               added_word = malloc (char_len + 1);
1005               memcpy (added_word, p, char_len);
1006               added_word[char_len] = '\0';
1007 
1008               xspara__add_next (&result, added_word, char_len, 0);
1009               free (added_word);
1010 
1011               /* Now check if it is considered as an end of sentence, and
1012                  set state.end_sentence if it is. */
1013 
1014               if (strchr (".?!", *p) && !state.unfilled)
1015                 {
1016                   /* Doesn't count if preceded by an upper-case letter. */
1017                   if (!iswupper (state.last_letter))
1018                     {
1019                       if (state.french_spacing)
1020                         state.end_sentence = -1;
1021                       else
1022                         state.end_sentence = 1;
1023                     }
1024                 }
1025               else if (strchr ("\"')]", *p))
1026                 {
1027                   /* '"', '\'', ']' and ')' are ignored for the purpose
1028                    of deciding whether a full stop ends a sentence. */
1029                 }
1030               else
1031                 {
1032                   /* Otherwise reset the end of sentence marker: a full stop in
1033                      a string like "aaaa.bbbb" doesn't mark an end of
1034                      sentence. */
1035                   state.end_sentence = -2;
1036                   state.last_letter = wc;
1037                 }
1038             }
1039           else
1040             {
1041               /* Not printable, possibly a tab, or a combining character.
1042                  Add it to the pending word without increasing the column
1043                  count. */
1044               text_append_n (&state.word, p, char_len);
1045             }
1046           p += char_len; len -= char_len;
1047         }
1048     }
1049 
1050   if (result.space > 0)
1051     return result.text;
1052   else
1053     return "";
1054 }
1055 
1056 
1057