1 /* Copyright 2010-2019 Free Software Foundation, Inc.
2 
3    This program is free software: you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation, either version 3 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
15 
16 #ifdef HAVE_CONFIG_H
17   #include <config.h>
18 #endif
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <string.h>
22 #include <locale.h>
23 #ifndef _WIN32
24 #include <langinfo.h>
25 #else  /* _WIN32 */
26 /* Workaround for problems caused in mingw.org's MinGW build by
27    Gnulib's wchar.h overriding the wint_t type definition, which
28    causes compilation errors when perl.h is included below, because
29    perl.h includes ctype.h.  */
30 #include <ctype.h>
31 #endif
32 #include <wchar.h>
33 #include <wctype.h>
34 
35 /* See "How do I use all this in extensions" in 'man perlguts'. */
36 #define PERL_NO_GET_CONTEXT
37 
38 #include "EXTERN.h"
39 #include "perl.h"
40 #include "XSUB.h"
41 
42 #include "ppport.h"
43 
44 #include "miscxs.h"
45 
/* Bytes treated as whitespace when xs_merge_text measures the leading
   blank run of a piece of text (passed to strspn there). */
const char *whitespace_chars = " \t\f\v\r\n";
47 
48 int
xs_abort_empty_line(HV * self,HV * current,SV * additional_spaces_in)49 xs_abort_empty_line (HV *self, HV *current, SV *additional_spaces_in)
50 {
51   char *additional_spaces;
52   AV *contents_array;
53   SV **svp;
54   int contents_num;
55   HV *spaces_elt;
56   //char *key;
57   HV *test_extra = 0;
58   HV *command_extra = 0;
59 
60   HV *owning_elt = 0;
61   char *type;
62   SV *existing_text_sv;
63 
64   dTHX;
65 
66   /* Get additional text in UTF-8. */
67   if (additional_spaces_in)
68     {
69       STRLEN len;
70       static char *new_string;
71       additional_spaces = SvPV (additional_spaces_in, len);
72       if (!SvUTF8 (additional_spaces_in))
73         {
74           Safefree (new_string);
75           new_string = bytes_to_utf8 (additional_spaces, &len);
76           additional_spaces = new_string;
77         }
78     }
79   else
80     additional_spaces = "";
81 
82   svp = hv_fetch (current, "contents", strlen("contents"), 0);
83   if (!svp)
84     return 0;
85   contents_array = (AV *)SvRV(*svp);
86 
87   contents_num = av_len(contents_array) + 1;
88   if (contents_num == 0)
89     return 0;
90 
91   spaces_elt = (HV *) SvRV (*av_fetch (contents_array, contents_num - 1, 0));
92 
93   svp = hv_fetch (spaces_elt, "type", strlen ("type"), 0);
94   if (!svp)
95     return 0;
96 
97   type = SvPV_nolen (*svp);
98   if (!type)
99     return 0;
100 
101   /* Must be one of these types to continue. */
102   if (strcmp (type, "empty_line")
103        && strcmp (type, "empty_line_after_command")
104        && strcmp (type, "empty_spaces_before_argument")
105        && strcmp (type, "empty_spaces_after_close_brace"))
106     {
107       return 0;
108     }
109 
110   //fprintf (stderr, "ABORT EMPTY\n");
111 
112   svp = hv_fetch (spaces_elt, "extra", strlen ("extra"), 0);
113   if (svp)
114     {
115       test_extra = (HV *) SvRV (*svp);
116       svp = hv_fetch (test_extra, "command",
117                       strlen ("command"), 0);
118       if (svp)
119         {
120           owning_elt = (HV *) SvRV (*svp);
121           svp = hv_fetch (owning_elt, "extra", strlen ("extra"), 0);
122           if (svp)
123             command_extra = (HV *) SvRV (*svp);
124         }
125     }
126 
127   svp = hv_fetch (spaces_elt, "text", strlen ("text"), 0);
128   if (!svp)
129     return 0; /* or create it? change last arg from 0 to 1 */
130   existing_text_sv = *svp;
131 
132   /* Append the 'additional_spaces' argument. */
133   sv_utf8_upgrade (existing_text_sv);
134   sv_catpv (existing_text_sv, additional_spaces);
135 
136   if (!*SvPV_nolen (existing_text_sv)) /* existing text is empty */
137     {
138       /* Remove spaces_elt */
139       av_pop (contents_array);
140     }
141   else if (!strcmp (type, "empty_line"))
142     {
143       char *current_type;
144       AV *context_stack;
145       SV *top_context_sv;
146       char *top_context;
147       int top_index;
148 
149       svp = hv_fetch (current, "type", strlen ("type"), 0);
150       if (!svp)
151         current_type = 0;
152       else
153         current_type = SvPV_nolen (*svp);
154 
155       /* "Types with paragraphs".  Remove the type unless we are inside
156          one of these types. */
157       if (current_type
158           && strcmp (current_type, "before_item")
159           && strcmp (current_type, "text_root")
160           && strcmp (current_type, "document_root")
161           && strcmp (current_type, "brace_command_context"))
162         goto delete_type;
163 
164       /* Check the context stack. */
165       svp = hv_fetch (self, "context_stack", strlen ("context_stack"), 0);
166       if (!svp)
167         goto delete_type; /* shouldn't happen */
168       context_stack = (AV *) SvRV (*svp);
169       top_index = av_len (context_stack);
170       if (top_index < 0)
171         goto delete_type; /* shouldn't happen */
172       svp = av_fetch (context_stack, top_index, 0);
173       if (!svp)
174         goto delete_type; /* shouldn't happen */
175       top_context_sv = *svp;
176       top_context = SvPV_nolen (top_context_sv);
177 
178       /* Change type to "empty_spaces_before_paragraph" unless we are in
179          one of these contexts. */
180       if (strcmp (top_context, "math")
181           && strcmp (top_context, "menu")
182           && strcmp (top_context, "preformatted")
183           && strcmp (top_context, "rawpreformatted")
184           && strcmp (top_context, "def")
185           && strcmp (top_context, "inlineraw"))
186         {
187           hv_store (spaces_elt, "type", strlen ("type"),
188                     newSVpv ("empty_spaces_before_paragraph", 0), 0);
189         }
190       else
191         {
192 delete_type:
193           hv_delete (spaces_elt, "type", strlen ("type"), G_DISCARD);
194         }
195     }
196   else if (!strcmp (type, "empty_line_after_command")
197            || !strcmp (type, "empty_spaces_before_argument"))
198     {
199       STRLEN len;
200       char *ptr;
201 
202       if (owning_elt)
203         {
204           /* Remove spaces_elt */
205           av_pop (contents_array);
206 
207           ptr = SvPV(existing_text_sv, len);
208           /* Replace element reference with a simple string. */
209           if (!command_extra)
210             {
211               command_extra = newHV ();
212               hv_store (owning_elt, "extra", strlen ("extra"),
213                         newRV_inc((SV *)command_extra), 0);
214             }
215           hv_store (command_extra,
216                     "spaces_before_argument",
217                     strlen ("spaces_before_argument"),
218                     newSVpv(ptr, len),
219                     0);
220         }
221       else
222         {
223           hv_store (spaces_elt, "type", strlen ("type"),
224                     newSVpv ("empty_spaces_after_command", 0), 0);
225 
226         }
227     }
228   return 1;
229 }
230 
231 HV *
xs_merge_text(HV * self,HV * current,SV * text_in)232 xs_merge_text (HV *self, HV *current, SV *text_in)
233 {
234   AV *contents_array;
235 
236   int no_merge_with_following_text = 0;
237   char *text;
238   int leading_spaces;
239   SV *leading_spaces_sv = 0;
240   int call_ret;
241   SV *returned_sv;
242 
243   SV *contents_ref;
244   int contents_num;
245   HV *last_elt;
246   SV *existing_text_sv;
247   char *existing_text;
248   SV **svp;
249 
250   dTHX;
251 
252   dSP;
253 
254   /* Get text in UTF-8. */
255   {
256     STRLEN len;
257     static char *new_string;
258     text = SvPV (text_in, len);
259     if (!SvUTF8 (text_in))
260       {
261         Safefree (new_string);
262         new_string = bytes_to_utf8 (text, &len);
263         text = new_string;
264       }
265   }
266 
267   leading_spaces = strspn (text, whitespace_chars);
268   if (text[leading_spaces])
269     {
270       int contents_num;
271 
272       if (leading_spaces > 0)
273         {
274           leading_spaces_sv = newSVpv (text, leading_spaces);
275         }
276 
277       svp = hv_fetch (current,
278                       "contents", strlen ("contents"), 0);
279       contents_array = (AV *)SvRV(*svp);
280 
281       contents_num = av_len(contents_array) + 1;
282       if (contents_num > 0)
283         {
284           HV *last_elt;
285           char *type = 0;
286 
287           last_elt = (HV *)
288             SvRV (*av_fetch (contents_array, contents_num - 1, 0));
289 
290           svp = hv_fetch (last_elt, "type", strlen ("type"), 0);
291           if (svp)
292             type = SvPV_nolen (*svp);
293           if (type
294               && (!strcmp (type, "empty_line_after_command")
295                   || !strcmp (type, "empty_spaces_after_command")
296                   || !strcmp (type, "empty_spaces_before_argument")
297                   || !strcmp (type, "empty_spaces_after_close_brace")))
298             {
299               no_merge_with_following_text = 1;
300             }
301         }
302 
303       if (xs_abort_empty_line(self, current, leading_spaces_sv))
304         {
305           text += leading_spaces;
306         }
307 
308       /************************/
309       /* See 'perlcall' man page. */
310       ENTER;
311       SAVETMPS;
312 
313 
314       PUSHMARK(SP);
315       XPUSHs(sv_2mortal(newRV_inc((SV *)self)));
316       XPUSHs(sv_2mortal(newRV_inc((SV *)current)));
317       PUTBACK;
318 
319       call_ret = call_pv ("Texinfo::Parser::_begin_paragraph", G_SCALAR);
320 
321       SPAGAIN;
322 
323       returned_sv = POPs;
324 
325       /************************/
326 
327       if (returned_sv && SvRV(returned_sv))
328         {
329           current = (HV *)SvRV(returned_sv);
330         }
331 
332       FREETMPS;
333       LEAVE;
334     }
335 
336   svp = hv_fetch (current, "contents", strlen ("contents"), 0);
337   if (!svp)
338     {
339       contents_array = newAV ();
340       contents_ref = newRV_inc ((SV *) contents_array);
341       hv_store (current, "contents", strlen ("contents"),
342                 contents_ref, 0);
343       fprintf (stderr, "NEW CONTENTS %p\n", contents_array);
344       goto NEW_TEXT;
345     }
346   else
347     {
348       contents_ref = *svp;
349       contents_array = (AV *)SvRV(contents_ref);
350     }
351 
352   if (no_merge_with_following_text)
353     goto NEW_TEXT;
354 
355   contents_num = av_len(contents_array) + 1;
356   if (contents_num == 0)
357     goto NEW_TEXT;
358 
359   last_elt = (HV *)
360     SvRV (*av_fetch (contents_array, contents_num - 1, 0));
361   svp = hv_fetch (last_elt, "text", strlen ("text"), 0);
362   if (!svp)
363     goto NEW_TEXT;
364   existing_text_sv = *svp;
365   existing_text = SvPV_nolen (existing_text_sv);
366   if (strchr (existing_text, '\n'))
367     goto NEW_TEXT;
368 
369 MERGED_TEXT:
370   sv_catpv (existing_text_sv, text);
371   //fprintf (stderr, "MERGED TEXT: %s|||\n", text);
372 
373   if (0)
374     {
375       HV *hv;
376       SV *sv;
377 NEW_TEXT:
378       hv = newHV ();
379       sv = newSVpv (text, 0);
380       hv_store (hv, "text", strlen ("text"), sv, 0);
381       SvUTF8_on (sv);
382       hv_store (hv, "parent", strlen ("parent"),
383                 newRV_inc ((SV *)current), 0);
384       av_push (contents_array, newRV_inc ((SV *)hv));
385       //fprintf (stderr, "NEW TEXT: %s|||\n", text);
386     }
387 
388   return current;
389 }
390 
/* Return a copy of TEXT in which "---" has become "--", "--" has become
   "-", "``" and "''" have become a double quote, and a lone "`" has
   become "'".  The result lives in a static buffer that is reused by
   the next call. */
char *
xs_process_text (char *text)
{
  static char *new;
  const char *in;
  char *out;

  dTHX;

  new = realloc (new, strlen (text) + 1);
  strcpy (new, text);

  /* Rewrite the copy in place.  The output never gets longer, so the
     write position never overtakes the read position. */
  in = new;
  out = new;
  while (*in)
    {
      size_t consumed = 1;

      if (in[0] == '-' && in[1] == '-')
        {
          consumed = 2;
          if (in[2] == '-')
            {
              /* Three hyphens leave two behind. */
              *out++ = '-';
              consumed = 3;
            }
          *out++ = '-';
        }
      else if (in[0] == '\'' && in[1] == '\'')
        {
          *out++ = '"';
          consumed = 2;
        }
      else if (in[0] == '`')
        {
          if (in[1] == '`')
            {
              *out++ = '"';
              consumed = 2;
            }
          else
            *out++ = '\'';
        }
      else
        *out++ = in[0];

      in += consumed;
    }
  *out = '\0';

  return new;
}
445 
/* Return TEXT with ASCII dashes and quotes replaced by UTF-8 encoded
   Unicode punctuation:
     "---" -> U+2014 em dash        "--" -> U+2013 en dash
     "``"  -> U+201C                "''" -> U+201D
     "`"   -> U+2018                "'"  -> U+2019
   A single "-" is kept as it is.

   If IN_CODE is non-zero, TEXT itself is returned unchanged.  Otherwise
   the result is in a static buffer that is reused by the next call, so
   TEXT must not point into an earlier result.  A null pointer is
   returned if the buffer cannot be allocated. */
char *
xs_unicode_text (char *text, int in_code)
{
  static char *new;
  const char *p;
  char *out, *resized;
  size_t text_len;

  dTHX; /* Perl boilerplate. */

  if (in_code)
    return text;

  /* Every replacement is three bytes long and consumes at least one
     input byte, so three times the input length is always enough and
     the buffer never has to grow while it is being filled. */
  text_len = strlen (text);
  if (text_len > ((size_t) -1 - 1) / 3)
    return 0;
  resized = realloc (new, 3 * text_len + 1);
  if (!resized)
    return 0;
  new = resized;

  p = text;
  out = new;
  while (1)
    {
      const char *replacement = 0;
      size_t plain = strcspn (p, "-`'");

      /* Copy the run of ordinary bytes in one go. */
      memcpy (out, p, plain);
      out += plain;
      p += plain;
      if (!*p)
        break;

      /* The following bytes are looked at one at a time, and only after
         the previous one matched, so nothing past the terminating null
         byte is ever read. */
      switch (*p)
        {
        case '-':
          if (p[1] == '-' && p[2] == '-')
            {
              /* Unicode em dash U+2014 (0xE2 0x80 0x94) */
              replacement = "\xE2\x80\x94";
              p += 3;
            }
          else if (p[1] == '-')
            {
              /* Unicode en dash U+2013 (0xE2 0x80 0x93) */
              replacement = "\xE2\x80\x93";
              p += 2;
            }
          else
            {
              *out++ = *p++;
            }
          break;
        case '`':
          if (p[1] == '`')
            {
              /* U+201C E2 80 9C */
              replacement = "\xE2\x80\x9C";
              p += 2;
            }
          else
            {
              /* U+2018 E2 80 98 */
              replacement = "\xE2\x80\x98";
              p += 1;
            }
          break;
        default: /* '\'' -- the only other byte strcspn stops at */
          if (p[1] == '\'')
            {
              /* U+201D E2 80 9D */
              replacement = "\xE2\x80\x9D";
              p += 2;
            }
          else
            {
              /* U+2019 E2 80 99 */
              replacement = "\xE2\x80\x99";
              p += 1;
            }
          break;
        }

      if (replacement)
        {
          memcpy (out, replacement, 3);
          out += 3;
        }
    }

  *out = '\0';
  return new;
}
547 
548 /* Return list ($at_command, $open_brace, $asterisk, $single_letter_command,
549        $separator_match) */
xs_parse_texi_regex(SV * text_in,char ** at_command,char ** open_brace,char ** asterisk,char ** single_letter_command,char ** separator_match,char ** new_text)550 void xs_parse_texi_regex (SV *text_in,
551                           char **at_command,
552                           char **open_brace,
553                           char **asterisk,
554                           char **single_letter_command,
555                           char **separator_match,
556                           char **new_text)
557 {
558   char *text;
559 
560   dTHX;
561 
562   /* Make sure the input is in UTF8. */
563   if (!SvUTF8 (text_in))
564     sv_utf8_upgrade (text_in);
565   text = SvPV_nolen (text_in);
566 
567   *at_command = *open_brace = *asterisk = *single_letter_command
568           = *separator_match = *new_text = 0;
569 
570   if (*text == '@' && isalnum(text[1]))
571     {
572       char *p, *q;
573       static char *s;
574 
575       p = text + 1;
576       q = text + 2;
577       while (isalnum (*q) || *q == '-' || *q == '_')
578         q++;
579 
580       s = realloc (s, q - p + 1);
581       memcpy (s, p, q - p);
582       s[q - p] = '\0';
583       *at_command = s;
584     }
585   else
586     {
587       if (*text == '{')
588         {
589           *open_brace = "{";
590           *separator_match = "{";
591         }
592 
593       else if (*text == '@'
594                  && text[1] && strchr ("([\"'~@&}{,.!?"
595                                        " \t\n"
596                                        "*-^`=:|/\\",
597                                        text[1]))
598         {
599           static char a[2];
600           *single_letter_command = a;
601           a[0] = text[1];
602           a[1] = '\0';
603         }
604 
605       else if (strchr ("{}@,:\t.\f", *text))
606         {
607           static char a[2];
608           *separator_match = a;
609           a[0] = *text;
610           a[1] = '\0';
611         }
612 
613       else
614         {
615           char *p;
616 
617           if (*text == '*')
618             *asterisk = "*";
619 
620           p = text;
621           p += strcspn (p, "{}@,:\t.\n\f");
622           if (p > text)
623             {
624               static char *s;
625               s = realloc (s, p - text + 1);
626               memcpy (s, text, p - text);
627               s[p - text] = '\0';
628               *new_text = s;
629             }
630         }
631     }
632 
633   return;
634 }
635