1/* Copyright 2010-2019 Free Software Foundation, Inc.
2
3   This program is free software: you can redistribute it and/or modify
4   it under the terms of the GNU General Public License as published by
5   the Free Software Foundation, either version 3 of the License, or
6   (at your option) any later version.
7
8   This program is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11   GNU General Public License for more details.
12
13   You should have received a copy of the GNU General Public License
14   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
15
16#ifdef HAVE_CONFIG_H
17  #include <config.h>
18#endif
19#include <stdlib.h>
20#include <stdio.h>
21#include <string.h>
22#include <locale.h>
23#ifndef _WIN32
24#include <langinfo.h>
25#else  /* _WIN32 */
26/* Workaround for problems caused in mingw.org's MinGW build by
27   Gnulib's wchar.h overriding the wint_t type definition, which
28   causes compilation errors when perl.h is included below, because
29   perl.h includes ctype.h.  */
30#include <ctype.h>
31#endif
32#include <wchar.h>
33#include <wctype.h>
34
35/* See "How do I use all this in extensions" in 'man perlguts'. */
36#define PERL_NO_GET_CONTEXT
37
38#include "EXTERN.h"
39#include "perl.h"
40#include "XSUB.h"
41
42#include "ppport.h"
43
44#include "miscxs.h"
45
46const char *whitespace_chars = " \t\f\v\r\n";
47
48int
49xs_abort_empty_line (HV *self, HV *current, SV *additional_spaces_in)
50{
51  char *additional_spaces;
52  AV *contents_array;
53  SV **svp;
54  int contents_num;
55  HV *spaces_elt;
56  //char *key;
57  HV *test_extra = 0;
58  HV *command_extra = 0;
59
60  HV *owning_elt = 0;
61  char *type;
62  SV *existing_text_sv;
63
64  dTHX;
65
66  /* Get additional text in UTF-8. */
67  if (additional_spaces_in)
68    {
69      STRLEN len;
70      static char *new_string;
71      additional_spaces = SvPV (additional_spaces_in, len);
72      if (!SvUTF8 (additional_spaces_in))
73        {
74          free (new_string);
75          new_string = bytes_to_utf8 (additional_spaces, &len);
76          additional_spaces = new_string;
77        }
78    }
79  else
80    additional_spaces = "";
81
82  svp = hv_fetch (current, "contents", strlen("contents"), 0);
83  if (!svp)
84    return 0;
85  contents_array = (AV *)SvRV(*svp);
86
87  contents_num = av_len(contents_array) + 1;
88  if (contents_num == 0)
89    return 0;
90
91  spaces_elt = (HV *) SvRV (*av_fetch (contents_array, contents_num - 1, 0));
92
93  svp = hv_fetch (spaces_elt, "type", strlen ("type"), 0);
94  if (!svp)
95    return 0;
96
97  type = SvPV_nolen (*svp);
98  if (!type)
99    return 0;
100
101  /* Must be one of these types to continue. */
102  if (strcmp (type, "empty_line")
103       && strcmp (type, "empty_line_after_command")
104       && strcmp (type, "empty_spaces_before_argument")
105       && strcmp (type, "empty_spaces_after_close_brace"))
106    {
107      return 0;
108    }
109
110  //fprintf (stderr, "ABORT EMPTY\n");
111
112  svp = hv_fetch (spaces_elt, "extra", strlen ("extra"), 0);
113  if (svp)
114    {
115      test_extra = (HV *) SvRV (*svp);
116      svp = hv_fetch (test_extra, "command",
117                      strlen ("command"), 0);
118      if (svp)
119        {
120          owning_elt = (HV *) SvRV (*svp);
121          svp = hv_fetch (owning_elt, "extra", strlen ("extra"), 0);
122          if (svp)
123            command_extra = (HV *) SvRV (*svp);
124        }
125    }
126
127  svp = hv_fetch (spaces_elt, "text", strlen ("text"), 0);
128  if (!svp)
129    return 0; /* or create it? change last arg from 0 to 1 */
130  existing_text_sv = *svp;
131
132  /* Append the 'additional_spaces' argument. */
133  sv_utf8_upgrade (existing_text_sv);
134  sv_catpv (existing_text_sv, additional_spaces);
135
136  if (!*SvPV_nolen (existing_text_sv)) /* existing text is empty */
137    {
138      /* Remove spaces_elt */
139      av_pop (contents_array);
140    }
141  else if (!strcmp (type, "empty_line"))
142    {
143      char *current_type;
144      AV *context_stack;
145      SV *top_context_sv;
146      char *top_context;
147      int top_index;
148
149      svp = hv_fetch (current, "type", strlen ("type"), 0);
150      if (!svp)
151        current_type = 0;
152      else
153        current_type = SvPV_nolen (*svp);
154
155      /* "Types with paragraphs".  Remove the type unless we are inside
156         one of these types. */
157      if (current_type
158          && strcmp (current_type, "before_item")
159          && strcmp (current_type, "text_root")
160          && strcmp (current_type, "document_root")
161          && strcmp (current_type, "brace_command_context"))
162        goto delete_type;
163
164      /* Check the context stack. */
165      svp = hv_fetch (self, "context_stack", strlen ("context_stack"), 0);
166      if (!svp)
167        goto delete_type; /* shouldn't happen */
168      context_stack = (AV *) SvRV (*svp);
169      top_index = av_len (context_stack);
170      if (top_index < 0)
171        goto delete_type; /* shouldn't happen */
172      svp = av_fetch (context_stack, top_index, 0);
173      if (!svp)
174        goto delete_type; /* shouldn't happen */
175      top_context_sv = *svp;
176      top_context = SvPV_nolen (top_context_sv);
177
178      /* Change type to "empty_spaces_before_paragraph" unless we are in
179         one of these contexts. */
180      if (strcmp (top_context, "math")
181          && strcmp (top_context, "menu")
182          && strcmp (top_context, "preformatted")
183          && strcmp (top_context, "rawpreformatted")
184          && strcmp (top_context, "def")
185          && strcmp (top_context, "inlineraw"))
186        {
187          hv_store (spaces_elt, "type", strlen ("type"),
188                    newSVpv ("empty_spaces_before_paragraph", 0), 0);
189        }
190      else
191        {
192delete_type:
193          hv_delete (spaces_elt, "type", strlen ("type"), G_DISCARD);
194        }
195    }
196  else if (!strcmp (type, "empty_line_after_command")
197           || !strcmp (type, "empty_spaces_before_argument"))
198    {
199      STRLEN len;
200      char *ptr;
201
202      if (owning_elt)
203        {
204          /* Remove spaces_elt */
205          av_pop (contents_array);
206
207          ptr = SvPV(existing_text_sv, len);
208          /* Replace element reference with a simple string. */
209          if (!command_extra)
210            {
211              command_extra = newHV ();
212              hv_store (owning_elt, "extra", strlen ("extra"),
213                        newRV_inc((SV *)command_extra), 0);
214            }
215          hv_store (command_extra,
216                    "spaces_before_argument",
217                    strlen ("spaces_before_argument"),
218                    newSVpv(ptr, len),
219                    0);
220        }
221      else
222        {
223          hv_store (spaces_elt, "type", strlen ("type"),
224                    newSVpv ("empty_spaces_after_command", 0), 0);
225
226        }
227    }
228  return 1;
229}
230
231HV *
232xs_merge_text (HV *self, HV *current, SV *text_in)
233{
234  AV *contents_array;
235
236  int no_merge_with_following_text = 0;
237  char *text;
238  int leading_spaces;
239  SV *leading_spaces_sv = 0;
240  int call_ret;
241  SV *returned_sv;
242
243  SV *contents_ref;
244  int contents_num;
245  HV *last_elt;
246  SV *existing_text_sv;
247  char *existing_text;
248  SV **svp;
249
250  dTHX;
251
252  dSP;
253
254  /* Get text in UTF-8. */
255  {
256    STRLEN len;
257    static char *new_string;
258    text = SvPV (text_in, len);
259    if (!SvUTF8 (text_in))
260      {
261        free (new_string);
262        new_string = bytes_to_utf8 (text, &len);
263        text = new_string;
264      }
265  }
266
267  leading_spaces = strspn (text, whitespace_chars);
268  if (text[leading_spaces])
269    {
270      int contents_num;
271
272      if (leading_spaces > 0)
273        {
274          leading_spaces_sv = newSVpv (text, leading_spaces);
275        }
276
277      svp = hv_fetch (current,
278                      "contents", strlen ("contents"), 0);
279      contents_array = (AV *)SvRV(*svp);
280
281      contents_num = av_len(contents_array) + 1;
282      if (contents_num > 0)
283        {
284          HV *last_elt;
285          char *type = 0;
286
287          last_elt = (HV *)
288            SvRV (*av_fetch (contents_array, contents_num - 1, 0));
289
290          svp = hv_fetch (last_elt, "type", strlen ("type"), 0);
291          if (svp)
292            type = SvPV_nolen (*svp);
293          if (type
294              && (!strcmp (type, "empty_line_after_command")
295                  || !strcmp (type, "empty_spaces_after_command")
296                  || !strcmp (type, "empty_spaces_before_argument")
297                  || !strcmp (type, "empty_spaces_after_close_brace")))
298            {
299              no_merge_with_following_text = 1;
300            }
301        }
302
303      if (xs_abort_empty_line(self, current, leading_spaces_sv))
304        {
305          text += leading_spaces;
306        }
307
308      /************************/
309      /* See 'perlcall' man page. */
310      ENTER;
311      SAVETMPS;
312
313
314      PUSHMARK(SP);
315      XPUSHs(sv_2mortal(newRV_inc((SV *)self)));
316      XPUSHs(sv_2mortal(newRV_inc((SV *)current)));
317      PUTBACK;
318
319      call_ret = call_pv ("Texinfo::Parser::_begin_paragraph", G_SCALAR);
320
321      SPAGAIN;
322
323      returned_sv = POPs;
324
325      /************************/
326
327      if (returned_sv && SvRV(returned_sv))
328        {
329          current = (HV *)SvRV(returned_sv);
330        }
331
332      FREETMPS;
333      LEAVE;
334    }
335
336  svp = hv_fetch (current, "contents", strlen ("contents"), 0);
337  if (!svp)
338    {
339      contents_array = newAV ();
340      contents_ref = newRV_inc ((SV *) contents_array);
341      hv_store (current, "contents", strlen ("contents"),
342                contents_ref, 0);
343      fprintf (stderr, "NEW CONTENTS %p\n", contents_array);
344      goto NEW_TEXT;
345    }
346  else
347    {
348      contents_ref = *svp;
349      contents_array = (AV *)SvRV(contents_ref);
350    }
351
352  if (no_merge_with_following_text)
353    goto NEW_TEXT;
354
355  contents_num = av_len(contents_array) + 1;
356  if (contents_num == 0)
357    goto NEW_TEXT;
358
359  last_elt = (HV *)
360    SvRV (*av_fetch (contents_array, contents_num - 1, 0));
361  svp = hv_fetch (last_elt, "text", strlen ("text"), 0);
362  if (!svp)
363    goto NEW_TEXT;
364  existing_text_sv = *svp;
365  existing_text = SvPV_nolen (existing_text_sv);
366  if (strchr (existing_text, '\n'))
367    goto NEW_TEXT;
368
369MERGED_TEXT:
370  sv_catpv (existing_text_sv, text);
371  //fprintf (stderr, "MERGED TEXT: %s|||\n", text);
372
373  if (0)
374    {
375      HV *hv;
376      SV *sv;
377NEW_TEXT:
378      hv = newHV ();
379      sv = newSVpv (text, 0);
380      hv_store (hv, "text", strlen ("text"), sv, 0);
381      SvUTF8_on (sv);
382      hv_store (hv, "parent", strlen ("parent"),
383                newRV_inc ((SV *)current), 0);
384      av_push (contents_array, newRV_inc ((SV *)hv));
385      //fprintf (stderr, "NEW TEXT: %s|||\n", text);
386    }
387
388  return current;
389}
390
391char *
392xs_process_text (char *text)
393{
394  static char *new;
395  char *p, *q;
396
397  dTHX;
398
399  new = realloc (new, strlen (text) + 1);
400  strcpy (new, text);
401
402  p = q = new;
403  while (*p)
404    {
405      if (*p == '-' && p[1] == '-')
406        {
407          if (p[2] == '-')
408            {
409              *q = '-'; q[1] = '-';
410              p += 3; q += 2;
411            }
412          else
413            {
414              *q = '-';
415              p += 2; q += 1;
416            }
417        }
418      else if (*p == '\'' && p[1] == '\'')
419        {
420          *q = '"';
421          p += 2; q += 1;
422        }
423      else if (*p == '`')
424        {
425          if (p[1] == '`')
426            {
427              *q = '"';
428              p += 2; q += 1;
429            }
430          else
431            {
432              *q = '\'';
433              p += 1; q += 1;
434            }
435        }
436      else
437        {
438          *q++ = *p++;
439        }
440    }
441  *q = '\0';
442
443  return new;
444}
445
446char *
447xs_unicode_text (char *text, int in_code)
448{
449  char *p, *q;
450  static char *new;
451  int new_space, new_len;
452
453  dTHX; /* Perl boilerplate. */
454
455  if (in_code)
456    return text;
457
458  p = text;
459  new_space = strlen (text);
460  new = realloc (new, new_space + 1);
461  new_len = 0;
462#define ADD3(s) \
463  if (new_len + 2 >= new_space - 1)               \
464    {                                             \
465      new_space += 2;                             \
466      new = realloc (new, new_space *= 2);        \
467    }                                             \
468  new[new_len++] = s[0];                          \
469  new[new_len++] = s[1];                          \
470  new[new_len++] = s[2];
471
472#define ADD1(s) \
473  if (new_len >= new_space - 1)                   \
474    new = realloc (new, (new_space *= 2) + 1);    \
475  new[new_len++] = s;
476
477#define ADDN(s, n) \
478  if (new_len + n - 1 >= new_space - 1)           \
479    {                                             \
480      new_space += n;                             \
481      new = realloc (new, (new_space *= 2) + 1);  \
482    }                                             \
483  memcpy(new + new_len, s, n);                    \
484  new_len += n;
485
486  while (1)
487    {
488      q = p + strcspn (p, "-`'");
489      ADDN(p, q - p);
490      if (!*q)
491        break;
492      switch (*q)
493        {
494        case '-':
495          if (!memcmp (q, "---", 3))
496            {
497              p = q + 3;
498              /* Unicode em dash U+2014 (0xE2 0x80 0x94) */
499              ADD3("\xE2\x80\x94");
500            }
501          else if (!memcmp (q, "--", 2))
502            {
503              p = q + 2;
504              /* Unicode en dash U+2013 (0xE2 0x80 0x93) */
505              ADD3("\xE2\x80\x93");
506            }
507          else
508            {
509              p = q + 1;
510              ADD1(*q);
511            }
512          break;
513        case '`':
514          if (!memcmp (q, "``", 2))
515            {
516              p = q + 2;
517              /* U+201C E2 80 9C */
518              ADD3("\xE2\x80\x9C");
519            }
520          else
521            {
522              p = q + 1;
523              /* U+2018 E2 80 98 */
524              ADD3("\xE2\x80\x98");
525            }
526          break;
527        case '\'':
528          if (!memcmp (q, "''", 2))
529            {
530              p = q + 2;
531              /* U+201D E2 80 9D */
532              ADD3("\xE2\x80\x9D");
533            }
534          else
535            {
536              p = q + 1;
537              /* U+2019 E2 80 99 */
538              ADD3("\xE2\x80\x99");
539            }
540          break;
541        }
542    }
543
544  new[new_len] = '\0';
545  return new;
546}
547
548/* Return list ($at_command, $open_brace, $asterisk, $single_letter_command,
549       $separator_match) */
550void xs_parse_texi_regex (SV *text_in,
551                          char **at_command,
552                          char **open_brace,
553                          char **asterisk,
554                          char **single_letter_command,
555                          char **separator_match,
556                          char **new_text)
557{
558  char *text;
559
560  dTHX;
561
562  /* Make sure the input is in UTF8. */
563  if (!SvUTF8 (text_in))
564    sv_utf8_upgrade (text_in);
565  text = SvPV_nolen (text_in);
566
567  *at_command = *open_brace = *asterisk = *single_letter_command
568          = *separator_match = *new_text = 0;
569
570  if (*text == '@' && isalnum(text[1]))
571    {
572      char *p, *q;
573      static char *s;
574
575      p = text + 1;
576      q = text + 2;
577      while (isalnum (*q) || *q == '-' || *q == '_')
578        q++;
579
580      s = realloc (s, q - p + 1);
581      memcpy (s, p, q - p);
582      s[q - p] = '\0';
583      *at_command = s;
584    }
585  else
586    {
587      if (*text == '{')
588        {
589          *open_brace = "{";
590          *separator_match = "{";
591        }
592
593      else if (*text == '@'
594                 && text[1] && strchr ("([\"'~@&}{,.!?"
595                                       " \t\n"
596                                       "*-^`=:|/\\",
597                                       text[1]))
598        {
599          static char a[2];
600          *single_letter_command = a;
601          a[0] = text[1];
602          a[1] = '\0';
603        }
604
605      else if (strchr ("{}@,:\t.\f", *text))
606        {
607          static char a[2];
608          *separator_match = a;
609          a[0] = *text;
610          a[1] = '\0';
611        }
612
613      else
614        {
615          char *p;
616
617          if (*text == '*')
618            *asterisk = "*";
619
620          p = text;
621          p += strcspn (p, "{}@,:\t.\n\f");
622          if (p > text)
623            {
624              static char *s;
625              s = realloc (s, p - text + 1);
626              memcpy (s, text, p - text);
627              s[p - text] = '\0';
628              *new_text = s;
629            }
630        }
631    }
632
633  return;
634}
635