1/* Copyright 2010-2019 Free Software Foundation, Inc. 2 3 This program is free software: you can redistribute it and/or modify 4 it under the terms of the GNU General Public License as published by 5 the Free Software Foundation, either version 3 of the License, or 6 (at your option) any later version. 7 8 This program is distributed in the hope that it will be useful, 9 but WITHOUT ANY WARRANTY; without even the implied warranty of 10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 GNU General Public License for more details. 12 13 You should have received a copy of the GNU General Public License 14 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 15 16#ifdef HAVE_CONFIG_H 17 #include <config.h> 18#endif 19#include <stdlib.h> 20#include <stdio.h> 21#include <string.h> 22#include <locale.h> 23#ifndef _WIN32 24#include <langinfo.h> 25#else /* _WIN32 */ 26/* Workaround for problems caused in mingw.org's MinGW build by 27 Gnulib's wchar.h overriding the wint_t type definition, which 28 causes compilation errors when perl.h is included below, because 29 perl.h includes ctype.h. */ 30#include <ctype.h> 31#endif 32#include <wchar.h> 33#include <wctype.h> 34 35/* See "How do I use all this in extensions" in 'man perlguts'. */ 36#define PERL_NO_GET_CONTEXT 37 38#include "EXTERN.h" 39#include "perl.h" 40#include "XSUB.h" 41 42#include "ppport.h" 43 44#include "miscxs.h" 45 46const char *whitespace_chars = " \t\f\v\r\n"; 47 48int 49xs_abort_empty_line (HV *self, HV *current, SV *additional_spaces_in) 50{ 51 char *additional_spaces; 52 AV *contents_array; 53 SV **svp; 54 int contents_num; 55 HV *spaces_elt; 56 //char *key; 57 HV *test_extra = 0; 58 HV *command_extra = 0; 59 60 HV *owning_elt = 0; 61 char *type; 62 SV *existing_text_sv; 63 64 dTHX; 65 66 /* Get additional text in UTF-8. */ 67 if (additional_spaces_in) 68 { 69 STRLEN len; 70 static char *new_string; 71 additional_spaces = SvPV (additional_spaces_in, len); 72 if (!SvUTF8 (additional_spaces_in)) 73 { 74 free (new_string); 75 new_string = bytes_to_utf8 (additional_spaces, &len); 76 additional_spaces = new_string; 77 } 78 } 79 else 80 additional_spaces = ""; 81 82 svp = hv_fetch (current, "contents", strlen("contents"), 0); 83 if (!svp) 84 return 0; 85 contents_array = (AV *)SvRV(*svp); 86 87 contents_num = av_len(contents_array) + 1; 88 if (contents_num == 0) 89 return 0; 90 91 spaces_elt = (HV *) SvRV (*av_fetch (contents_array, contents_num - 1, 0)); 92 93 svp = hv_fetch (spaces_elt, "type", strlen ("type"), 0); 94 if (!svp) 95 return 0; 96 97 type = SvPV_nolen (*svp); 98 if (!type) 99 return 0; 100 101 /* Must be one of these types to continue. */ 102 if (strcmp (type, "empty_line") 103 && strcmp (type, "empty_line_after_command") 104 && strcmp (type, "empty_spaces_before_argument") 105 && strcmp (type, "empty_spaces_after_close_brace")) 106 { 107 return 0; 108 } 109 110 //fprintf (stderr, "ABORT EMPTY\n"); 111 112 svp = hv_fetch (spaces_elt, "extra", strlen ("extra"), 0); 113 if (svp) 114 { 115 test_extra = (HV *) SvRV (*svp); 116 svp = hv_fetch (test_extra, "command", 117 strlen ("command"), 0); 118 if (svp) 119 { 120 owning_elt = (HV *) SvRV (*svp); 121 svp = hv_fetch (owning_elt, "extra", strlen ("extra"), 0); 122 if (svp) 123 command_extra = (HV *) SvRV (*svp); 124 } 125 } 126 127 svp = hv_fetch (spaces_elt, "text", strlen ("text"), 0); 128 if (!svp) 129 return 0; /* or create it? change last arg from 0 to 1 */ 130 existing_text_sv = *svp; 131 132 /* Append the 'additional_spaces' argument. */ 133 sv_utf8_upgrade (existing_text_sv); 134 sv_catpv (existing_text_sv, additional_spaces); 135 136 if (!*SvPV_nolen (existing_text_sv)) /* existing text is empty */ 137 { 138 /* Remove spaces_elt */ 139 av_pop (contents_array); 140 } 141 else if (!strcmp (type, "empty_line")) 142 { 143 char *current_type; 144 AV *context_stack; 145 SV *top_context_sv; 146 char *top_context; 147 int top_index; 148 149 svp = hv_fetch (current, "type", strlen ("type"), 0); 150 if (!svp) 151 current_type = 0; 152 else 153 current_type = SvPV_nolen (*svp); 154 155 /* "Types with paragraphs". Remove the type unless we are inside 156 one of these types. */ 157 if (current_type 158 && strcmp (current_type, "before_item") 159 && strcmp (current_type, "text_root") 160 && strcmp (current_type, "document_root") 161 && strcmp (current_type, "brace_command_context")) 162 goto delete_type; 163 164 /* Check the context stack. */ 165 svp = hv_fetch (self, "context_stack", strlen ("context_stack"), 0); 166 if (!svp) 167 goto delete_type; /* shouldn't happen */ 168 context_stack = (AV *) SvRV (*svp); 169 top_index = av_len (context_stack); 170 if (top_index < 0) 171 goto delete_type; /* shouldn't happen */ 172 svp = av_fetch (context_stack, top_index, 0); 173 if (!svp) 174 goto delete_type; /* shouldn't happen */ 175 top_context_sv = *svp; 176 top_context = SvPV_nolen (top_context_sv); 177 178 /* Change type to "empty_spaces_before_paragraph" unless we are in 179 one of these contexts. */ 180 if (strcmp (top_context, "math") 181 && strcmp (top_context, "menu") 182 && strcmp (top_context, "preformatted") 183 && strcmp (top_context, "rawpreformatted") 184 && strcmp (top_context, "def") 185 && strcmp (top_context, "inlineraw")) 186 { 187 hv_store (spaces_elt, "type", strlen ("type"), 188 newSVpv ("empty_spaces_before_paragraph", 0), 0); 189 } 190 else 191 { 192delete_type: 193 hv_delete (spaces_elt, "type", strlen ("type"), G_DISCARD); 194 } 195 } 196 else if (!strcmp (type, "empty_line_after_command") 197 || !strcmp (type, "empty_spaces_before_argument")) 198 { 199 STRLEN len; 200 char *ptr; 201 202 if (owning_elt) 203 { 204 /* Remove spaces_elt */ 205 av_pop (contents_array); 206 207 ptr = SvPV(existing_text_sv, len); 208 /* Replace element reference with a simple string. */ 209 if (!command_extra) 210 { 211 command_extra = newHV (); 212 hv_store (owning_elt, "extra", strlen ("extra"), 213 newRV_inc((SV *)command_extra), 0); 214 } 215 hv_store (command_extra, 216 "spaces_before_argument", 217 strlen ("spaces_before_argument"), 218 newSVpv(ptr, len), 219 0); 220 } 221 else 222 { 223 hv_store (spaces_elt, "type", strlen ("type"), 224 newSVpv ("empty_spaces_after_command", 0), 0); 225 226 } 227 } 228 return 1; 229} 230 231HV * 232xs_merge_text (HV *self, HV *current, SV *text_in) 233{ 234 AV *contents_array; 235 236 int no_merge_with_following_text = 0; 237 char *text; 238 int leading_spaces; 239 SV *leading_spaces_sv = 0; 240 int call_ret; 241 SV *returned_sv; 242 243 SV *contents_ref; 244 int contents_num; 245 HV *last_elt; 246 SV *existing_text_sv; 247 char *existing_text; 248 SV **svp; 249 250 dTHX; 251 252 dSP; 253 254 /* Get text in UTF-8. */ 255 { 256 STRLEN len; 257 static char *new_string; 258 text = SvPV (text_in, len); 259 if (!SvUTF8 (text_in)) 260 { 261 free (new_string); 262 new_string = bytes_to_utf8 (text, &len); 263 text = new_string; 264 } 265 } 266 267 leading_spaces = strspn (text, whitespace_chars); 268 if (text[leading_spaces]) 269 { 270 int contents_num; 271 272 if (leading_spaces > 0) 273 { 274 leading_spaces_sv = newSVpv (text, leading_spaces); 275 } 276 277 svp = hv_fetch (current, 278 "contents", strlen ("contents"), 0); 279 contents_array = (AV *)SvRV(*svp); 280 281 contents_num = av_len(contents_array) + 1; 282 if (contents_num > 0) 283 { 284 HV *last_elt; 285 char *type = 0; 286 287 last_elt = (HV *) 288 SvRV (*av_fetch (contents_array, contents_num - 1, 0)); 289 290 svp = hv_fetch (last_elt, "type", strlen ("type"), 0); 291 if (svp) 292 type = SvPV_nolen (*svp); 293 if (type 294 && (!strcmp (type, "empty_line_after_command") 295 || !strcmp (type, "empty_spaces_after_command") 296 || !strcmp (type, "empty_spaces_before_argument") 297 || !strcmp (type, "empty_spaces_after_close_brace"))) 298 { 299 no_merge_with_following_text = 1; 300 } 301 } 302 303 if (xs_abort_empty_line(self, current, leading_spaces_sv)) 304 { 305 text += leading_spaces; 306 } 307 308 /************************/ 309 /* See 'perlcall' man page. */ 310 ENTER; 311 SAVETMPS; 312 313 314 PUSHMARK(SP); 315 XPUSHs(sv_2mortal(newRV_inc((SV *)self))); 316 XPUSHs(sv_2mortal(newRV_inc((SV *)current))); 317 PUTBACK; 318 319 call_ret = call_pv ("Texinfo::Parser::_begin_paragraph", G_SCALAR); 320 321 SPAGAIN; 322 323 returned_sv = POPs; 324 325 /************************/ 326 327 if (returned_sv && SvRV(returned_sv)) 328 { 329 current = (HV *)SvRV(returned_sv); 330 } 331 332 FREETMPS; 333 LEAVE; 334 } 335 336 svp = hv_fetch (current, "contents", strlen ("contents"), 0); 337 if (!svp) 338 { 339 contents_array = newAV (); 340 contents_ref = newRV_inc ((SV *) contents_array); 341 hv_store (current, "contents", strlen ("contents"), 342 contents_ref, 0); 343 fprintf (stderr, "NEW CONTENTS %p\n", contents_array); 344 goto NEW_TEXT; 345 } 346 else 347 { 348 contents_ref = *svp; 349 contents_array = (AV *)SvRV(contents_ref); 350 } 351 352 if (no_merge_with_following_text) 353 goto NEW_TEXT; 354 355 contents_num = av_len(contents_array) + 1; 356 if (contents_num == 0) 357 goto NEW_TEXT; 358 359 last_elt = (HV *) 360 SvRV (*av_fetch (contents_array, contents_num - 1, 0)); 361 svp = hv_fetch (last_elt, "text", strlen ("text"), 0); 362 if (!svp) 363 goto NEW_TEXT; 364 existing_text_sv = *svp; 365 existing_text = SvPV_nolen (existing_text_sv); 366 if (strchr (existing_text, '\n')) 367 goto NEW_TEXT; 368 369MERGED_TEXT: 370 sv_catpv (existing_text_sv, text); 371 //fprintf (stderr, "MERGED TEXT: %s|||\n", text); 372 373 if (0) 374 { 375 HV *hv; 376 SV *sv; 377NEW_TEXT: 378 hv = newHV (); 379 sv = newSVpv (text, 0); 380 hv_store (hv, "text", strlen ("text"), sv, 0); 381 SvUTF8_on (sv); 382 hv_store (hv, "parent", strlen ("parent"), 383 newRV_inc ((SV *)current), 0); 384 av_push (contents_array, newRV_inc ((SV *)hv)); 385 //fprintf (stderr, "NEW TEXT: %s|||\n", text); 386 } 387 388 return current; 389} 390 391char * 392xs_process_text (char *text) 393{ 394 static char *new; 395 char *p, *q; 396 397 dTHX; 398 399 new = realloc (new, strlen (text) + 1); 400 strcpy (new, text); 401 402 p = q = new; 403 while (*p) 404 { 405 if (*p == '-' && p[1] == '-') 406 { 407 if (p[2] == '-') 408 { 409 *q = '-'; q[1] = '-'; 410 p += 3; q += 2; 411 } 412 else 413 { 414 *q = '-'; 415 p += 2; q += 1; 416 } 417 } 418 else if (*p == '\'' && p[1] == '\'') 419 { 420 *q = '"'; 421 p += 2; q += 1; 422 } 423 else if (*p == '`') 424 { 425 if (p[1] == '`') 426 { 427 *q = '"'; 428 p += 2; q += 1; 429 } 430 else 431 { 432 *q = '\''; 433 p += 1; q += 1; 434 } 435 } 436 else 437 { 438 *q++ = *p++; 439 } 440 } 441 *q = '\0'; 442 443 return new; 444} 445 446char * 447xs_unicode_text (char *text, int in_code) 448{ 449 char *p, *q; 450 static char *new; 451 int new_space, new_len; 452 453 dTHX; /* Perl boilerplate. */ 454 455 if (in_code) 456 return text; 457 458 p = text; 459 new_space = strlen (text); 460 new = realloc (new, new_space + 1); 461 new_len = 0; 462#define ADD3(s) \ 463 if (new_len + 2 >= new_space - 1) \ 464 { \ 465 new_space += 2; \ 466 new = realloc (new, new_space *= 2); \ 467 } \ 468 new[new_len++] = s[0]; \ 469 new[new_len++] = s[1]; \ 470 new[new_len++] = s[2]; 471 472#define ADD1(s) \ 473 if (new_len >= new_space - 1) \ 474 new = realloc (new, (new_space *= 2) + 1); \ 475 new[new_len++] = s; 476 477#define ADDN(s, n) \ 478 if (new_len + n - 1 >= new_space - 1) \ 479 { \ 480 new_space += n; \ 481 new = realloc (new, (new_space *= 2) + 1); \ 482 } \ 483 memcpy(new + new_len, s, n); \ 484 new_len += n; 485 486 while (1) 487 { 488 q = p + strcspn (p, "-`'"); 489 ADDN(p, q - p); 490 if (!*q) 491 break; 492 switch (*q) 493 { 494 case '-': 495 if (!memcmp (q, "---", 3)) 496 { 497 p = q + 3; 498 /* Unicode em dash U+2014 (0xE2 0x80 0x94) */ 499 ADD3("\xE2\x80\x94"); 500 } 501 else if (!memcmp (q, "--", 2)) 502 { 503 p = q + 2; 504 /* Unicode en dash U+2013 (0xE2 0x80 0x93) */ 505 ADD3("\xE2\x80\x93"); 506 } 507 else 508 { 509 p = q + 1; 510 ADD1(*q); 511 } 512 break; 513 case '`': 514 if (!memcmp (q, "``", 2)) 515 { 516 p = q + 2; 517 /* U+201C E2 80 9C */ 518 ADD3("\xE2\x80\x9C"); 519 } 520 else 521 { 522 p = q + 1; 523 /* U+2018 E2 80 98 */ 524 ADD3("\xE2\x80\x98"); 525 } 526 break; 527 case '\'': 528 if (!memcmp (q, "''", 2)) 529 { 530 p = q + 2; 531 /* U+201D E2 80 9D */ 532 ADD3("\xE2\x80\x9D"); 533 } 534 else 535 { 536 p = q + 1; 537 /* U+2019 E2 80 99 */ 538 ADD3("\xE2\x80\x99"); 539 } 540 break; 541 } 542 } 543 544 new[new_len] = '\0'; 545 return new; 546} 547 548/* Return list ($at_command, $open_brace, $asterisk, $single_letter_command, 549 $separator_match) */ 550void xs_parse_texi_regex (SV *text_in, 551 char **at_command, 552 char **open_brace, 553 char **asterisk, 554 char **single_letter_command, 555 char **separator_match, 556 char **new_text) 557{ 558 char *text; 559 560 dTHX; 561 562 /* Make sure the input is in UTF8. */ 563 if (!SvUTF8 (text_in)) 564 sv_utf8_upgrade (text_in); 565 text = SvPV_nolen (text_in); 566 567 *at_command = *open_brace = *asterisk = *single_letter_command 568 = *separator_match = *new_text = 0; 569 570 if (*text == '@' && isalnum(text[1])) 571 { 572 char *p, *q; 573 static char *s; 574 575 p = text + 1; 576 q = text + 2; 577 while (isalnum (*q) || *q == '-' || *q == '_') 578 q++; 579 580 s = realloc (s, q - p + 1); 581 memcpy (s, p, q - p); 582 s[q - p] = '\0'; 583 *at_command = s; 584 } 585 else 586 { 587 if (*text == '{') 588 { 589 *open_brace = "{"; 590 *separator_match = "{"; 591 } 592 593 else if (*text == '@' 594 && text[1] && strchr ("([\"'~@&}{,.!?" 595 " \t\n" 596 "*-^`=:|/\\", 597 text[1])) 598 { 599 static char a[2]; 600 *single_letter_command = a; 601 a[0] = text[1]; 602 a[1] = '\0'; 603 } 604 605 else if (strchr ("{}@,:\t.\f", *text)) 606 { 607 static char a[2]; 608 *separator_match = a; 609 a[0] = *text; 610 a[1] = '\0'; 611 } 612 613 else 614 { 615 char *p; 616 617 if (*text == '*') 618 *asterisk = "*"; 619 620 p = text; 621 p += strcspn (p, "{}@,:\t.\n\f"); 622 if (p > text) 623 { 624 static char *s; 625 s = realloc (s, p - text + 1); 626 memcpy (s, text, p - text); 627 s[p - text] = '\0'; 628 *new_text = s; 629 } 630 } 631 } 632 633 return; 634} 635