% texlang.w
%
% Copyright 2006-2012 Taco Hoekwater <taco@@luatex.org>
%
% This file is part of LuaTeX.
%
% LuaTeX is free software; you can redistribute it and/or modify it under
% the terms of the GNU General Public License as published by the Free
% Software Foundation; either version 2 of the License, or (at your
% option) any later version.
%
% LuaTeX is distributed in the hope that it will be useful, but WITHOUT
% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
% FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
% License for more details.
%
% You should have received a copy of the GNU General Public License along
% with LuaTeX; if not, see <http://www.gnu.org/licenses/>.

@ @c


#include "ptexlib.h"
#include <string.h>
#include "lua/luatex-api.h"

@ Low-level helpers

@ @c
#define noVERBOSE

/* number of slots in the language table; the last slot is never handed out */
#define MAX_TEX_LANGUAGES  16384

#define ex_hyphen_char int_par(ex_hyphen_char_code)

/* language records, indexed by language id; unused slots are |NULL| */
static struct tex_language *tex_languages[MAX_TEX_LANGUAGES] = { NULL };

/* one more than the highest id handed out so far; also the scan start for free slots */
static int next_lang_id = 0;

/* Create the language record for id |n|, or for the next free id when |n|
   is negative. Returns |NULL| when the id is out of range, when the slot is
   already taken, or when the table is full. */
struct tex_language *new_language(int n)
{
    struct tex_language *lang;
    unsigned l;
    if (n >= 0) {
        l = (unsigned) n;
        /* only ids that can really be stored may move the high-water mark,
           otherwise later scans over the table would run past its end */
        if (l < (MAX_TEX_LANGUAGES - 1) && next_lang_id <= n)
            next_lang_id = n + 1;
    } else {
        /* look for the first free slot, never leaving the table */
        while (next_lang_id < (MAX_TEX_LANGUAGES - 1)
               && tex_languages[next_lang_id] != NULL)
            next_lang_id++;
        if (next_lang_id >= (MAX_TEX_LANGUAGES - 1))
            return NULL;        /* no room left */
        l = (unsigned) next_lang_id++;
    }
    if (l < (MAX_TEX_LANGUAGES - 1) && tex_languages[l] == NULL) {
        lang = xmalloc(sizeof(struct tex_language));
        tex_languages[l] = lang;
        lang->id = (int) l;
        lang->exceptions = 0;
        lang->patterns = NULL;
        lang->pre_hyphen_char = '-';
        lang->post_hyphen_char = 0;
        lang->pre_exhyphen_char = 0;
        lang->post_exhyphen_char = 0;
        return lang;
    } else {
        return NULL;
    }
}

/* Return the record for language |n|, creating it on first use.
   Returns |NULL| when |n| is out of range or the record cannot be created. */
struct tex_language *get_language(int n)
{
    if (n < 0 || n >= MAX_TEX_LANGUAGES)
        return NULL;
    if (tex_languages[n] != NULL)
        return tex_languages[n];
    return new_language(n);
}

@ @c
/* The four setters below store |v| in language |n|; they do nothing when
   no record for |n| can be obtained. */
void set_pre_hyphen_char(int n, int v)
{
    struct tex_language *l = get_language(n);
    if (l != NULL)
        l->pre_hyphen_char = v;
}

void set_post_hyphen_char(int n, int v)
{
    struct tex_language *l = get_language(n);
    if (l != NULL)
        l->post_hyphen_char = v;
}


void set_pre_exhyphen_char(int n, int v)
{
    struct tex_language *l = get_language(n);
    if (l != NULL)
        l->pre_exhyphen_char = v;
}

void set_post_exhyphen_char(int n, int v)
{
    struct tex_language *l = get_language(n);
    if (l != NULL)
        l->post_exhyphen_char = v;
}


/* The four getters below return the stored value for language |n|, or -1
   when no record for |n| can be obtained. */
int get_pre_hyphen_char(int n)
{
    struct tex_language *l = get_language(n);
    if (l == NULL)
        return -1;
    return (int) l->pre_hyphen_char;
}

int get_post_hyphen_char(int n)
{
    struct tex_language *l = get_language(n);
    if (l == NULL)
        return -1;
    return (int) l->post_hyphen_char;
}


int get_pre_exhyphen_char(int n)
{
    struct tex_language *l = get_language(n);
    if (l == NULL)
        return -1;
    return (int) l->pre_exhyphen_char;
}

int get_post_exhyphen_char(int n)
{
    struct tex_language *l = get_language(n);
    if (l == NULL)
        return -1;
    return (int) l->post_exhyphen_char;
}

@ @c
/* Add the patterns in the string |buff| to |lang|, creating the pattern
   dictionary on first use. A missing language or an empty buffer is ignored. */
void load_patterns(struct tex_language *lang, const unsigned char *buff)
{
    if (lang == NULL || buff == NULL || *buff == '\0')
        return;
    if (lang->patterns == NULL) {
        lang->patterns = hnj_hyphen_new();
    }
    hnj_hyphen_load(lang->patterns, buff);
}

/* Empty the pattern dictionary of |lang|, if it has one. */
void clear_patterns(struct tex_language *lang)
{
    if (lang == NULL)
        return;
    if (lang->patterns != NULL) {
        hnj_hyphen_clear(lang->patterns);
    }
}

/* Convert the token list |head| to a string and load it as patterns for
   language |curlang|. */
void load_tex_patterns(int curlang, halfword head)
{
    char *s = tokenlist_to_cstring(head, 1, NULL);
    load_patterns(get_language(curlang), (unsigned char *) s);
    /* the loader does not keep the buffer (the undump code frees it as well);
       NOTE(review): assumes the converted string is heap memory owned by the caller */
    free(s);
}


@ @c
/* append the lowercase form of |x| (or |x| itself when it has none) at |uindex| */
#define STORE_CHAR(x) do {                          \
        unsigned xx = get_lc_code(x);               \
        if (!xx) xx = (unsigned) (x);               \
        uindex = uni2string(uindex, xx);            \
    } while (0)

/* Cleans one word which is returned in |cleaned|,
   returns the new offset into |buffer|.

   Hyphens are dropped, an equals sign becomes a hyphen, and of a braced
   triple only the third (replacement) part is kept; everything is
   lowercased. On a syntax error or an overlong word an error is reported
   and |cleaned| is set to |NULL|. The caller frees |cleaned|. */

const char *clean_hyphenation(const char *buff, char **cleaned)
{
    int items = 0;
    /* byte buffer: first the raw input, afterwards the cleaned word; a
       lowercase code may need more bytes than the original, so there is
       room for four bytes per character, like the word buffer of the
       hyphenator */
    unsigned char word[(4 * MAX_WORD_LEN) + 1];
    unsigned uword[MAX_WORD_LEN + 1] = { 0 };    /* work buffer for unicode */
    int u = 0;                  /* unicode buffer value */
    int i = 0;                  /* index into buffer */
    char *uindex = (char *) word;
    const char *s = buff;

    /* copy the bytes of one whitespace delimited word */
    while (*s && !isspace((unsigned char) *s)) {
        word[i++] = (unsigned char) *s;
        s++;
        if ((s - buff) > MAX_WORD_LEN) {
            /* todo: this is too strict, should count unicode, not bytes */
            *cleaned = NULL;
            tex_error("exception too long", NULL);
            return s;
        }
    }
    /* now convert the input to unicode */
    word[i] = '\0';
    utf2uni_strcpy(uword, (const char *) word);

    /* build the new word string */
    i = 0;
    while (uword[i] > 0) {
        u = uword[i++];
        if (u == '-') {
            /* a plain hyphenation point leaves no trace in the key */
        } else if (u == '=') {
            STORE_CHAR('-');
        } else if (u == '{') {
            u = uword[i++];
            items = 0;
            /* skip the pre-break part */
            while (u && u != '}') {
                u = uword[i++];
            }
            if (u == '}') {
                items++;
                u = uword[i++];
            }
            /* skip the post-break part, opening brace included */
            while (u && u != '}') {
                u = uword[i++];
            }
            if (u == '}') {
                items++;
                u = uword[i++];
            }
            /* keep the replacement part */
            if (u == '{') {
                u = uword[i++];
            }
            while (u && u != '}') {
                STORE_CHAR(u);
                u = uword[i++];
            }
            if (u == '}') {
                items++;
            }
            if (items != 3) {   /* syntax error */
                *cleaned = NULL;
                tex_error("exception syntax error", NULL);
                return s;
            }
        } else {
            STORE_CHAR(u);
        }
    }
    *uindex = '\0';
    *cleaned = xstrdup((char *) word);
    return s;
}

@ @c
/* Add the whitespace separated exceptions in |buff| to |lang|. Each one is
   stored in the exceptions table of the language (kept in the Lua registry)
   with the cleaned word as key and the word as written as value. A missing
   language or buffer is ignored. */
void load_hyphenation(struct tex_language *lang, const unsigned char *buff)
{
    const char *s;
    const char *value;
    char *cleaned;
    lua_State *L = Luas;
    if (lang == NULL || buff == NULL)
        return;
    /* the table plus one key and one value */
    lua_checkstack(L, 3);
    if (lang->exceptions == 0) {
        lua_newtable(L);
        lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
    }
    lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
    s = (const char *) buff;
    while (*s) {
        while (isspace((unsigned char) *s))
            s++;
        if (*s) {
            value = s;
            s = clean_hyphenation(s, &cleaned);
            if (cleaned != NULL) {
                if ((s - value) > 0) {
                    lua_pushstring(L, cleaned);
                    lua_pushlstring(L, value, (size_t) (s - value));
                    lua_rawset(L, -3);
                }
                free(cleaned);
            } else {
#ifdef VERBOSE
                fprintf(stderr, "skipping invalid hyphenation exception: %s\n",
                        value);
#endif
            }
        }
    }
    /* drop the exceptions table again, so that the Lua stack is balanced */
    lua_pop(L, 1);
}

/* Forget all exceptions of |lang| by releasing its registry reference. */
void clear_hyphenation(struct tex_language *lang)
{
    if (lang == NULL)
        return;
    if (lang->exceptions != 0) {
        luaL_unref(Luas, LUA_REGISTRYINDEX, lang->exceptions);
        lang->exceptions = 0;
    }
}


/* Convert the token list |head| to a string and load it as exceptions for
   language |curlang|. */
void load_tex_hyphenation(int curlang, halfword head)
{
    char *s = tokenlist_to_cstring(head, 1, NULL);
    load_hyphenation(get_language(curlang), (unsigned char *) s);
    /* the loader copies what it needs into Lua;
       NOTE(review): assumes the converted string is heap memory owned by the caller */
    free(s);
}

@ TODO: clean this up. The |delete_attribute_ref()| statements are not very
 nice, but needed. Also, in the post-break, it would be nicer to get the
 attribute list from |vlink(n)|. No rush, as it is currently not used much.
@c
/* Let |g| share the attribute list of |t|, releasing the list |g| had.
   Nothing happens when |t| has no attribute list. */
static void inherit_attr_list(halfword g, halfword t)
{
    if (node_attr(t) != null) {
        delete_attribute_ref(node_attr(g));
        node_attr(g) = node_attr(t);
        attr_list_ref(node_attr(t)) += 1;
    }
}

/* Insert a discretionary after |t| with the lists |pre|, |post| and
   |replace| as its pre-break, post-break and no-break material. The new
   glyphs get the font of |replace| (or the current font when there is no
   replacement) and all material shares the attributes of |t|.
   Returns the new disc node. */
halfword insert_discretionary(halfword t, halfword pre, halfword post,
                              halfword replace)
{
    halfword g, n;
    int f;
    n = new_node(disc_node, syllable_disc);
    try_couple_nodes(n, vlink(t));
    couple_nodes(t, n);
    if (replace != null)
        f = font(replace);
    else
        f = get_cur_font();     /* for compound words following explicit hyphens */
    for (g = pre; g != null; g = vlink(g)) {
        font(g) = f;
        inherit_attr_list(g, t);
    }
    for (g = post; g != null; g = vlink(g)) {
        font(g) = f;
        inherit_attr_list(g, t);
    }
    /* the replacement glyphs keep their own font */
    for (g = replace; g != null; g = vlink(g)) {
        inherit_attr_list(g, t);
    }
    /* the node after |t| is the new disc */
    inherit_attr_list(vlink(t), t);
    t = vlink(t);
    set_disc_field(pre_break(t), pre);
    set_disc_field(post_break(t), post);
    set_disc_field(no_break(t), replace);
    return t;
}

/* Insert a syllable discretionary after |t|. Its pre-break and post-break
   lists hold one glyph each, taken from |lan|, when the respective
   character is positive. Returns the new disc node. */
halfword insert_syllable_discretionary(halfword t, lang_variables * lan)
{
    halfword g, n;
    n = new_node(disc_node, syllable_disc);
    couple_nodes(n, vlink(t));
    couple_nodes(t, n);
    /* the disc shares the attribute list of |t| */
    delete_attribute_ref(node_attr(n));
    if (node_attr(t) != null) {
        node_attr(n) = node_attr(t);
        attr_list_ref(node_attr(t))++;
    } else {
        node_attr(n) = null;
    }
    if (lan->pre_hyphen_char > 0) {
        /* the pre-break glyph is modelled after the glyph before the break */
        g = raw_glyph_node();
        set_to_character(g);
        character(g) = lan->pre_hyphen_char;
        font(g) = font(t);
        lang_data(g) = lang_data(t);
        if (node_attr(t) != null) {
            node_attr(g) = node_attr(t);
            attr_list_ref(node_attr(t))++;
        }
        set_disc_field(pre_break(n), g);
    }

    if (lan->post_hyphen_char > 0) {
        /* the post-break glyph is modelled after the node following the disc */
        t = vlink(n);
        g = raw_glyph_node();
        set_to_character(g);
        character(g) = lan->post_hyphen_char;
        font(g) = font(t);
        lang_data(g) = lang_data(t);
        if (node_attr(t) != null) {
            node_attr(g) = node_attr(t);
            attr_list_ref(node_attr(t)) += 1;
        }
        set_disc_field(post_break(n), g);
    }
    return n;
}

/* Insert a discretionary after |t| whose pre-break and post-break lists
   hold the explicit hyphen characters of |lan|, for those that are
   positive. Returns the new disc node. */
halfword insert_word_discretionary(halfword t, lang_variables * lan)
{
    halfword pre = null, pos = null;
    if (lan->pre_exhyphen_char > 0)
        pre = insert_character(null, lan->pre_exhyphen_char);
    if (lan->post_exhyphen_char > 0)
        pos = insert_character(null, lan->post_exhyphen_char);
    return insert_discretionary(t, pre, pos, null);
}

@ @c
/* Insert a compound word break after |t|, using the explicit hyphen
   characters of language |clang|. Returns the new disc node. */
halfword compound_word_break(halfword t, int clang)
{
    int disc;
    lang_variables langdata;
    /* only the two explicit hyphen fields are read by the callee */
    langdata.pre_exhyphen_char = get_pre_exhyphen_char(clang);
    langdata.post_exhyphen_char = get_post_exhyphen_char(clang);
    disc = insert_word_discretionary(t, &langdata);
    return disc;
}


/* Same as |insert_discretionary|; the language data is not used. */
halfword insert_complex_discretionary(halfword t, lang_variables * lan,
                                      halfword pre, halfword pos,
                                      halfword replace)
{
    (void) lan;
    return insert_discretionary(t, pre, pos, replace);
}


/* Create a glyph node for character |c| and, when |t| is not null, link it
   after |t|. Returns the new glyph. */
halfword insert_character(halfword t, int c)
{
    halfword p;
    p = new_node(glyph_node, 0);
    set_to_character(p);
    character(p) = c;
    if (t != null) {
        couple_nodes(t, p);
    }
    return p;
}

@ @c
/* Make the list |t| the content of the disc field |f|, keeping the tail
   pointer of the field up to date; a null |t| empties the field. */
void set_disc_field(halfword f, halfword t)
{
    if (t != null) {
        couple_nodes(f, t);
        tlink(f) = tail_of_list(t);
    } else {
        vlink(f) = null;
        tlink(f) = null;
    }
}



@ @c
/* Look up the word |w| in the exceptions table with registry reference
   |exceptions|. Returns a fresh copy of the stored exception, to be freed
   by the caller, or |NULL| when there is none. */
static char *hyphenation_exception(int exceptions, char *w)
{
    char *ret = NULL;
    lua_State *L = Luas;
    lua_checkstack(L, 2);
    lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions);
    if (lua_istable(L, -1)) {   /* ?? */
        lua_pushstring(L, w);   /* word table */
        lua_rawget(L, -2);
        if (lua_isstring(L, -1)) {
            ret = xstrdup(lua_tostring(L, -1));
        }
        lua_pop(L, 2);          /* value and table */
    } else {
        lua_pop(L, 1);
    }
    return ret;
}


@ @c
/* Join all exceptions of |lang| into one string in which every exception
   is preceded by a space. Returns |NULL| when there are none; otherwise the
   caller frees the result. */
char *exception_strings(struct tex_language *lang)
{
    const char *value;
    size_t size = 0, current = 0;
    size_t l = 0;
    char *ret = NULL;
    lua_State *L = Luas;
    if (lang == NULL || lang->exceptions == 0)
        return NULL;
    /* the table, a key and a value */
    lua_checkstack(L, 3);
    lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
    if (lua_istable(L, -1)) {
        /* iterate and join */
        lua_pushnil(L);         /* first key */
        while (lua_next(L, -2) != 0) {
            value = lua_tolstring(L, -1, &l);
            if (value != NULL) {        /* values that are not strings are skipped */
                if (current + 2 + l > size) {
                    size = (size + size / 5) + current + l + 1024;
                    ret = xrealloc(ret, (unsigned) size);
                }
                *(ret + current) = ' ';
                memcpy(ret + current + 1, value, l);
                *(ret + current + 1 + l) = '\0';
                current += l + 1;
            }
            lua_pop(L, 1);      /* the value; the key stays for the next round */
        }
    }
    /* drop the exceptions table again, so that the Lua stack is balanced */
    lua_pop(L, 1);
    return ret;
}


@ the sequence from |wordstart| to |r| can contain only normal characters
it could be faster to modify a halfword pointer and return an integer

@c
/* Build a glyph list from the braced part of |uword| that opens right after
   position |j|, and move |j| to the closing brace. Returns the head of the
   list, or null for an empty part. */
static halfword find_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    halfword g = null, gg = null;
    register unsigned i = *j;
    i++;                        /* this puts uword[i] on the |{| */
    while (i < (unsigned) len && uword[i + 1] != '}') {
        if (g == null) {
            gg = new_char(0, (int) uword[i + 1]);
            g = gg;
        } else {
            halfword s = new_char(0, (int) uword[i + 1]);
            couple_nodes(g, s);
            g = vlink(g);
        }
        i++;
    }
    *j = ++i;
    return gg;
}

/* Count the characters in the braced part of |uword| that opens right after
   position |j|, and move |j| to the closing brace. */
static int count_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    int ret = 0;
    register unsigned i = *j;
    i++;                        /* this puts uword[i] on the |{| */
    while (i < (unsigned) len && uword[i + 1] != '}') {
        ret++;
        i++;
    }
    *j = ++i;
    return ret;
}


@ @c
static const char *PAT_ERROR[] = {
    "Exception discretionaries should contain three pairs of braced items.",
    "No intervening spaces are allowed.",
    NULL
};

/* Apply the exception |replacement| to the word that starts at |wordstart|
   and ends just before |r|: the exception is walked in step with the node
   list and a discretionary is inserted for every hyphen or braced triple. */
static void do_exception(halfword wordstart, halfword r, char *replacement)
{
    unsigned i;
    halfword t;
    unsigned len;
    int clang;
    lang_variables langdata;
    unsigned uword[MAX_WORD_LEN + 1] = { 0 };
    utf2uni_strcpy(uword, replacement);
    len = u_length(uword);
    i = 0;
    t = wordstart;
    clang = char_lang(wordstart);
    langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
    langdata.post_hyphen_char = get_post_hyphen_char(clang);

    /* the test always looks one position ahead, at |uword[i + 1]| */
    for (i = 0; i < len; i++) {
        if (uword[i + 1] == '-') {      /* a hyphen follows */
            while (vlink(t) != r
                   && (type(t) != glyph_node || !is_simple_character(t)))
                t = vlink(t);
            if (vlink(t) == r)
                break;
            insert_syllable_discretionary(t, &langdata);
            t = vlink(t);       /* skip the new disc */
        } else if (uword[i + 1] == '=') {
            /* do nothing ? */
            t = vlink(t);
        } else if (uword[i + 1] == '{') {
            halfword gg, hh, replace = null;
            int repl;
            gg = find_exception_part(&i, uword, (int) len);
            if (i == len || uword[i + 1] != '{') {
                tex_error("broken pattern 1", PAT_ERROR);
            }
            hh = find_exception_part(&i, uword, (int) len);
            if (i == len || uword[i + 1] != '{') {
                tex_error("broken pattern 2", PAT_ERROR);
            }
            repl = count_exception_part(&i, uword, (int) len);
            if (i == len) {
                tex_error("broken pattern 3", PAT_ERROR);
            }
            /*i++; *//* jump over the last right brace */
            if (vlink(t) == r)
                break;
            if (repl > 0) {
                /* cut the next |repl| glyphs out of the list, they become
                   the no-break text of the discretionary */
                halfword q = t;
                replace = vlink(q);
                /* stop at the last node instead of stepping onto null */
                while (repl > 0 && vlink(q) != null) {
                    q = vlink(q);
                    if (type(q) == glyph_node) {
                        repl--;
                    }
                }
                try_couple_nodes(t, vlink(q));
                vlink(q) = null;
            }
            t = insert_discretionary(t, gg, hh, replace);
            t = vlink(t);       /* skip the new disc */
        } else {
            t = vlink(t);
        }
    }
}

@ This is a documentation section from the pascal web file. It is not
true any more, but I do not have time right now to rewrite it -- Taco

When the line-breaking routine is unable to find a feasible sequence of
breakpoints, it makes a second pass over the paragraph, attempting to
hyphenate the hyphenatable words. The goal of hyphenation is to insert
discretionary material into the paragraph so that there are more
potential places to break.

The general rules for hyphenation are somewhat complex and technical,
because we want to be able to hyphenate words that are preceded or
followed by punctuation marks, and because we want the rules to work
for languages other than English. We also must contend with the fact
that hyphens might radically alter the ligature and kerning structure
of a word.

A sequence of characters will be considered for hyphenation only if it
belongs to a ``potentially hyphenatable part'' of the current paragraph.
This is a sequence of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node,
$p_1\ldots p_{m-1}$ are either character or ligature or whatsit or
implicit kern nodes, and $p_m$ is a glue or penalty or insertion or adjust
or mark or whatsit or explicit kern node. (Therefore hyphenation is
disabled by boxes, math formulas, and discretionary nodes already inserted
by the user.) The ligature nodes among $p_1\ldots p_{m-1}$ are effectively
expanded into the original non-ligature characters; the kern nodes and
whatsits are ignored. Each character |c| is now classified as either a
nonletter (if |lc_code(c)=0|), a lowercase letter (if
|lc_code(c)=c|), or an uppercase letter (otherwise); an uppercase letter
is treated as if it were |lc_code(c)| for purposes of hyphenation. The
characters generated by $p_1\ldots p_{m-1}$ may begin with nonletters; let
$c_1$ be the first letter that is not in the middle of a ligature. Whatsit
nodes preceding $c_1$ are ignored; a whatsit found after $c_1$ will be the
terminating node $p_m$. All characters that do not have the same font as
$c_1$ will be treated as nonletters. The |hyphen_char| for that font
must be between 0 and 255, otherwise hyphenation will not be attempted.
\TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as
possible; however, |n| must be less than 64, so a character that would
otherwise be $c_{64}$ is effectively not a letter. Furthermore $c_n$ must
not be in the middle of a ligature. In this way we obtain a string of
letters $c_1\ldots c_n$ that are generated by nodes $p_a\ldots p_b$, where
|1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|, this string qualifies for hyphenation;
however, |uc_hyph| must be positive, if $c_1$ is uppercase.

The hyphenation process takes place in three stages.
First, the candidate
sequence $c_1\ldots c_n$ is found; then potential positions for hyphens
are determined by referring to hyphenation tables; and finally, the nodes
$p_a\ldots p_b$ are replaced by a new sequence of nodes that includes the
discretionary breaks found.

Fortunately, we do not have to do all this calculation very often, because
of the way it has been taken out of \TeX's inner loop. For example, when
the second edition of the author's 700-page book {\sl Seminumerical
Algorithms} was typeset by \TeX, only about 1.2 hyphenations needed to be
@^Knuth, Donald Ervin@>
tried per paragraph, since the line breaking algorithm needed to use two
passes on only about 5 per cent of the paragraphs.


When a word has been set up to contain a candidate for hyphenation,
\TeX\ first looks to see if it is in the user's exception dictionary. If not,
hyphens are inserted based on patterns that appear within the given word,
using an algorithm due to Frank~M. Liang.
@^Liang, Franklin Mark@>


@ This is incompatible with \TeX\ because the first word of a paragraph
can be hyphenated, but most European users seem to agree that
prohibiting hyphenation there was not the best idea ever.
@c
/* Starting at |r|, find the first glyph that can begin a hyphenatable
   word. Math is skipped as a whole, and explicit hyphens met on the way get
   an automatic discretionary. Returns null when the list ends first. */
static halfword find_next_wordstart(halfword r)
{
    register int l;
    register int start_ok = 1;
    int mathlevel;
    int chr;
    halfword t;
    while (r != null) {
        switch (type(r)) {
        case whatsit_node:
            break;
        case glue_node:
            start_ok = 1;
            break;
        case math_node:
            /* skip until the matching end of math; the level starts at one
               for every formula, not only for the first one in the list */
            mathlevel = 1;
            while (mathlevel > 0) {
                r = vlink(r);
                if (r == null)
                    return r;
                if (type(r) == math_node) {
                    if (subtype(r) == before) {
                        mathlevel++;
                    } else {
                        mathlevel--;
                    }
                }
            }
            break;
        case glyph_node:
            if (is_simple_character(r)) {
                chr = character(r);
                if (chr == ex_hyphen_char) {
                    /* We only accept an explicit hyphen when there is a preceding glyph and  */
                    /* we skip a sequence of explicit hyphens as that normally indicates a    */
                    /* -- or --- ligature in which case we can in a worse case usage get bad  */
                    /* node lists later on due to messed up ligature building as these dashes */
                    /* ligatures in base fonts. This is a side effect of the separating the   */
                    /* hyphenation, ligaturing and kerning steps. A test is cmr with ------.  */
                    t = vlink(r);
                    if ((start_ok > 0) && (t != null) && (type(t) == glyph_node)
                        && (character(t) != ex_hyphen_char)) {
                        t = compound_word_break(r, char_lang(r));
                        subtype(t) = automatic_disc;
                        start_ok = 1;
                    } else {
                        start_ok = 0;
                    }
                } else if (start_ok && (l = get_lc_code(chr)) > 0) {
                    /* a letter: it starts a word unless it is an uppercase
                       letter in a context where those are not hyphenated */
                    if (char_uchyph(r) || l == chr) {
                        return r;
                    } else {
                        start_ok = 0;
                    }
                }
            }
            break;
        default:
            start_ok = 0;
            break;
        }
        r = vlink(r);
    }
    return r;
}

@ @c
/* Tell whether the node list starting at |s| ends a word in an acceptable
   way: after glyphs of the same language and normal kerns there must be the
   end of the list, a glyph of another language, or one of the node types
   tested below. Returns 1 for a valid word end, 0 otherwise. */
static int valid_wordend(halfword s)
{
    register halfword r = s;
    register int clang;
    if (r == null)
        return 1;
    /* only look at the language once we know there is a node */
    clang = char_lang(s);
    while ((r != null) && ((type(r) == glyph_node && is_simple_character(r)
                            && clang == char_lang(r)) ||
                           (type(r) == kern_node && (subtype(r) == normal))
           )) {
        r = vlink(r);
    }
    if (r == null || (type(r) == glyph_node && is_simple_character(r)
                      && clang != char_lang(r)) || type(r) == glue_node
        || type(r) == whatsit_node || type(r) == ins_node
        || type(r) == adjust_node || type(r) == penalty_node
        || (type(r) == kern_node
            && (subtype(r) == explicit || subtype(r) == acc_kern)))
        return 1;
    return 0;
}

@ @c
/* Hyphenate the words in the node list from |head| to |tail|. For each
   word an exception is applied when there is one; otherwise explicit
   hyphens get a discretionary, or else the patterns of the language are
   used. A temporary penalty after |tail| guards the end of the list. */
void hnj_hyphenation(halfword head, halfword tail)
{
    int lchar, i;
    struct tex_language *lang;
    lang_variables langdata;
    char utf8word[(4 * MAX_WORD_LEN) + 1] = { 0 };
    int wordlen = 0;
    char *hy = utf8word;
    char *replacement = NULL;
    boolean explicit_hyphen = false;
    boolean word_fits = true;   /* false when the word overflows |utf8word| */
    halfword s, r = head, wordstart = null, save_tail1 = null, left =
        null, right = null;

    /* this first movement assures two things:
       \item{a)} that we won't waste lots of time on something that has been
       handled already (in that case, none of the glyphs match |simple_character|).
       \item{b)} that the first word can be hyphenated. if the movement was
       not explicit, then the indentation at the start of a paragraph
       list would make |find_next_wordstart()| look too far ahead.
     */

    while (r != null && (type(r) != glyph_node || !is_simple_character(r))) {
        r = vlink(r);
    }
    /* this will make |r| a glyph node with subtype character */
    r = find_next_wordstart(r);
    if (r == null)
        return;

    assert(tail != null);
    save_tail1 = vlink(tail);
    s = new_penalty(0);
    couple_nodes(tail, s);

    while (r != null) {         /* could be while(1), but let's be paranoid */
        int clang, lhmin, rhmin;
        halfword hyf_font;
        halfword end_word = r;
        wordstart = r;
        assert(is_simple_character(wordstart));
        hyf_font = font(wordstart);
        if (hyphen_char(hyf_font) < 0)  /* for backward compat */
            hyf_font = 0;
        clang = char_lang(wordstart);
        lhmin = char_lhmin(wordstart);
        rhmin = char_rhmin(wordstart);
        langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
        langdata.post_hyphen_char = get_post_hyphen_char(clang);
        /* collect the word: letters and explicit hyphens of one language */
        while (r != null && type(r) == glyph_node && is_simple_character(r)
               && clang == char_lang(r)
               && (((lchar = get_lc_code(character(r))) > 0)
                   || (character(r) == ex_hyphen_char
                       && (lchar = ex_hyphen_char)))) {
            if (character(r) == ex_hyphen_char)
                explicit_hyphen = true;
            wordlen++;
            /* the buffer is sized for four bytes per character, so only
               store while that much room (and the terminator) is left */
            if ((size_t) (hy - utf8word) + 5 <= sizeof(utf8word)) {
                hy = uni2string(hy, (unsigned) lchar);
            } else {
                word_fits = false;
            }
            /* this should not be needed any more */
            /*if (vlink(r)!=null) alink(vlink(r))=r; */
            end_word = r;
            r = vlink(r);
        }
        lang = NULL;
        if (clang >= 0 && clang < MAX_TEX_LANGUAGES)
            lang = tex_languages[clang];
        /* a word that did not fit is left alone */
        if (word_fits && valid_wordend(r) && wordlen >= lhmin + rhmin
            && (hyf_font != 0) && lang != NULL) {
            *hy = 0;
            if (lang->exceptions != 0 &&
                (replacement =
                 hyphenation_exception(lang->exceptions, utf8word)) != NULL) {
#ifdef VERBOSE
                fprintf(stderr, "replacing %s (c=%d) by %s\n", utf8word, clang,
                        replacement);
#endif
                do_exception(wordstart, r, replacement);
                free(replacement);
            } else if (explicit_hyphen == true) {
                /* insert an explicit discretionary after each of the last in a
                   set of explicit hyphens */
                halfword rr = r;
                halfword t = null;
#ifdef VERBOSE
                fprintf(stderr, "explicit hyphen(s) found in %s (c=%d)\n", utf8word, clang);
#endif
                /* walk backwards from the end of the word */
                while (rr != wordstart) {
                    if (is_simple_character(rr)) {
                        if (character(rr) == ex_hyphen_char) {
                            t = compound_word_break(rr, clang);
                            subtype(t) = automatic_disc;
                            while (character(alink(rr)) == ex_hyphen_char)
                                rr = alink(rr);
                            if (rr == wordstart)
                                break;
                        }
                    }
                    rr = alink(rr);
                }

            } else if (lang->patterns != NULL) {

                /* |left| and |right| bound the part where breaks are allowed */
                left = wordstart;
                for (i = lhmin; i > 1; i--) {
                    left = vlink(left);
                    while (!is_simple_character(left))
                        left = vlink(left);
                }
                right = r;
                for (i = rhmin; i > 0; i--) {
                    right = alink(right);
                    while (!is_simple_character(right))
                        right = alink(right);
                }

#ifdef VERBOSE
                fprintf(stderr, "hyphenate %s (c=%d,l=%d,r=%d) from %c to %c\n",
                        utf8word, clang, lhmin, rhmin, character(left),
                        character(right));
#endif
                (void) hnj_hyphen_hyphenate(lang->patterns, wordstart, end_word,
                                            wordlen, left, right, &langdata);
            }
        }
        /* reset the word state for the next round */
        explicit_hyphen = false;
        word_fits = true;
        wordlen = 0;
        hy = utf8word;
        if (r == null)
            break;
        r = find_next_wordstart(r);
    }
    /* remove the temporary penalty and restore the old link of |tail| */
    flush_node(vlink(tail));
    vlink(tail) = save_tail1;
}


@ @c
/* Hyphenate the list from |head| to |tail|: through the hyphenate callback
   when one is defined, with the built-in hyphenator when the callback id
   is zero. */
void new_hyphenation(halfword head, halfword tail)
{
    register int callback_id = 0;
    if (head == null || vlink(head) == null)
        return;
    fix_node_list(head);
    callback_id = callback_defined(hyphenate_callback);
    if (callback_id > 0) {
        lua_State *L = Luas;
        if (!get_callback(L, callback_id)) {
            lua_pop(L, 2);
            return;
        }
        nodelist_to_lua(L, head);
        nodelist_to_lua(L, tail);
        if (lua_pcall(L, 2, 0, 0) != 0) {
            /* NOTE(review): the message is popped before the error is
               raised; confirm that this is what is wanted here */
            fprintf(stdout, "error: %s\n", lua_tostring(L, -1));
            lua_pop(L, 2);
            lua_error(L);
            return;
        }
        lua_pop(L, 1);
    } else if (callback_id == 0) {
        hnj_hyphenation(head, tail);
    }
}

@ dumping and undumping languages

@c
/* dump the string |a| as its size (terminator included) followed by its
   bytes; a |NULL| string is dumped as size zero. Uses the local |x|. */
#define dump_string(a) do {                     \
    if (a!=NULL) {                              \
        x = (int)strlen(a)+1;                   \
        dump_int(x);  dump_things(*a, x);       \
    } else {                                    \
        x = 0; dump_int(x);                     \
    }                                           \
  } while (0)


/* Dump language |i|: its id, the four hyphen characters, the serialized
   patterns and the joined exceptions. The record is released afterwards. */
static void dump_one_language(int i)
{
    char *s = NULL;
    int x = 0;
    struct tex_language *lang;
    lang = tex_languages[i];
    dump_int(lang->id);
    dump_int(lang->pre_hyphen_char);
    dump_int(lang->post_hyphen_char);
    dump_int(lang->pre_exhyphen_char);
    dump_int(lang->post_exhyphen_char);
    if (lang->patterns != NULL) {
        s = (char *) hnj_serialize(lang->patterns);
    }
    dump_string(s);
    if (s != NULL) {
        free(s);
        s = NULL;
    }
    if (lang->exceptions != 0)
        s = exception_strings(lang);
    dump_string(s);
    if (s != NULL) {
        free(s);
    }
    free(lang);
    /* do not leave a pointer to the released record in the table */
    tex_languages[i] = NULL;
}

/* Dump the number of language slots in use, then for every slot a flag
   that tells whether a language follows, and the language itself. */
void dump_language_data(void)
{
    int i;
    dump_int(next_lang_id);
    for (i = 0; i < next_lang_id; i++) {
        if (tex_languages[i]) {
            dump_int(1);
            dump_one_language(i);
        } else {
            dump_int(0);
        }
    }
}


/* Read back language |i| in the order used by |dump_one_language|. When no
   record for |i| can be obtained the data is still read, so that the
   stream stays in step, but it is not stored. */
static void undump_one_language(int i)
{
    char *s = NULL;
    int x = 0;
    struct tex_language *lang = get_language(i);
    undump_int(x);
    if (lang != NULL)
        lang->id = x;
    undump_int(x);
    if (lang != NULL)
        lang->pre_hyphen_char = x;
    undump_int(x);
    if (lang != NULL)
        lang->post_hyphen_char = x;
    undump_int(x);
    if (lang != NULL)
        lang->pre_exhyphen_char = x;
    undump_int(x);
    if (lang != NULL)
        lang->post_exhyphen_char = x;
    /* patterns */
    undump_int(x);
    if (x > 0) {
        s = xmalloc((unsigned) x);
        undump_things(*s, x);
        s[x - 1] = '\0';        /* the dumped size includes the terminator */
        load_patterns(lang, (unsigned char *) s);
        free(s);
    }
    /* exceptions */
    undump_int(x);
    if (x > 0) {
        s = xmalloc((unsigned) x);
        undump_things(*s, x);
        s[x - 1] = '\0';        /* the dumped size includes the terminator */
        load_hyphenation(lang, (unsigned char *) s);
        free(s);
    }
}

/* Read back what |dump_language_data| wrote: the number of language slots,
   then for every slot a flag, followed by the language when the flag is 1. */
void undump_language_data(void)
{
    int slot = 0;
    int present, count;
    undump_int(count);
    next_lang_id = count;
    while (slot < count) {
        undump_int(present);
        if (present == 1)
            undump_one_language(slot);
        slot++;
    }
}


@ When \TeX\ has scanned `\.{\\hyphenation}', it calls on a procedure named
|new_hyph_exceptions| to do the right thing.

@c
/* Scan the token list that follows and load it as exceptions for the
   current language; the scanned list is flushed afterwards. */
void new_hyph_exceptions(void)
{                               /* enters new exceptions */
    int cur_lang_id;
    (void) scan_toks(false, true);
    cur_lang_id = int_par(language_code);
    load_tex_hyphenation(cur_lang_id, def_ref);
    flush_list(def_ref);
}

@ Similarly, when \TeX\ has scanned `\.{\\patterns}', it calls on a
procedure named |new_patterns|.

@c
/* Scan the token list that follows and load it as patterns for the
   current language; the scanned list is flushed afterwards. */
void new_patterns(void)
{                               /* initializes the hyphenation pattern data */
    int cur_lang_id;
    (void) scan_toks(false, true);
    cur_lang_id = int_par(language_code);
    load_tex_patterns(cur_lang_id, def_ref);
    flush_list(def_ref);
}

@ `\.{\\prehyphenchar}', sets the |pre_break| character, and
`\.{\\posthyphenchar}' the |post_break| character. Their respective
defaults are ascii hyphen ("-") and zero (nul).

@c
/* Scan an optional equals sign and an integer, and store the integer as
   the pre-break hyphen character of the current language. */
void new_pre_hyphen_char(void)
{
    int cur_lang_id;
    scan_optional_equals();
    scan_int();
    cur_lang_id = int_par(language_code);
    set_pre_hyphen_char(cur_lang_id, cur_val);
}

/* As above, for the post-break hyphen character. */
void new_post_hyphen_char(void)
{
    int cur_lang_id;
    scan_optional_equals();
    scan_int();
    cur_lang_id = int_par(language_code);
    set_post_hyphen_char(cur_lang_id, cur_val);
}


@ `\.{\\preexhyphenchar}', sets the |pre_break| character, and
`\.{\\postexhyphenchar}' the |post_break| character. Their
defaults are both zero (nul).

@c
/* Scan an optional equals sign and an integer, and store the integer as
   the pre-break character for explicit hyphens of the current language. */
void new_pre_exhyphen_char(void)
{
    int cur_lang_id;
    scan_optional_equals();
    scan_int();
    cur_lang_id = int_par(language_code);
    set_pre_exhyphen_char(cur_lang_id, cur_val);
}

/* As above, for the post-break character for explicit hyphens. */
void new_post_exhyphen_char(void)
{
    int cur_lang_id;
    scan_optional_equals();
    scan_int();
    cur_lang_id = int_par(language_code);
    set_post_exhyphen_char(cur_lang_id, cur_val);
}