1% texlang.w
2%
3% Copyright 2006-2012 Taco Hoekwater <taco@@luatex.org>
4%
5% This file is part of LuaTeX.
6%
7% LuaTeX is free software; you can redistribute it and/or modify it under
8% the terms of the GNU General Public License as published by the Free
9% Software Foundation; either version 2 of the License, or (at your
10% option) any later version.
11%
12% LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14% FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15% License for more details.
16%
17% You should have received a copy of the GNU General Public License along
18% with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
19
20@ @c
21
22
23#include "ptexlib.h"
24#include <string.h>
25#include "lua/luatex-api.h"
26
27@ Low-level helpers
28
29@ @c
/* |VERBOSE| is left undefined here (note the |no| prefix), so the tracing
   code guarded by |VERBOSE| further down is compiled out. */
30#define noVERBOSE
31
/* Number of slots in the |tex_languages| table below. */
32#define MAX_TEX_LANGUAGES  16384
33
/* Shorthand for the integer parameter selected by |ex_hyphen_char_code|. */
34#define ex_hyphen_char int_par(ex_hyphen_char_code)
35
/* Language records indexed by language id; a |NULL| entry means that the
   id has no record yet. */
36static struct tex_language *tex_languages[MAX_TEX_LANGUAGES] = { NULL };
37
/* Where |new_language| starts looking for a free slot when it is called
   with a negative id; |dump_language_data| dumps all ids below it. */
38static int next_lang_id = 0;
39
40struct tex_language *new_language(int n)
41{
42    struct tex_language *lang;
43    unsigned l;
44    if (n >= 0) {
45        l = (unsigned) n;
46        if (l != (MAX_TEX_LANGUAGES - 1))
47            if (next_lang_id <= n)
48                next_lang_id = n + 1;
49    } else {
50        while (tex_languages[next_lang_id] != NULL)
51            next_lang_id++;
52        l = (unsigned) next_lang_id++;
53    }
54    if (l < (MAX_TEX_LANGUAGES - 1) && tex_languages[l] == NULL) {
55        lang = xmalloc(sizeof(struct tex_language));
56        tex_languages[l] = lang;
57        lang->id = (int) l;
58        lang->exceptions = 0;
59        lang->patterns = NULL;
60        lang->pre_hyphen_char = '-';
61        lang->post_hyphen_char = 0;
62        lang->pre_exhyphen_char = 0;
63        lang->post_exhyphen_char = 0;
64        return lang;
65    } else {
66        return NULL;
67    }
68}
69
70struct tex_language *get_language(int n)
71{
72    if (n >= 0 && n < MAX_TEX_LANGUAGES) {
73        if (tex_languages[n] != NULL) {
74            return tex_languages[n];
75        } else {
76            return new_language(n);
77        }
78    } else {
79        return NULL;
80    }
81}
82
83@ @c
84void set_pre_hyphen_char(int n, int v)
85{
86    struct tex_language *l = get_language((int) n);
87    if (l != NULL)
88        l->pre_hyphen_char = (int) v;
89}
90
91void set_post_hyphen_char(int n, int v)
92{
93    struct tex_language *l = get_language((int) n);
94    if (l != NULL)
95        l->post_hyphen_char = (int) v;
96}
97
98
99void set_pre_exhyphen_char(int n, int v)
100{
101    struct tex_language *l = get_language((int) n);
102    if (l != NULL)
103        l->pre_exhyphen_char = (int) v;
104}
105
106void set_post_exhyphen_char(int n, int v)
107{
108    struct tex_language *l = get_language((int) n);
109    if (l != NULL)
110        l->post_exhyphen_char = (int) v;
111}
112
113
114int get_pre_hyphen_char(int n)
115{
116    struct tex_language *l = get_language((int) n);
117    if (l == NULL)
118        return -1;
119    return (int) l->pre_hyphen_char;
120}
121
122int get_post_hyphen_char(int n)
123{
124    struct tex_language *l = get_language((int) n);
125    if (l == NULL)
126        return -1;
127    return (int) l->post_hyphen_char;
128}
129
130
131int get_pre_exhyphen_char(int n)
132{
133    struct tex_language *l = get_language((int) n);
134    if (l == NULL)
135        return -1;
136    return (int) l->pre_exhyphen_char;
137}
138
139int get_post_exhyphen_char(int n)
140{
141    struct tex_language *l = get_language((int) n);
142    if (l == NULL)
143        return -1;
144    return (int) l->post_exhyphen_char;
145}
146
147@ @c
148void load_patterns(struct tex_language *lang, const unsigned char *buff)
149{
150    if (lang == NULL || buff == NULL || strlen((const char *) buff) == 0)
151        return;
152    if (lang->patterns == NULL) {
153        lang->patterns = hnj_hyphen_new();
154    }
155    hnj_hyphen_load(lang->patterns, buff);
156}
157
158void clear_patterns(struct tex_language *lang)
159{
160    if (lang == NULL)
161        return;
162    if (lang->patterns != NULL) {
163        hnj_hyphen_clear(lang->patterns);
164    }
165}
166
167void load_tex_patterns(int curlang, halfword head)
168{
169    char *s = tokenlist_to_cstring(head, 1, NULL);
170    load_patterns(get_language(curlang), (unsigned char *) s);
171}
172
173
174@ @c
175#define STORE_CHAR(x) do {                          \
176	unsigned xx = get_lc_code(x);               \
177        if (!xx) xx = x;                            \
178        uindex = uni2string(uindex, xx);            \
179    } while (0)
180
181/* Cleans one word which is returned in |cleaned|,
182   returns the new offset into |buffer| */
183
184const char *clean_hyphenation(const char *buff, char **cleaned)
185{
186    int items = 0;
187    unsigned char word[MAX_WORD_LEN + 1]; /* work buffer for bytes */
188    unsigned uword[MAX_WORD_LEN + 1] = { 0 };  /* work buffer for unicode */
189    int u = 0; /* unicode buffer value */
190    int i = 0; /* index into buffer */
191    char *uindex = (char *)word;
192    const char *s = buff;
193
194    while (*s && !isspace((unsigned char)*s)) {
195	word[i++] = (unsigned)*s;
196	s++;
197        if ((s-buff)>MAX_WORD_LEN) {
198            /* todo: this is too strict, should count unicode, not bytes */
199    	    *cleaned = NULL;
200            tex_error("exception too long", NULL);
201            return s;
202        }
203    }
204    /* now convert the input to unicode */
205    word[i] = '\0';
206    utf2uni_strcpy(uword, (const char *)word);
207
208    /* build the new word string */
209    i = 0;
210    while (uword[i]>0) {
211	u = uword[i++];
212        if (u == '-') {        /* skip */
213        } else if (u == '=') {
214            STORE_CHAR('-');
215        } else if (u == '{') {
216            u = uword[i++];
217            items = 0;
218            while (u && u != '}') {
219                u = uword[i++];
220            }
221            if (u == '}') {
222                items++;
223                u = uword[i++];
224            }
225            while (u && u != '}') {
226                u = uword[i++];
227            }
228            if (u == '}') {
229                items++;
230                u = uword[i++];;
231            }
232            if (u == '{') {
233                u = uword[i++];;
234            }
235            while (u && u != '}') {
236                STORE_CHAR(u);
237                u = uword[i++];
238            }
239            if (u == '}') {
240	        items++;
241            }
242            if (items != 3) {   /* syntax error */
243                *cleaned = NULL;
244                tex_error("exception syntax error", NULL);
245                return s;
246            }
247        } else {
248            STORE_CHAR(u);
249        }
250    }
251    *uindex = '\0';
252    *cleaned = xstrdup((char *) word);
253    return s;
254}
255
256@ @c
257void load_hyphenation(struct tex_language *lang, const unsigned char *buff)
258{
259    const char *s;
260    const char *value;
261    char *cleaned;
262    lua_State *L = Luas;
263    if (lang == NULL)
264        return;
265    if (lang->exceptions == 0) {
266        lua_newtable(L);
267        lang->exceptions = luaL_ref(L, LUA_REGISTRYINDEX);
268    }
269    lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
270    s = (const char *) buff;
271    while (*s) {
272        while (isspace((unsigned char)*s))
273            s++;
274        if (*s) {
275            value = s;
276            s = clean_hyphenation(s, &cleaned);
277            if (cleaned != NULL) {
278                if ((s - value) > 0) {
279                    lua_pushstring(L, cleaned);
280                    lua_pushlstring(L, value, (size_t) (s - value));
281                    lua_rawset(L, -3);
282                }
283                free(cleaned);
284            } else {
285#ifdef VERBOSE
286                fprintf(stderr, "skipping invalid hyphenation exception: %s\n",
287                        value);
288#endif
289            }
290        }
291    }
292}
293
294void clear_hyphenation(struct tex_language *lang)
295{
296    if (lang == NULL)
297        return;
298    if (lang->exceptions != 0) {
299        luaL_unref(Luas, LUA_REGISTRYINDEX, lang->exceptions);
300        lang->exceptions = 0;
301    }
302}
303
304
305void load_tex_hyphenation(int curlang, halfword head)
306{
307    char *s = tokenlist_to_cstring(head, 1, NULL);
308    load_hyphenation(get_language(curlang), (unsigned char *) s);
309}
310
311@ TODO: clean this up. The |delete_attribute_ref()| statements are not very
312   nice, but needed. Also, in the post-break, it would be nicer to get the
313   attribute list from |vlink(n)|. No rush, as it is currently not used much.
314
315@c
316halfword insert_discretionary(halfword t, halfword pre, halfword post,
317                              halfword replace)
318{
319    halfword g, n;
320    int f;
321    n = new_node(disc_node, syllable_disc);
322    try_couple_nodes(n, vlink(t));
323    couple_nodes(t, n);
324    if (replace != null)
325        f = font(replace);
326    else
327        f = get_cur_font();     /* for compound words following explicit hyphens */
328    for (g = pre; g != null; g = vlink(g)) {
329        font(g) = f;
330        if (node_attr(t) != null) {
331            delete_attribute_ref(node_attr(g));
332            node_attr(g) = node_attr(t);
333            attr_list_ref(node_attr(t)) += 1;
334        }
335    }
336    for (g = post; g != null; g = vlink(g)) {
337        font(g) = f;
338        if (node_attr(t) != null) {
339            delete_attribute_ref(node_attr(g));
340            node_attr(g) = node_attr(t);
341            attr_list_ref(node_attr(t)) += 1;
342        }
343    }
344    for (g = replace; g != null; g = vlink(g)) {
345        if (node_attr(t) != null) {
346            delete_attribute_ref(node_attr(g));
347            node_attr(g) = node_attr(t);
348            attr_list_ref(node_attr(t)) += 1;
349        }
350    }
351    if (node_attr(t) != null) {
352        delete_attribute_ref(node_attr(vlink(t)));
353        node_attr(vlink(t)) = node_attr(t);
354        attr_list_ref(node_attr(t)) += 1;
355    }
356    t = vlink(t);
357    set_disc_field(pre_break(t), pre);
358    set_disc_field(post_break(t), post);
359    set_disc_field(no_break(t), replace);
360    return t;
361}
362
363halfword insert_syllable_discretionary(halfword t, lang_variables * lan)
364{
365    halfword g, n;
366    n = new_node(disc_node, syllable_disc);
367    couple_nodes(n, vlink(t));
368    couple_nodes(t, n);
369    delete_attribute_ref(node_attr(n));
370    if (node_attr(t) != null) {
371        node_attr(n) = node_attr(t);
372        attr_list_ref(node_attr(t))++;
373    } else {
374        node_attr(n) = null;
375    }
376    if (lan->pre_hyphen_char > 0) {
377        g = raw_glyph_node();
378        set_to_character(g);
379        character(g) = lan->pre_hyphen_char;
380        font(g) = font(t);
381        lang_data(g) = lang_data(t);
382        if (node_attr(t) != null) {
383            node_attr(g) = node_attr(t);
384            attr_list_ref(node_attr(t))++;
385        }
386        set_disc_field(pre_break(n), g);
387    }
388
389    if (lan->post_hyphen_char > 0) {
390        t = vlink(n);
391        g = raw_glyph_node();
392        set_to_character(g);
393        character(g) = lan->post_hyphen_char;
394        font(g) = font(t);
395        lang_data(g) = lang_data(t);
396        if (node_attr(t) != null) {
397            node_attr(g) = node_attr(t);
398            attr_list_ref(node_attr(t)) += 1;
399        }
400        set_disc_field(post_break(n), g);
401    }
402    return n;
403}
404
405halfword insert_word_discretionary(halfword t, lang_variables * lan)
406{
407    halfword pre = null, pos = null;
408    if (lan->pre_exhyphen_char > 0)
409        pre = insert_character(null, lan->pre_exhyphen_char);
410    if (lan->post_exhyphen_char > 0)
411        pos = insert_character(null, lan->post_exhyphen_char);
412    return insert_discretionary(t, pre, pos, null);
413}
414
415@ @c
416halfword compound_word_break(halfword t, int clang)
417{
418    int disc;
419    lang_variables langdata;
420    langdata.pre_exhyphen_char = get_pre_exhyphen_char(clang);
421    langdata.post_exhyphen_char = get_post_exhyphen_char(clang);
422    disc = insert_word_discretionary(t, &langdata);
423    return disc;
424}
425
426
427halfword insert_complex_discretionary(halfword t, lang_variables * lan,
428                                      halfword pre, halfword pos,
429                                      halfword replace)
430{
431    (void) lan;
432    return insert_discretionary(t, pre, pos, replace);
433}
434
435
436halfword insert_character(halfword t, int c)
437{
438    halfword p;
439    p = new_node(glyph_node, 0);
440    set_to_character(p);
441    character(p) = c;
442    if (t != null) {
443        couple_nodes(t, p);
444    }
445    return p;
446}
447
448@ @c
449void set_disc_field(halfword f, halfword t)
450{
451    if (t != null) {
452        couple_nodes(f, t);
453        tlink(f) = tail_of_list(t);
454    } else {
455        vlink(f) = null;
456        tlink(f) = null;
457    }
458}
459
460
461
462@ @c
463static char *hyphenation_exception(int exceptions, char *w)
464{
465    char *ret = NULL;
466    lua_State *L = Luas;
467    lua_checkstack(L, 2);
468    lua_rawgeti(L, LUA_REGISTRYINDEX, exceptions);
469    if (lua_istable(L, -1)) {   /* ?? */
470        lua_pushstring(L, w);   /* word table */
471        lua_rawget(L, -2);
472        if (lua_isstring(L, -1)) {
473            ret = xstrdup(lua_tostring(L, -1));
474        }
475        lua_pop(L, 2);
476    } else {
477        lua_pop(L, 1);
478    }
479    return ret;
480}
481
482
483@ @c
484char *exception_strings(struct tex_language *lang)
485{
486    const char *value;
487    size_t size = 0, current = 0;
488    size_t l = 0;
489    char *ret = NULL;
490    lua_State *L = Luas;
491    if (lang->exceptions == 0)
492        return NULL;
493    lua_checkstack(L, 2);
494    lua_rawgeti(L, LUA_REGISTRYINDEX, lang->exceptions);
495    if (lua_istable(L, -1)) {
496        /* iterate and join */
497        lua_pushnil(L);         /* first key */
498        while (lua_next(L, -2) != 0) {
499            value = lua_tolstring(L, -1, &l);
500            if (current + 2 + l > size) {
501                ret =
502                    xrealloc(ret,
503                             (unsigned) ((size + size / 5) + current + l +
504                                         1024));
505                size = (size + size / 5) + current + l + 1024;
506            }
507            *(ret + current) = ' ';
508            strcpy(ret + current + 1, value);
509            current += l + 1;
510            lua_pop(L, 1);
511        }
512    }
513    return ret;
514}
515
516
517@ The sequence from |wordstart| to |r| can contain only normal characters.
518It could be faster to modify a halfword pointer and return an integer.
519
520@c
521static halfword find_exception_part(unsigned int *j, unsigned int *uword, int len)
522{
523    halfword g = null, gg = null;
524    register unsigned i = *j;
525    i++;                        /* this puts uword[i] on the |{| */
526    while (i < (unsigned) len && uword[i + 1] != '}') {
527        if (g == null) {
528            gg = new_char(0, (int) uword[i + 1]);
529            g = gg;
530        } else {
531            halfword s = new_char(0, (int) uword[i + 1]);
532            couple_nodes(g, s);
533            g = vlink(g);
534        }
535        i++;
536    }
537    *j = ++i;
538    return gg;
539}
540
/* Count the characters of one braced part of an exception without building
   nodes.  |*j| is interpreted and advanced exactly as in
   |find_exception_part|. */

static int count_exception_part(unsigned int *j, unsigned int *uword, int len)
{
    int count = 0;
    unsigned pos;
    /* starting at |*j + 1| puts |uword[pos]| on the |{| */
    for (pos = *j + 1; pos < (unsigned) len && uword[pos + 1] != '}'; pos++) {
        count++;
    }
    *j = pos + 1;
    return count;
}
553
554
555@ @c
/* Help text handed to |tex_error| for the ``broken pattern'' errors raised
   in |do_exception|; the list is terminated by |NULL|. */
556static const char *PAT_ERROR[] = {
557    "Exception discretionaries should contain three pairs of braced items.",
558    "No intervening spaces are allowed.",
559    NULL
560};
561
/* Apply the exception |replacement| to the word that starts at node
   |wordstart|; |r| is the node at which the walk has to stop.

   The exception text is first converted into the code point array |uword|.
   For every position the code point that follows it decides what happens:
   a |'-'| inserts a syllable discretionary after the current simple glyph;
   a |'{'| starts three braced parts, of which the first two are turned
   into pre-break and post-break lists and the third only tells how many
   glyphs after |t| are moved into the replacement list; anything else just
   moves |t| one node forward.

   NOTE(review): after a ``broken pattern'' error the code carries on with
   the same pattern; confirm that this is intended. */
562static void do_exception(halfword wordstart, halfword r, char *replacement)
563{
564    unsigned i;
565    halfword t;
566    unsigned len;
567    int clang;
568    lang_variables langdata;
569    unsigned uword[MAX_WORD_LEN + 1] = { 0 };
570    utf2uni_strcpy(uword, replacement);
571    len = u_length(uword);
572    i = 0;
573    t = wordstart;
574    clang = char_lang(wordstart);
    /* only the two syllable hyphen characters of |langdata| are filled in */
575    langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
576    langdata.post_hyphen_char = get_post_hyphen_char(clang);
577
578    for (i = 0; i < len; i++) {
579        if (uword[i + 1] == '-') {      /* a hyphen follows */
            /* move on to a simple glyph, but never beyond |r| */
580            while (vlink(t) != r
581                   && (type(t) != glyph_node || !is_simple_character(t)))
582                t = vlink(t);
583            if (vlink(t) == r)
584                break;
585            insert_syllable_discretionary(t, &langdata);
586            t = vlink(t);       /* skip the new disc */
587        } else if (uword[i + 1] == '=') {
588            /* do nothing ? */
589            t = vlink(t);
590        } else if (uword[i + 1] == '{') {
591            halfword gg, hh, replace = null;
592            int repl;
            /* |gg| is the pre-break list, |hh| the post-break list, and
               |repl| the number of characters in the third part */
593            gg = find_exception_part(&i, uword, (int) len);
594            if (i == len || uword[i + 1] != '{') {
595                tex_error("broken pattern 1", PAT_ERROR);
596            }
597            hh = find_exception_part(&i, uword, (int) len);
598            if (i == len || uword[i + 1] != '{') {
599                tex_error("broken pattern 2", PAT_ERROR);
600            }
601            repl = count_exception_part(&i, uword, (int) len);
602            if (i == len) {
603                tex_error("broken pattern 3", PAT_ERROR);
604            }
605            /*i++;  *//* jump over the last right brace */
            /* NOTE(review): when the loop is left here, the lists |gg| and
               |hh| built above are not used any more; confirm whether they
               have to be flushed */
606            if (vlink(t) == r)
607                break;
608            if (repl > 0) {
                /* cut the next |repl| glyphs out of the main list; they
                   become the replacement list of the discretionary.
                   NOTE(review): |q| is advanced before |type(q)| is read,
                   so the |q != null| test covers the previous node only */
609                halfword q = t;
610                replace = vlink(q);
611                while (repl > 0 && q != null) {
612                    q = vlink(q);
613                    if (type(q) == glyph_node) {
614                        repl--;
615                    }
616                }
617                try_couple_nodes(t, vlink(q));
618                vlink(q) = null;
619            }
620            t = insert_discretionary(t, gg, hh, replace);
621            t = vlink(t);       /* skip the new disc */
622        } else {
623            t = vlink(t);
624        }
625    }
626}
627
628@ This is a documentation section from the pascal web file. It is not
629true any more, but I do not have time right now to rewrite it -- Taco
630
631When the line-breaking routine is unable to find a feasible sequence of
632breakpoints, it makes a second pass over the paragraph, attempting to
633hyphenate the hyphenatable words. The goal of hyphenation is to insert
634discretionary material into the paragraph so that there are more
635potential places to break.
636
637The general rules for hyphenation are somewhat complex and technical,
638because we want to be able to hyphenate words that are preceded or
639followed by punctuation marks, and because we want the rules to work
640for languages other than English. We also must contend with the fact
641that hyphens might radically alter the ligature and kerning structure
642of a word.
643
644A sequence of characters will be considered for hyphenation only if it
645belongs to a ``potentially hyphenatable part'' of the current paragraph.
646This is a sequence of nodes $p_0p_1\ldots p_m$ where $p_0$ is a glue node,
647$p_1\ldots p_{m-1}$ are either character or ligature or whatsit or
648implicit kern nodes, and $p_m$ is a glue or penalty or insertion or adjust
649or mark or whatsit or explicit kern node.  (Therefore hyphenation is
650disabled by boxes, math formulas, and discretionary nodes already inserted
651by the user.) The ligature nodes among $p_1\ldots p_{m-1}$ are effectively
652expanded into the original non-ligature characters; the kern nodes and
653whatsits are ignored. Each character |c| is now classified as either a
654nonletter (if |lc_code(c)=0|), a lowercase letter (if
655|lc_code(c)=c|), or an uppercase letter (otherwise); an uppercase letter
656is treated as if it were |lc_code(c)| for purposes of hyphenation. The
657characters generated by $p_1\ldots p_{m-1}$ may begin with nonletters; let
658$c_1$ be the first letter that is not in the middle of a ligature. Whatsit
659nodes preceding $c_1$ are ignored; a whatsit found after $c_1$ will be the
660terminating node $p_m$. All characters that do not have the same font as
661$c_1$ will be treated as nonletters. The |hyphen_char| for that font
662must be between 0 and 255, otherwise hyphenation will not be attempted.
663\TeX\ looks ahead for as many consecutive letters $c_1\ldots c_n$ as
664possible; however, |n| must be less than 64, so a character that would
665otherwise be $c_{64}$ is effectively not a letter. Furthermore $c_n$ must
666not be in the middle of a ligature.  In this way we obtain a string of
667letters $c_1\ldots c_n$ that are generated by nodes $p_a\ldots p_b$, where
668|1<=a<=b+1<=m|. If |n>=l_hyf+r_hyf|, this string qualifies for hyphenation;
669however, |uc_hyph| must be positive, if $c_1$ is uppercase.
670
671The hyphenation process takes place in three stages. First, the candidate
672sequence $c_1\ldots c_n$ is found; then potential positions for hyphens
673are determined by referring to hyphenation tables; and finally, the nodes
674$p_a\ldots p_b$ are replaced by a new sequence of nodes that includes the
675discretionary breaks found.
676
677Fortunately, we do not have to do all this calculation very often, because
678of the way it has been taken out of \TeX's inner loop. For example, when
679the second edition of the author's 700-page book {\sl Seminumerical
680Algorithms} was typeset by \TeX, only about 1.2 hyphenations needed to be
681@^Knuth, Donald Ervin@>
682tried per paragraph, since the line breaking algorithm needed to use two
683passes on only about 5 per cent of the paragraphs.
684
685
686When a word has been set up to contain a candidate for hyphenation,
687\TeX\ first looks to see if it is in the user's exception dictionary. If not,
688hyphens are inserted based on patterns that appear within the given word,
689using an algorithm due to Frank~M. Liang.
690@^Liang, Franklin Mark@>
691
692
693@ This is incompatible with \TeX\ because the first word of a paragraph
694can be hyphenated, but most European users seem to agree that
695prohibiting hyphenation there was not the best idea ever.
696
697@c
698static halfword find_next_wordstart(halfword r)
699{
700    register int l;
701    register int start_ok = 1;
702    int mathlevel = 1;
703    int chr ;
704    halfword t ;
705    while (r != null) {
706        switch (type(r)) {
707        case whatsit_node:
708            break;
709        case glue_node:
710            start_ok = 1;
711            break;
712        case math_node:
713            while (mathlevel > 0) {
714                r = vlink(r);
715                if (r == null)
716                    return r;
717                if (type(r) == math_node) {
718                    if (subtype(r) == before) {
719                        mathlevel++;
720                    } else {
721                        mathlevel--;
722                    }
723                }
724            }
725            break;
726        case glyph_node:
727            if (is_simple_character(r)) {
728                chr = character(r) ;
729                if (chr == ex_hyphen_char) {
730                    /* We only accept an explicit hyphen when there is a preceding glyph and  */
731                    /* we skip a sequence of explicit hyphens as that normally indicates a    */
732                    /* -- or --- ligature in which case we can in a worse case usage get bad  */
733                    /* node lists later on due to messed up ligature building as these dashes */
734                    /* ligatures in base fonts. This is a side effect of the separating the   */
735                    /* hyphenation, ligaturing and kerning steps. A test is cmr with ------.  */
736                    t = vlink(r) ;
737                    if ((start_ok > 0) && (t!=null) && (type(t) == glyph_node) && (character(t) != ex_hyphen_char)) {
738                        t = compound_word_break(r, char_lang(r));
739                        subtype(t) = automatic_disc;
740                        start_ok = 1 ;
741                    } else {
742                        start_ok = 0;
743                    }
744                } else if (start_ok && (l = get_lc_code(chr)) > 0) {
745                    if (char_uchyph(r) || l == chr) {
746                        return r;
747                    } else {
748                        start_ok = 0;
749                    }
750                }
751            }
752            break;
753        default:
754            start_ok = 0;
755            break;
756        }
757        r = vlink(r);
758    }
759    return r;
760}
761
762@ @c
763static int valid_wordend(halfword s)
764{
765    register halfword r = s;
766    register int clang = char_lang(s);
767    if (r == null)
768        return 1;
769    while ((r != null) && ((type(r) == glyph_node && is_simple_character(r)
770                            && clang == char_lang(r)) ||
771                           (type(r) == kern_node && (subtype(r) == normal))
772           )) {
773        r = vlink(r);
774    }
775    if (r == null || (type(r) == glyph_node && is_simple_character(r)
776                      && clang != char_lang(r)) || type(r) == glue_node
777        || type(r) == whatsit_node || type(r) == ins_node
778        || type(r) == adjust_node || type(r) == penalty_node
779        || (type(r) == kern_node
780            && (subtype(r) == explicit || subtype(r) == acc_kern)))
781        return 1;
782    return 0;
783}
784
785@ @c
/* Hyphenate the words in the node list that runs from |head| to |tail|.

   Words are located with |find_next_wordstart|.  For each word the
   lowercase codes of its glyphs (explicit hyphen characters are kept as
   they are) are appended to |utf8word|.  A word is processed when it ends
   properly according to |valid_wordend|, has at least |lhmin + rhmin|
   characters, has a nonzero |hyf_font| and belongs to a language that has
   a record.  The first of these that applies is then used: an entry in
   the exception table of the language, the handling of explicit hyphens,
   or the patterns of the language.

   A temporary penalty node is linked in after |tail| while the list is
   processed; it is flushed and the old link is restored at the end.

   NOTE(review): nothing in the collecting loop limits |wordlen| to
   |MAX_WORD_LEN|, which is what the size of |utf8word| is based on. */
786void hnj_hyphenation(halfword head, halfword tail)
787{
788    int lchar, i;
789    struct tex_language *lang;
790    lang_variables langdata;
791    char utf8word[(4 * MAX_WORD_LEN) + 1] = { 0 };
792    int wordlen = 0;
793    char *hy = utf8word;
794    char *replacement = NULL;
795    boolean explicit_hyphen = false;
796    halfword s, r = head, wordstart = null, save_tail1 = null, left =
797        null, right = null;
798
799    /* this first movement assures two things:
800     \item{a)} that we won't waste lots of time on something that has been
801      handled already (in that case, none of the glyphs match |simple_character|).
802     \item{b)} that the first word can be hyphenated. if the movement was
803     not explicit, then the indentation at the start of a paragraph
804     list would make |find_next_wordstart()| look too far ahead.
805     */
806
807    while (r != null && (type(r) != glyph_node || !is_simple_character(r))) {
808        r = vlink(r);
809    }
810    /* this will make |r| a glyph node with subtype character */
811    r = find_next_wordstart(r);
812    if (r == null)
813        return;
814
    /* link a temporary penalty after |tail|; the old successor is kept in
       |save_tail1| and put back at the end */
815    assert(tail != null);
816    save_tail1 = vlink(tail);
817    s = new_penalty(0);
818    couple_nodes(tail, s);
819
820    while (r != null) {         /* could be while(1), but let's be paranoid */
821        int clang, lhmin, rhmin;
822        halfword hyf_font;
823        halfword end_word = r;
824        wordstart = r;
825        assert(is_simple_character(wordstart));
826        hyf_font = font(wordstart);
827        if (hyphen_char(hyf_font) < 0)  /* for backward compat */
828            hyf_font = 0;
829        clang = char_lang(wordstart);
830        lhmin = char_lhmin(wordstart);
831        rhmin = char_rhmin(wordstart);
832        langdata.pre_hyphen_char = get_pre_hyphen_char(clang);
833        langdata.post_hyphen_char = get_post_hyphen_char(clang);
        /* collect the word: simple glyphs of the same language that have a
           lowercase code or are the explicit hyphen character */
834        while (r != null && type(r) == glyph_node && is_simple_character(r) && clang == char_lang(r) &&
835              (((lchar = get_lc_code(character(r))) > 0) || (character(r) == ex_hyphen_char && (lchar = ex_hyphen_char)))) {
836            if (character(r) == ex_hyphen_char)
837    	        explicit_hyphen = true;
838            wordlen++;
839            hy = uni2string(hy, (unsigned) lchar);
840            /* this should not be needed  any more */
841            /*if (vlink(r)!=null) alink(vlink(r))=r; */
842            end_word = r;
843            r = vlink(r);
844        }
845        if (valid_wordend(r) && wordlen >= lhmin + rhmin
846            && (hyf_font != 0) && clang >=0 && (lang = tex_languages[clang]) != NULL) {
847            *hy = 0;
848            if (lang->exceptions != 0 &&
849                (replacement =
850                 hyphenation_exception(lang->exceptions, utf8word)) != NULL) {
851#ifdef VERBOSE
852                fprintf(stderr, "replacing %s (c=%d) by %s\n", utf8word, clang,
853                        replacement);
854#endif
855                do_exception(wordstart, r, replacement);
856                free(replacement);
857            } else if (explicit_hyphen == true) {
858                /* insert an explicit discretionary after each of the last in a
859	           set of explicit hyphens */
860                halfword rr = r;
861                halfword t = null;
862#ifdef VERBOSE
863                fprintf(stderr, "explicit hyphen(s) found in %s (c=%d)\n", utf8word, clang);
864#endif
                /* walk backwards from the end of the word to its start */
865                while (rr != wordstart) {
866	            if (is_simple_character(rr)) {
867                        if (character(rr) == ex_hyphen_char) {
868                            t = compound_word_break(rr, clang);
869                            subtype(t) = automatic_disc;
870	                    while(character(alink(rr)) == ex_hyphen_char)
871	                       rr = alink(rr);
872	                    if (rr == wordstart)
873	                       break;
874                        }
875                    }
876                    rr = alink(rr);
877                }
878
879            } else if (lang->patterns != NULL) {
880
                /* |left| and |right| are found by stepping |lhmin| and
                   |rhmin| simple characters in from the two ends of the
                   word */
881                left = wordstart;
882                for (i = lhmin; i > 1; i--) {
883                    left = vlink(left);
884                    while (!is_simple_character(left))
885                        left = vlink(left);
886                }
887                right = r;
888                for (i = rhmin; i > 0; i--) {
889                    right = alink(right);
890                    while (!is_simple_character(right))
891                        right = alink(right);
892                }
893
894#ifdef VERBOSE
895                fprintf(stderr, "hyphenate %s (c=%d,l=%d,r=%d) from %c to %c\n",
896                        utf8word, clang, lhmin, rhmin, character(left),
897                        character(right));
898#endif
899                (void) hnj_hyphen_hyphenate(lang->patterns, wordstart, end_word,
900                                            wordlen, left, right, &langdata);
901            }
902        }
        /* reset the state that is kept per word */
903	explicit_hyphen = false;
904        wordlen = 0;
905        hy = utf8word;
906        if (r == null)
907            break;
908        r = find_next_wordstart(r);
909    }
    /* remove the temporary penalty and restore the old link of |tail| */
910    flush_node(vlink(tail));
911    vlink(tail) = save_tail1;
912}
913
914
915@ @c
916void new_hyphenation(halfword head, halfword tail)
917{
918    register int callback_id = 0;
919    if (head == null || vlink(head) == null)
920        return;
921    fix_node_list(head);
922    callback_id = callback_defined(hyphenate_callback);
923    if (callback_id > 0) {
924        lua_State *L = Luas;
925        if (!get_callback(L, callback_id)) {
926            lua_pop(L, 2);
927            return;
928        }
929        nodelist_to_lua(L, head);
930        nodelist_to_lua(L, tail);
931        if (lua_pcall(L, 2, 0, 0) != 0) {
932            fprintf(stdout, "error: %s\n", lua_tostring(L, -1));
933            lua_pop(L, 2);
934            lua_error(L);
935            return;
936        }
937        lua_pop(L, 1);
938    } else if (callback_id == 0) {
939        hnj_hyphenation(head, tail);
940    }
941}
942
943@ dumping and undumping languages
944
945@c
/* Dump the zero-terminated string |a| as a length (terminator included)
   followed by that many bytes; a |NULL| string is dumped as length 0 with
   no bytes. The macro uses an |int x| from the calling scope as scratch
   storage and leaves the dumped length in it. It is wrapped in
   |do|\dots|while (0)| so that it acts as a single statement, also in an
   unbraced |if|/|else|. */
#define dump_string(a)                          \
  do {                                          \
    if ((a) != NULL) {                          \
      x = (int) strlen(a) + 1;                  \
      dump_int(x);                              \
      dump_things(*(a), x);                     \
    } else {                                    \
      x = 0;                                    \
      dump_int(x);                              \
    }                                           \
  } while (0)
953
954
955static void dump_one_language(int i)
956{
957    char *s = NULL;
958    int x = 0;
959    struct tex_language *lang;
960    lang = tex_languages[i];
961    dump_int(lang->id);
962    dump_int(lang->pre_hyphen_char);
963    dump_int(lang->post_hyphen_char);
964    dump_int(lang->pre_exhyphen_char);
965    dump_int(lang->post_exhyphen_char);
966    if (lang->patterns != NULL) {
967        s = (char *) hnj_serialize(lang->patterns);
968    }
969    dump_string(s);
970    if (s != NULL) {
971        free(s);
972        s = NULL;
973    }
974    if (lang->exceptions != 0)
975        s = exception_strings(lang);
976    dump_string(s);
977    if (s != NULL) {
978        free(s);
979    }
980    free(lang);
981}
982
983void dump_language_data(void)
984{
985    int i;
986    dump_int(next_lang_id);
987    for (i = 0; i < next_lang_id; i++) {
988        if (tex_languages[i]) {
989            dump_int(1);
990            dump_one_language(i);
991        } else {
992            dump_int(0);
993        }
994    }
995}
996
997
998static void undump_one_language(int i)
999{
1000    char *s = NULL;
1001    int x = 0;
1002    struct tex_language *lang = get_language(i);
1003    undump_int(x);
1004    lang->id = x;
1005    undump_int(x);
1006    lang->pre_hyphen_char = x;
1007    undump_int(x);
1008    lang->post_hyphen_char = x;
1009    undump_int(x);
1010    lang->pre_exhyphen_char = x;
1011    undump_int(x);
1012    lang->post_exhyphen_char = x;
1013    /* patterns */
1014    undump_int(x);
1015    if (x > 0) {
1016        s = xmalloc((unsigned) x);
1017        undump_things(*s, x);
1018        load_patterns(lang, (unsigned char *) s);
1019        free(s);
1020    }
1021    /* exceptions */
1022    undump_int(x);
1023    if (x > 0) {
1024        s = xmalloc((unsigned) x);
1025        undump_things(*s, x);
1026        load_hyphenation(lang, (unsigned char *) s);
1027        free(s);
1028    }
1029}
1030
1031void undump_language_data(void)
1032{
1033    int i, x, numlangs;
1034    undump_int(numlangs);
1035    next_lang_id = numlangs;
1036    for (i = 0; i < numlangs; i++) {
1037        undump_int(x);
1038        if (x == 1) {
1039            undump_one_language(i);
1040        }
1041    }
1042}
1043
1044
1045@ When \TeX\ has scanned `\.{\\hyphenation}', it calls on a procedure named
1046|new_hyph_exceptions| to do the right thing.
1047
1048@c
1049void new_hyph_exceptions(void)
1050{                               /* enters new exceptions */
1051    (void) scan_toks(false, true);
1052    load_tex_hyphenation(int_par(language_code), def_ref);
1053    flush_list(def_ref);
1054}
1055
1056@ Similarly, when \TeX\ has scanned `\.{\\patterns}', it calls on a
1057procedure named |new_patterns|.
1058
1059@c
1060void new_patterns(void)
1061{                               /* initializes the hyphenation pattern data */
1062    (void) scan_toks(false, true);
1063    load_tex_patterns(int_par(language_code), def_ref);
1064    flush_list(def_ref);
1065}
1066
@ `\.{\\prehyphenchar}' sets the |pre_break| character and
`\.{\\posthyphenchar}' sets the |post_break| character. Their respective
defaults are the ASCII hyphen ("-") and zero (nul).
1070
1071@c
1072void new_pre_hyphen_char(void)
1073{
1074    scan_optional_equals();
1075    scan_int();
1076    set_pre_hyphen_char(int_par(language_code), cur_val);
1077}
1078
1079void new_post_hyphen_char(void)
1080{
1081    scan_optional_equals();
1082    scan_int();
1083    set_post_hyphen_char(int_par(language_code), cur_val);
1084}
1085
1086
@ `\.{\\preexhyphenchar}' sets the |pre_break| character and
`\.{\\postexhyphenchar}' sets the |post_break| character. Both
default to zero (nul).
1090
1091@c
1092void new_pre_exhyphen_char(void)
1093{
1094    scan_optional_equals();
1095    scan_int();
1096    set_pre_exhyphen_char(int_par(language_code), cur_val);
1097}
1098
1099void new_post_exhyphen_char(void)
1100{
1101    scan_optional_equals();
1102    scan_int();
1103    set_post_exhyphen_char(int_par(language_code), cur_val);
1104}
1105