1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
2  * licenses follows.
3  */
4 
5 /* LibHnj - a library for high quality hyphenation and justification
6  * Copyright (C) 1998 Raph Levien,
7  * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
8  *           (C) 2001 Peter Novodvorsky (nidd@cs.msu.su)
9  *           (C) 2006, 2007, 2008, 2010 László Németh (nemeth at OOo)
10  *
11  * This library is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU Library General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This library is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * Library General Public License for more details.
20  *
21  * You should have received a copy of the GNU Library General Public
22  * License along with this library; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA  02111-1307  USA.
25 */
26 
27 /*
28  * The contents of this file are subject to the Mozilla Public License
29  * Version 1.0 (the "MPL"); you may not use this file except in
30  * compliance with the MPL.  You may obtain a copy of the MPL at
31  * http://www.mozilla.org/MPL/
32  *
33  * Software distributed under the MPL is distributed on an "AS IS" basis,
34  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
35  * for the specific language governing rights and limitations under the
36  * MPL.
37  *
38  */
39 #include <stdlib.h> /* for NULL, malloc */
40 #include <stdio.h>  /* for fprintf */
41 #include <string.h> /* for strdup */
42 
43 #ifdef UNX
44 #include <unistd.h> /* for exit */
45 #endif
46 
47 #define noVERBOSE
48 
49 /* calculate hyphenmin values with long ligature length (2 or 3 characters
50  * instead of 1 or 2) for comparison with hyphenation without ligatures */
51 #define noLONG_LIGATURE
52 
53 #ifdef LONG_LIGATURE
54 #define LIG_xx	1
55 #define LIG_xxx	2
56 #else
57 #define LIG_xx	0
58 #define LIG_xxx	1
59 #endif
60 
61 #include "hnjalloc.h"
62 #include "hyphen.h"
63 
64 static char *
hnj_strdup(const char * s)65 hnj_strdup (const char *s)
66 {
67   char *newstr;
68   int l;
69 
70   l = strlen (s);
71   newstr = (char *) hnj_malloc (l + 1);
72   memcpy (newstr, s, l);
73   newstr[l] = 0;
74   return newstr;
75 }
76 
77 /* remove cross-platform text line end characters */
hnj_strchomp(char * s)78 void hnj_strchomp(char * s)
79 {
80   int k = strlen(s);
81   if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
82   if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
83 }
84 
85 /* a little bit of a hash table implementation. This simply maps strings
86    to state numbers */
87 
88 typedef struct _HashTab HashTab;
89 typedef struct _HashEntry HashEntry;
90 
91 /* A cheap, but effective, hack. */
92 #define HASH_SIZE 31627
93 
94 struct _HashTab {
95   HashEntry *entries[HASH_SIZE];
96 };
97 
98 struct _HashEntry {
99   HashEntry *next;
100   char *key;
101   int val;
102 };
103 
104 /* a char* hash function from ASU - adapted from Gtk+ */
105 static unsigned int
hnj_string_hash(const char * s)106 hnj_string_hash (const char *s)
107 {
108   const char *p;
109   unsigned int h=0, g;
110   for(p = s; *p != '\0'; p += 1) {
111     h = ( h << 4 ) + *p;
112     if ( ( g = h & 0xf0000000 ) ) {
113       h = h ^ (g >> 24);
114       h = h ^ g;
115     }
116   }
117   return h /* % M */;
118 }
119 
120 static HashTab *
hnj_hash_new(void)121 hnj_hash_new (void)
122 {
123   HashTab *hashtab;
124   int i;
125 
126   hashtab = (HashTab *) hnj_malloc (sizeof(HashTab));
127   for (i = 0; i < HASH_SIZE; i++)
128     hashtab->entries[i] = NULL;
129 
130   return hashtab;
131 }
132 
133 static void
hnj_hash_free(HashTab * hashtab)134 hnj_hash_free (HashTab *hashtab)
135 {
136   int i;
137   HashEntry *e, *next;
138 
139   for (i = 0; i < HASH_SIZE; i++)
140     for (e = hashtab->entries[i]; e; e = next)
141       {
142 	next = e->next;
143 	hnj_free (e->key);
144 	hnj_free (e);
145       }
146 
147   hnj_free (hashtab);
148 }
149 
150 /* assumes that key is not already present! */
151 static void
hnj_hash_insert(HashTab * hashtab,const char * key,int val)152 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
153 {
154   int i;
155   HashEntry *e;
156 
157   i = hnj_string_hash (key) % HASH_SIZE;
158   e = (HashEntry *) hnj_malloc (sizeof(HashEntry));
159   e->next = hashtab->entries[i];
160   e->key = hnj_strdup (key);
161   e->val = val;
162   hashtab->entries[i] = e;
163 }
164 
165 /* return val if found, otherwise -1 */
166 static int
hnj_hash_lookup(HashTab * hashtab,const char * key)167 hnj_hash_lookup (HashTab *hashtab, const char *key)
168 {
169   int i;
170   HashEntry *e;
171   i = hnj_string_hash (key) % HASH_SIZE;
172   for (e = hashtab->entries[i]; e; e = e->next)
173     if (!strcmp (key, e->key))
174       return e->val;
175   return -1;
176 }
177 
178 /* Get the state number, allocating a new state if necessary. */
179 static int
hnj_get_state(HyphenDict * dict,HashTab * hashtab,const char * string)180 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
181 {
182   int state_num;
183 
184   state_num = hnj_hash_lookup (hashtab, string);
185 
186   if (state_num >= 0)
187     return state_num;
188 
189   hnj_hash_insert (hashtab, string, dict->num_states);
190   /* predicate is true if dict->num_states is a power of two */
191   if (!(dict->num_states & (dict->num_states - 1)))
192     {
193       dict->states = (HyphenState *) hnj_realloc (dict->states,
194 				  (dict->num_states << 1) *
195 				  sizeof(HyphenState));
196     }
197   dict->states[dict->num_states].match = NULL;
198   dict->states[dict->num_states].repl = NULL;
199   dict->states[dict->num_states].fallback_state = -1;
200   dict->states[dict->num_states].num_trans = 0;
201   dict->states[dict->num_states].trans = NULL;
202   return dict->num_states++;
203 }
204 
205 /* add a transition from state1 to state2 through ch - assumes that the
206    transition does not already exist */
207 static void
hnj_add_trans(HyphenDict * dict,int state1,int state2,char ch)208 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
209 {
210   int num_trans;
211 
212   num_trans = dict->states[state1].num_trans;
213   if (num_trans == 0)
214     {
215       dict->states[state1].trans = (HyphenTrans *) hnj_malloc (sizeof(HyphenTrans));
216     }
217   else if (!(num_trans & (num_trans - 1)))
218     {
219       dict->states[state1].trans = (HyphenTrans *) hnj_realloc (dict->states[state1].trans,
220 						(num_trans << 1) *
221 						sizeof(HyphenTrans));
222     }
223   dict->states[state1].trans[num_trans].ch = ch;
224   dict->states[state1].trans[num_trans].new_state = state2;
225   dict->states[state1].num_trans++;
226 }
227 
228 #ifdef VERBOSE
229 HashTab *global[1];
230 
231 static char *
get_state_str(int state,int level)232 get_state_str (int state, int level)
233 {
234   int i;
235   HashEntry *e;
236 
237   for (i = 0; i < HASH_SIZE; i++)
238     for (e = global[level]->entries[i]; e; e = e->next)
239       if (e->val == state)
240 	return e->key;
241   return NULL;
242 }
243 #endif
244 
hnj_hyphen_load_line(char * buf,HyphenDict * dict,HashTab * hashtab)245 void hnj_hyphen_load_line(char * buf, HyphenDict * dict, HashTab * hashtab) {
246   int i, j;
247   char word[MAX_CHARS];
248   char pattern[MAX_CHARS];
249   char * repl;
250   signed char replindex;
251   signed char replcut;
252   int state_num = 0;
253   int last_state;
254   char ch;
255   int found;
256 
257 	  if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
258 	    dict->lhmin = atoi(buf + 13);
259 	    return;
260 	  } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
261 	    dict->rhmin = atoi(buf + 14);
262 	    return;
263 	  } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
264 	    dict->clhmin = atoi(buf + 21);
265 	    return;
266 	  } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
267 	    dict->crhmin = atoi(buf + 22);
268 	    return;
269 	  } else if (strncmp(buf, "NOHYPHEN", 8) == 0) {
270 	    char * space = buf + 8;
271 	    while (*space != '\0' && (*space == ' ' || *space == '\t')) space++;
272 	    if (*buf != '\0') dict->nohyphen = hnj_strdup(space);
273 	    if (dict->nohyphen) {
274 	        char * nhe = dict->nohyphen + strlen(dict->nohyphen) - 1;
275 	        *nhe = 0;
276 	        for (nhe = nhe - 1; nhe > dict->nohyphen; nhe--) {
277 	                if (*nhe == ',') {
278 	                    dict->nohyphenl++;
279 	                    *nhe = 0;
280 	                }
281 	        }
282 	    }
283 	    return;
284 	  }
285 	  j = 0;
286 	  pattern[j] = '0';
287           repl = strchr(buf, '/');
288           replindex = 0;
289           replcut = 0;
290           if (repl) {
291             char * index = strchr(repl + 1, ',');
292             *repl = '\0';
293             if (index) {
294                 char * index2 = strchr(index + 1, ',');
295                 *index = '\0';
296                 if (index2) {
297                     *index2 = '\0';
298                     replindex = (signed char) atoi(index + 1) - 1;
299                     replcut = (signed char) atoi(index2 + 1);
300                 }
301             } else {
302                 hnj_strchomp(repl + 1);
303                 replindex = 0;
304                 replcut = (signed char) strlen(buf);
305             }
306             repl = hnj_strdup(repl + 1);
307           }
308 	  for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
309 	    {
310 	      if (buf[i] >= '0' && buf[i] <= '9')
311 		pattern[j] = buf[i];
312 	      else
313 		{
314 		  word[j] = buf[i];
315 		  pattern[++j] = '0';
316 		}
317 	    }
318 	  word[j] = '\0';
319 	  pattern[j + 1] = '\0';
320 
321           i = 0;
322 	  if (!repl) {
323 	    /* Optimize away leading zeroes */
324             for (; pattern[i] == '0'; i++);
325           } else {
326             if (*word == '.') i++;
327             /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
328             if (dict->utf8) {
329                 int pu = -1;        /* unicode character position */
330                 int ps = -1;        /* unicode start position (original replindex) */
331                 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
332                 for (; pc < (strlen(word) + 1); pc++) {
333                 /* beginning of an UTF-8 character (not '10' start bits) */
334                     if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
335                     if ((ps < 0) && (replindex == pu)) {
336                         ps = replindex;
337                         replindex = (signed char) pc;
338                     }
339                     if ((ps >= 0) && ((pu - ps) == replcut)) {
340                         replcut = (signed char) (pc - replindex);
341                         break;
342                     }
343                 }
344                 if (*word == '.') replindex--;
345             }
346           }
347 
348 #ifdef VERBOSE
349 	  printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
350 #endif
351 	  found = hnj_hash_lookup (hashtab, word);
352 	  state_num = hnj_get_state (dict, hashtab, word);
353 	  dict->states[state_num].match = hnj_strdup (pattern + i);
354 	  dict->states[state_num].repl = repl;
355 	  dict->states[state_num].replindex = replindex;
356           if (!replcut) {
357             dict->states[state_num].replcut = (signed char) strlen(word);
358           } else {
359             dict->states[state_num].replcut = replcut;
360           }
361 
362 	  /* now, put in the prefix transitions */
363           for (; found < 0 && j > 0; --j)
364 	    {
365 	      last_state = state_num;
366 	      ch = word[j - 1];
367 	      word[j - 1] = '\0';
368 	      found = hnj_hash_lookup (hashtab, word);
369 	      state_num = hnj_get_state (dict, hashtab, word);
370 	      hnj_add_trans (dict, state_num, last_state, ch);
371 	    }
372 }
373 
374 HyphenDict *
hnj_hyphen_load(const char * fn)375 hnj_hyphen_load (const char *fn)
376 {
377   HyphenDict *result;
378   FILE *f;
379   f = fopen (fn, "r");
380   if (f == NULL)
381     return NULL;
382 
383   result = hnj_hyphen_load_file(f);
384 
385   fclose(f);
386   return result;
387 }
388 
389 HyphenDict *
hnj_hyphen_load_file(FILE * f)390 hnj_hyphen_load_file (FILE *f)
391 {
392   HyphenDict *dict[2];
393   HashTab *hashtab;
394   char buf[MAX_CHARS];
395   int nextlevel = 0;
396   int i, j, k;
397   HashEntry *e;
398   int state_num = 0;
399 // loading one or two dictionaries (separated by NEXTLEVEL keyword)
400 for (k = 0; k < 2; k++) {
401   hashtab = hnj_hash_new ();
402 #ifdef VERBOSE
403   global[k] = hashtab;
404 #endif
405   hnj_hash_insert (hashtab, "", 0);
406   dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict));
407   dict[k]->num_states = 1;
408   dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState));
409   dict[k]->states[0].match = NULL;
410   dict[k]->states[0].repl = NULL;
411   dict[k]->states[0].fallback_state = -1;
412   dict[k]->states[0].num_trans = 0;
413   dict[k]->states[0].trans = NULL;
414   dict[k]->nextlevel = NULL;
415   dict[k]->lhmin = 0;
416   dict[k]->rhmin = 0;
417   dict[k]->clhmin = 0;
418   dict[k]->crhmin = 0;
419   dict[k]->nohyphen = NULL;
420   dict[k]->nohyphenl = 0;
421 
422   /* read in character set info */
423   if (k == 0) {
424     for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
425     if (fgets(dict[k]->cset,  sizeof(dict[k]->cset),f) != NULL) {
426       for (i=0;i<MAX_NAME;i++)
427         if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
428           dict[k]->cset[i] = 0;
429     } else {
430       dict[k]->cset[0] = 0;
431     }
432     dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
433   } else {
434     strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
435     dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
436     dict[k]->utf8 = dict[0]->utf8;
437   }
438 
439   if (k == 0 || nextlevel) {
440     while (fgets (buf, sizeof(buf), f) != NULL) {
441       if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
442 	nextlevel = 1;
443 	break;
444       } else if (buf[0] != '%') hnj_hyphen_load_line(buf, dict[k], hashtab);
445     }
446   } else if (k == 1) {
447     /* default first level: hyphen and ASCII apostrophe */
448     if (!dict[0]->utf8) hnj_hyphen_load_line("NOHYPHEN ',-\n", dict[k], hashtab);
449     else hnj_hyphen_load_line("NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);
450     strncpy(buf, "1-1\n", MAX_CHARS-1); // buf rewritten by hnj_hyphen_load here
451     buf[MAX_CHARS-1] = '\0';
452     hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
453     hnj_hyphen_load_line("1'1\n", dict[k], hashtab); /* ASCII apostrophe */
454     if (dict[0]->utf8) {
455       hnj_hyphen_load_line("1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
456       hnj_hyphen_load_line("1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
457     }
458   }
459 
460   /* Could do unioning of matches here (instead of the preprocessor script).
461      If we did, the pseudocode would look something like this:
462 
463      foreach state in the hash table
464         foreach i = [1..length(state) - 1]
465            state to check is substr (state, i)
466            look it up
467            if found, and if there is a match, union the match in.
468 
469      It's also possible to avoid the quadratic blowup by doing the
470      search in order of increasing state string sizes - then you
471      can break the loop after finding the first match.
472 
473      This step should be optional in any case - if there is a
474      preprocessed rule table, it's always faster to use that.
475 
476 */
477 
478   /* put in the fallback states */
479   for (i = 0; i < HASH_SIZE; i++)
480     for (e = hashtab->entries[i]; e; e = e->next)
481       {
482 	if (*(e->key)) for (j = 1; 1; j++)
483 	  {
484 	    state_num = hnj_hash_lookup (hashtab, e->key + j);
485 	    if (state_num >= 0)
486 	      break;
487 	  }
488         /* KBH: FIXME state 0 fallback_state should always be -1? */
489 	if (e->val)
490 	  dict[k]->states[e->val].fallback_state = state_num;
491       }
492 #ifdef VERBOSE
493   for (i = 0; i < HASH_SIZE; i++)
494     for (e = hashtab->entries[i]; e; e = e->next)
495       {
496 	printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
497 		dict[k]->states[e->val].fallback_state);
498 	for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
499 	  printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
500 		  dict[k]->states[e->val].trans[j].new_state);
501       }
502 #endif
503 
504 #ifndef VERBOSE
505   hnj_hash_free (hashtab);
506 #endif
507   state_num = 0;
508 }
509   if (nextlevel) dict[0]->nextlevel = dict[1];
510   else {
511     dict[1] -> nextlevel = dict[0];
512     dict[1]->lhmin = dict[0]->lhmin;
513     dict[1]->rhmin = dict[0]->rhmin;
514     dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
515     dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
516 #ifdef VERBOSE
517     HashTab *r = global[0];
518     global[0] = global[1];
519     global[1] = r;
520 #endif
521     return dict[1];
522   }
523   return dict[0];
524 }
525 
hnj_hyphen_free(HyphenDict * dict)526 void hnj_hyphen_free (HyphenDict *dict)
527 {
528   int state_num;
529   HyphenState *hstate;
530 
531   for (state_num = 0; state_num < dict->num_states; state_num++)
532     {
533       hstate = &dict->states[state_num];
534       if (hstate->match)
535 	hnj_free (hstate->match);
536       if (hstate->repl)
537 	hnj_free (hstate->repl);
538       if (hstate->trans)
539 	hnj_free (hstate->trans);
540     }
541   if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
542 
543   if (dict->nohyphen) hnj_free(dict->nohyphen);
544 
545   hnj_free (dict->states);
546 
547   hnj_free (dict);
548 }
549 
550 #define MAX_WORD 256
551 
hnj_hyphen_hyphenate(HyphenDict * dict,const char * word,int word_size,char * hyphens)552 int hnj_hyphen_hyphenate (HyphenDict *dict,
553 			   const char *word, int word_size,
554 			   char *hyphens)
555 {
556   char *prep_word;
557   int i, j, k;
558   int state;
559   char ch;
560   HyphenState *hstate;
561   char *match;
562   int offset;
563 
564   prep_word = (char*) hnj_malloc (word_size + 3);
565 
566   j = 0;
567   prep_word[j++] = '.';
568 
569   for (i = 0; i < word_size; i++) {
570     if (word[i] <= '9' && word[i] >= '0') {
571       prep_word[j++] = '.';
572     } else {
573       prep_word[j++] = word[i];
574     }
575   }
576 
577   prep_word[j++] = '.';
578   prep_word[j] = '\0';
579 
580   for (i = 0; i < word_size + 5; i++)
581     hyphens[i] = '0';
582 
583 #ifdef VERBOSE
584   printf ("prep_word = %s\n", prep_word);
585 #endif
586 
587   /* now, run the finite state machine */
588   state = 0;
589   for (i = 0; i < j; i++)
590     {
591       ch = prep_word[i];
592       for (;;)
593 	{
594 
595 	  if (state == -1) {
596             /* return 1; */
597 	    /*  KBH: FIXME shouldn't this be as follows? */
598             state = 0;
599             goto try_next_letter;
600           }
601 
602 #ifdef VERBOSE
603 	  char *state_str;
604 	  state_str = get_state_str (state, 0);
605 
606 	  for (k = 0; k < i - strlen (state_str); k++)
607 	    putchar (' ');
608 	  printf ("%s", state_str);
609 #endif
610 
611 	  hstate = &dict->states[state];
612 	  for (k = 0; k < hstate->num_trans; k++)
613 	    if (hstate->trans[k].ch == ch)
614 	      {
615 		state = hstate->trans[k].new_state;
616 		goto found_state;
617 	      }
618 	  state = hstate->fallback_state;
619 #ifdef VERBOSE
620 	  printf (" falling back, fallback_state %d\n", state);
621 #endif
622 	}
623     found_state:
624 #ifdef VERBOSE
625       printf ("found state %d\n",state);
626 #endif
627       /* Additional optimization is possible here - especially,
628 	 elimination of trailing zeroes from the match. Leading zeroes
629 	 have already been optimized. */
630       match = dict->states[state].match;
631       /* replacing rules not handled by hyphen_hyphenate() */
632       if (match && !dict->states[state].repl)
633 	{
634 	  offset = i + 1 - strlen (match);
635 #ifdef VERBOSE
636 	  for (k = 0; k < offset; k++)
637 	    putchar (' ');
638 	  printf ("%s\n", match);
639 #endif
640 	  /* This is a linear search because I tried a binary search and
641 	     found it to be just a teeny bit slower. */
642 	  for (k = 0; match[k]; k++)
643 	    if (hyphens[offset + k] < match[k])
644 	      hyphens[offset + k] = match[k];
645 	}
646 
647       /* KBH: we need this to make sure we keep looking in a word */
648       /* for patterns even if the current character is not known in state 0 */
649       /* since patterns for hyphenation may occur anywhere in the word */
650       try_next_letter: ;
651 
652     }
653 #ifdef VERBOSE
654   for (i = 0; i < j; i++)
655     putchar (hyphens[i]);
656   putchar ('\n');
657 #endif
658 
659   for (i = 0; i < j - 4; i++)
660 #if 0
661     if (hyphens[i + 1] & 1)
662       hyphens[i] = '-';
663 #else
664     hyphens[i] = hyphens[i + 1];
665 #endif
666   hyphens[0] = '0';
667   for (; i < word_size; i++)
668     hyphens[i] = '0';
669   hyphens[word_size] = '\0';
670 
671   hnj_free (prep_word);
672 
673   return 0;
674 }
675 
676 /* Unicode ligature length */
hnj_ligature(unsigned char c)677 int hnj_ligature(unsigned char c) {
678     switch (c) {
679         case 0x80:			/* ff */
680         case 0x81:			/* fi */
681         case 0x82: return LIG_xx;	/* fl */
682         case 0x83:			/* ffi */
683         case 0x84: return LIG_xxx;	/* ffl */
684         case 0x85:			/* long st */
685         case 0x86: return LIG_xx;	/* st */
686     }
687     return 0;
688 }
689 
690 /* character length of the first n byte of the input word */
hnj_hyphen_strnlen(const char * word,int n,int utf8)691 int hnj_hyphen_strnlen(const char * word, int n, int utf8)
692 {
693     int i = 0;
694     int j = 0;
695     while (j < n && word[j] != '\0') {
696       i++;
697       // Unicode ligature support
698       if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
699         i += hnj_ligature(word[j + 2]);
700       }
701       for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
702     }
703     return i;
704 }
705 
hnj_hyphen_lhmin(int utf8,const char * word,int word_size,char * hyphens,char *** rep,int ** pos,int ** cut,int lhmin)706 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
707 	char *** rep, int ** pos, int ** cut, int lhmin)
708 {
709     int i = 1, j;
710 
711     // Unicode ligature support
712     if (utf8 && ((unsigned char) word[0] == 0xEF) && ((unsigned char) word[1] == 0xAC))  {
713       i += hnj_ligature(word[2]);
714     }
715 
716     // ignore numbers
717     for (j = 0; word[j] <= '9' && word[j] >= '0'; j++) i--;
718 
719     for (j = 0; i < lhmin && word[j] != '\0'; i++) do {
720       // check length of the non-standard part
721       if (*rep && *pos && *cut && (*rep)[j]) {
722         char * rh = strchr((*rep)[j], '=');
723         if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
724           hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
725             free((*rep)[j]);
726             (*rep)[j] = NULL;
727             hyphens[j] = '0';
728           }
729        } else {
730          hyphens[j] = '0';
731        }
732        j++;
733 
734        // Unicode ligature support
735        if (utf8 && ((unsigned char) word[j] == 0xEF) && ((unsigned char) word[j + 1] == 0xAC))  {
736          i += hnj_ligature(word[j + 2]);
737        }
738     } while (utf8 && (word[j] & 0xc0) == 0x80);
739     return 0;
740 }
741 
hnj_hyphen_rhmin(int utf8,const char * word,int word_size,char * hyphens,char *** rep,int ** pos,int ** cut,int rhmin)742 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
743 	char *** rep, int ** pos, int ** cut, int rhmin)
744 {
745     int i = 0;
746     int j;
747 
748     // ignore numbers
749     for (j = word_size - 1; j > 0 && word[j] <= '9' && word[j] >= '0'; j--) i--;
750 
751     for (j = word_size - 1; i < rhmin && j > 0; j--) {
752       // check length of the non-standard part
753       if (*rep && *pos && *cut && (*rep)[j]) {
754         char * rh = strchr((*rep)[j], '=');
755         if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
756           hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
757             free((*rep)[j]);
758             (*rep)[j] = NULL;
759             hyphens[j] = '0';
760           }
761        } else {
762          hyphens[j] = '0';
763        }
764        if (!utf8 || (word[j] & 0xc0) == 0xc0 || (word[j] & 0x80) != 0x80) i++;
765     }
766     return 0;
767 }
768 
769 // recursive function for compound level hyphenation
hnj_hyphen_hyph_(HyphenDict * dict,const char * word,int word_size,char * hyphens,char *** rep,int ** pos,int ** cut,int clhmin,int crhmin,int lend,int rend)770 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
771     char * hyphens, char *** rep, int ** pos, int ** cut,
772     int clhmin, int crhmin, int lend, int rend)
773 {
774   char *prep_word;
775   int i, j, k;
776   int state;
777   char ch;
778   HyphenState *hstate;
779   char *match;
780   char *repl;
781   signed char replindex;
782   signed char replcut;
783   int offset;
784   int * matchlen;
785   int * matchindex;
786   char ** matchrepl;
787   int isrepl = 0;
788   int nHyphCount;
789 
790   size_t prep_word_size = word_size + 3;
791   prep_word = (char*) hnj_malloc (prep_word_size);
792   matchlen = (int*) hnj_malloc ((word_size + 3) * sizeof(int));
793   matchindex = (int*) hnj_malloc ((word_size + 3) * sizeof(int));
794   matchrepl = (char**) hnj_malloc ((word_size + 3) * sizeof(char *));
795 
796   j = 0;
797   prep_word[j++] = '.';
798 
799   for (i = 0; i < word_size; i++) {
800     if (word[i] <= '9' && word[i] >= '0') {
801       prep_word[j++] = '.';
802     } else {
803       prep_word[j++] = word[i];
804     }
805   }
806 
807 
808 
809   prep_word[j++] = '.';
810   prep_word[j] = '\0';
811 
812   for (i = 0; i < j; i++)
813     hyphens[i] = '0';
814 
815 #ifdef VERBOSE
816   printf ("prep_word = %s\n", prep_word);
817 #endif
818 
819   /* now, run the finite state machine */
820   state = 0;
821   for (i = 0; i < j; i++)
822     {
823       ch = prep_word[i];
824       for (;;)
825 	{
826 
827 	  if (state == -1) {
828             /* return 1; */
829 	    /*  KBH: FIXME shouldn't this be as follows? */
830             state = 0;
831             goto try_next_letter;
832           }
833 
834 #ifdef VERBOSE
835 	  char *state_str;
836 	  state_str = get_state_str (state, 1);
837 
838 	  for (k = 0; k < i - strlen (state_str); k++)
839 	    putchar (' ');
840 	  printf ("%s", state_str);
841 #endif
842 
843 	  hstate = &dict->states[state];
844 	  for (k = 0; k < hstate->num_trans; k++)
845 	    if (hstate->trans[k].ch == ch)
846 	      {
847 		state = hstate->trans[k].new_state;
848 		goto found_state;
849 	      }
850 	  state = hstate->fallback_state;
851 #ifdef VERBOSE
852 	  printf (" falling back, fallback_state %d\n", state);
853 #endif
854 	}
855     found_state:
856 #ifdef VERBOSE
857       printf ("found state %d\n",state);
858 #endif
859       /* Additional optimization is possible here - especially,
860 	 elimination of trailing zeroes from the match. Leading zeroes
861 	 have already been optimized. */
862       match = dict->states[state].match;
863       repl = dict->states[state].repl;
864       replindex = dict->states[state].replindex;
865       replcut = dict->states[state].replcut;
866       /* replacing rules not handled by hyphen_hyphenate() */
867       if (match)
868 	{
869 	  offset = i + 1 - strlen (match);
870 #ifdef VERBOSE
871 	  for (k = 0; k < offset; k++)
872 	    putchar (' ');
873 	  printf ("%s (%s)\n", match, repl);
874 #endif
875           if (repl) {
876             if (!isrepl) for(; isrepl < word_size; isrepl++) {
877                 matchrepl[isrepl] = NULL;
878                 matchindex[isrepl] = -1;
879             }
880             matchlen[offset + replindex] = replcut;
881           }
882 	  /* This is a linear search because I tried a binary search and
883 	     found it to be just a teeny bit slower. */
884 	  for (k = 0; match[k]; k++) {
885 	    if ((hyphens[offset + k] < match[k])) {
886 	      hyphens[offset + k] = match[k];
887               if (match[k]&1) {
888                 matchrepl[offset + k] = repl;
889                 if (repl && (k >= replindex) && (k <= replindex + replcut)) {
890                     matchindex[offset + replindex] = offset + k;
891                 }
892               }
893             }
894           }
895 
896 	}
897 
898       /* KBH: we need this to make sure we keep looking in a word */
899       /* for patterns even if the current character is not known in state 0 */
900       /* since patterns for hyphenation may occur anywhere in the word */
901       try_next_letter: ;
902 
903     }
904 #ifdef VERBOSE
905   for (i = 0; i < j; i++)
906     putchar (hyphens[i]);
907   putchar ('\n');
908 #endif
909 
910   for (i = 0; i < j - 3; i++)
911 #if 0
912     if (hyphens[i + 1] & 1)
913       hyphens[i] = '-';
914 #else
915     hyphens[i] = hyphens[i + 1];
916 #endif
917   for (; i < word_size; i++)
918     hyphens[i] = '0';
919   hyphens[word_size] = '\0';
920 
921        /* now create a new char string showing hyphenation positions */
922        /* count the hyphens and allocate space for the new hyphenated string */
923        nHyphCount = 0;
924        for (i = 0; i < word_size; i++)
925           if (hyphens[i]&1)
926              nHyphCount++;
927        j = 0;
928        for (i = 0; i < word_size; i++) {
929            if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
930                 if (rep && pos && cut) {
931                     if (!*rep)
932                         *rep = (char **) calloc(word_size, sizeof(char *));
933                     if (!*pos)
934                         *pos = (int *) calloc(word_size, sizeof(int));
935                     if (!*cut) {
936                         *cut = (int *) calloc(word_size, sizeof(int));
937                     }
938                     (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
939                     (*pos)[matchindex[i] - 1] = matchindex[i] - i;
940                     (*cut)[matchindex[i] - 1] = matchlen[i];
941                 }
942                 j += strlen(matchrepl[matchindex[i]]);
943                 i += matchlen[i] - 1;
944           }
945        }
946 
947   hnj_free (matchrepl);
948   hnj_free (matchlen);
949   hnj_free (matchindex);
950 
951   // recursive hyphenation of the first (compound) level segments
952   if (dict->nextlevel) {
953      char ** rep2;
954      int * pos2;
955      int * cut2;
956      char * hyphens2;
957      int begin = 0;
958 
959      rep2 = (char**) hnj_malloc (word_size * sizeof(char *));
960      pos2 = (int*) hnj_malloc (word_size * sizeof(int));
961      cut2 = (int*) hnj_malloc (word_size * sizeof(int));
962      hyphens2 = (char*) hnj_malloc (word_size + 3);
963      for (i = 0; i < word_size; i++) rep2[i] = NULL;
964      for (i = 0; i < word_size; i++) if
965         (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
966         if (i - begin > 1) {
967             int hyph = 0;
968             prep_word[i + 2] = '\0';
969             /* non-standard hyphenation at compound boundary (Schiffahrt) */
970             if (rep && *rep && *pos && *cut && (*rep)[i]) {
971                 char * l = strchr((*rep)[i], '=');
972                 size_t offset = 2 + i - (*pos)[i];
973                 strncpy(prep_word + offset, (*rep)[i], prep_word_size - offset - 1);
974                 prep_word[prep_word_size - 1] = '\0';
975                 if (l) {
976                     hyph = (l - (*rep)[i]) - (*pos)[i];
977                     prep_word[2 + i + hyph] = '\0';
978                 }
979             }
980             hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
981                 hyphens2, &rep2, &pos2, &cut2, clhmin,
982                 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
983             for (j = 0; j < i - begin - 1; j++) {
984                 hyphens[begin + j] = hyphens2[j];
985                 if (rep2[j] && rep && pos && cut) {
986                     if (!*rep && !*pos && !*cut) {
987                         int k;
988                         *rep = (char **) malloc(sizeof(char *) * word_size);
989                         *pos = (int *) malloc(sizeof(int) * word_size);
990                         *cut = (int *) malloc(sizeof(int) * word_size);
991                         for (k = 0; k < word_size; k++) {
992                             (*rep)[k] = NULL;
993                             (*pos)[k] = 0;
994                             (*cut)[k] = 0;
995                         }
996                     }
997                     (*rep)[begin + j] = rep2[j];
998                     (*pos)[begin + j] = pos2[j];
999                     (*cut)[begin + j] = cut2[j];
1000                 }
1001             }
1002             prep_word[i + 2] = word[i + 1];
1003             if (*rep && *pos && *cut && (*rep)[i]) {
1004                 size_t offset = 1;
1005                 strncpy(prep_word + offset, word, prep_word_size - offset - 1);
1006                 prep_word[prep_word_size - 1] = '\0';
1007             }
1008         }
1009         begin = i + 1;
1010         for (j = 0; j < word_size; j++) rep2[j] = NULL;
1011      }
1012 
1013      // non-compound
1014      if (begin == 0) {
1015         hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
1016             hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
1017         if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1018             rep, pos, cut, clhmin);
1019         if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1020             rep, pos, cut, crhmin);
1021      }
1022 
1023      free(rep2);
1024      free(cut2);
1025      free(pos2);
1026      free(hyphens2);
1027   }
1028 
1029   hnj_free (prep_word);
1030   return 0;
1031 }
1032 
1033 /* UTF-8 normalization of hyphen and non-standard positions */
hnj_hyphen_norm(const char * word,int word_size,char * hyphens,char *** rep,int ** pos,int ** cut)1034 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
1035 	char *** rep, int ** pos, int ** cut)
1036 {
1037   int i, j, k;
1038   if ((((unsigned char) word[0]) >> 6) == 2) {
1039     fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
1040     return 1;
1041   }
1042 
1043   /* calculate UTF-8 character positions */
1044   for (i = 0, j = -1; i < word_size; i++) {
1045     /* beginning of an UTF-8 character (not '10' start bits) */
1046     if ((((unsigned char) word[i]) >> 6) != 2) j++;
1047     hyphens[j] = hyphens[i];
1048     if (rep && pos && cut && *rep && *pos && *cut) {
1049         int l = (*pos)[i];
1050         (*pos)[j] = 0;
1051         for (k = 0; k < l; k++) {
1052             if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
1053         }
1054         k = i - l + 1;
1055         l = k + (*cut)[i];
1056         (*cut)[j] = 0;
1057         for (; k < l; k++) {
1058             if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
1059         }
1060         (*rep)[j] = (*rep)[i];
1061         if (j < i) {
1062             (*rep)[i] = NULL;
1063             (*pos)[i] = 0;
1064             (*cut)[i] = 0;
1065         }
1066     }
1067   }
1068   hyphens[j + 1] = '\0';
1069 #ifdef VERBOSE
1070   printf ("nums: %s\n", hyphens);
1071 #endif
1072   return 0;
1073 }
1074 
1075 /* get the word with all possible hyphenations (output: hyphword) */
hnj_hyphen_hyphword(const char * word,int l,const char * hyphens,char * hyphword,char *** rep,int ** pos,int ** cut)1076 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
1077     char * hyphword, char *** rep, int ** pos, int ** cut)
1078 {
1079   int hyphenslen = l + 5;
1080 
1081   int i, j;
1082   for (i = 0, j = 0; i < l; i++, j++) {
1083     if (hyphens[i]&1) {
1084       hyphword[j] = word[i];
1085       if (*rep && *pos && *cut && (*rep)[i]) {
1086         size_t offset = j - (*pos)[i] + 1;
1087         strncpy(hyphword + offset, (*rep)[i], hyphenslen - offset - 1);
1088         hyphword[hyphenslen-1] = '\0';
1089         j += strlen((*rep)[i]) - (*pos)[i];
1090         i += (*cut)[i] - (*pos)[i];
1091       } else hyphword[++j] = '=';
1092     } else hyphword[j] = word[i];
1093   }
1094   hyphword[j] = '\0';
1095 }
1096 
1097 
1098 /* main api function with default hyphenmin parameters */
hnj_hyphen_hyphenate2(HyphenDict * dict,const char * word,int word_size,char * hyphens,char * hyphword,char *** rep,int ** pos,int ** cut)1099 int hnj_hyphen_hyphenate2 (HyphenDict *dict,
1100 			   const char *word, int word_size, char * hyphens,
1101 			   char *hyphword, char *** rep, int ** pos, int ** cut)
1102 {
1103   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1104     dict->clhmin, dict->crhmin, 1, 1);
1105   hnj_hyphen_lhmin(dict->utf8, word, word_size,
1106     hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
1107   hnj_hyphen_rhmin(dict->utf8, word, word_size,
1108     hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
1109 
1110   /* nohyphen */
1111   if (dict->nohyphen) {
1112     char * nh = dict->nohyphen;
1113     int nhi;
1114     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1115         char * nhy = (char *) strstr(word, nh);
1116         while (nhy) {
1117             hyphens[nhy - word + strlen(nh) - 1] = '0';
1118             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = '0';
1119             nhy = (char *) strstr(nhy + 1, nh);
1120         }
1121         nh = nh + strlen(nh) + 1;
1122     }
1123   }
1124 
1125   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1126   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1127 #ifdef VERBOSE
1128   printf ("nums: %s\n", hyphens);
1129 #endif
1130   return 0;
1131 }
1132 
1133 /* previous main api function with hyphenmin parameters */
hnj_hyphen_hyphenate3(HyphenDict * dict,const char * word,int word_size,char * hyphens,char * hyphword,char *** rep,int ** pos,int ** cut,int lhmin,int rhmin,int clhmin,int crhmin)1134 int hnj_hyphen_hyphenate3 (HyphenDict *dict,
1135 	const char *word, int word_size, char * hyphens,
1136 	char *hyphword, char *** rep, int ** pos, int ** cut,
1137 	int lhmin, int rhmin, int clhmin, int crhmin)
1138 {
1139   lhmin = (lhmin > dict->lhmin) ? lhmin : dict->lhmin;
1140   rhmin = (rhmin > dict->rhmin) ? rhmin : dict->rhmin;
1141   clhmin = (clhmin > dict->clhmin) ? clhmin : dict->clhmin;
1142   crhmin = (crhmin > dict->crhmin) ? crhmin : dict->crhmin;
1143   hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
1144     clhmin, crhmin, 1, 1);
1145   hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
1146     rep, pos, cut, (lhmin > 0 ? lhmin : 2));
1147   hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
1148     rep, pos, cut, (rhmin > 0 ? rhmin : 2));
1149   if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
1150 
1151   /* nohyphen */
1152   if (dict->nohyphen) {
1153     char * nh = dict->nohyphen;
1154     int nhi;
1155     for (nhi = 0; nhi <= dict->nohyphenl; nhi++) {
1156         char * nhy = (char *) strstr(word, nh);
1157         while (nhy) {
1158             hyphens[nhy - word + strlen(nh) - 1] = 0;
1159             if (nhy - word  - 1 >= 0) hyphens[nhy - word - 1] = 0;
1160             nhy = (char *) strstr(nhy + 1, nh);
1161         }
1162         nh = nh + strlen(nh) + 1;
1163     }
1164   }
1165 
1166   if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
1167   return 0;
1168 }
1169