1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30
31 /* from OpenSolaris "n8.c 1.8 05/06/08 SMI" */
32
33 /*
34 * Portions Copyright (c) 2005 Gunnar Ritter, Freiburg i. Br., Germany
35 *
36 * Sccsid @(#)n8.c 1.44 (gritter) 9/26/10
37 */
38
39 /*
40 * University Copyright- Copyright (c) 1982, 1986, 1988
41 * The Regents of the University of California
42 * All Rights Reserved
43 *
44 * University Acknowledgment- Portions of this document are derived from
45 * software developed by the University of California, Berkeley, and its
46 * contributors.
47 */
48
49 #include <stddef.h>
50 #include <stdio.h>
51 #ifdef EUC
52 #include <wctype.h>
53 #endif
54 #include <ctype.h>
55 #include <stdlib.h>
56 #include <string.h>
57 #include "tdef.h"
58 #include "ext.h"
59 #include "pt.h"
60 #include "libhnj/hyphen.h"
61 #define HY_BIT 0200 /* generic stuff in here only works for ascii */
62 #define HY_BIT2 0x80000000
63
64 /*
65 * troff8.c
66 *
67 * hyphenation
68 */
69
70 int *hbuf;
71 int NHEX;
72 int *nexth;
73 tchar *hyend;
74 #define THRESH 160 /*digram goodness threshold*/
75 int thresh = THRESH;
76
77 static void hyphenhnj(void);
78
79 static int *
growhbuf(int ** pp)80 growhbuf(int **pp)
81 {
82 int *nhbuf;
83 int inc = 4;
84 ptrdiff_t j;
85
86 if ((nhbuf = realloc(hbuf, (NHEX+inc) * sizeof *hbuf)) == NULL)
87 return NULL;
88 NHEX += inc;
89 j = (char *)nhbuf - (char *)hbuf;
90 nexth = (int *)((char *)nexth + j);
91 if (pp)
92 *pp = (int *)((char *)*pp + j);
93 return hbuf = nhbuf;
94 }
95
96 void
hyphen(tchar * wp)97 hyphen(tchar *wp)
98 {
99 register int j;
100 register tchar *i;
101 tchar *_wdstart, *_wdend;
102
103 i = wp;
104 while (punct(*i++))
105 ;
106 if (!alph(*--i))
107 return;
108 wdstart = i++;
109 while (hyext ? *i++ : alph(*i++))
110 ;
111 hyend = wdend = --i - 1;
112 while (punct(*i++))
113 ;
114 if (*--i)
115 return;
116 if (!(wdhyf & 060) && (wdend - wdstart - (hylen - 1)) < 0)
117 return;
118 hyp = hyptr;
119 *hyp = 0;
120 hyoff = 2;
121 if (dicthnj) {
122 i = _wdstart = wdstart;
123 _wdend = wdend;
124 do {
125 if (cbits(*i) == '-' || cbits(*i) == EMDASH ||
126 i == _wdend) {
127 while (wdstart <= i && (punct(*wdstart) ||
128 (cbits(*wdstart) >= '0' &&
129 cbits(*wdstart) <= '9')))
130 wdstart++;
131 for (wdend = wdstart; wdend <= i; wdend++) {
132 if (!alph(*wdend) ||
133 (cbits(*wdend) >= '0' &&
134 cbits(*wdend) <= '9'))
135 break;
136 }
137 hyend = --wdend;
138 if ((wdhyf & 060 || wdstart + 3 <= wdend) &&
139 !exword())
140 hyphenhnj();
141 wdstart = &i[1];
142 if (i < _wdend) {
143 *hyp++ = &i[1];
144 if (hyp > (hyptr + NHYP - 1))
145 hyp = hyptr + NHYP - 1;
146 }
147 }
148 } while (i++ <= _wdend);
149 wdstart = _wdstart;
150 wdend = _wdend;
151 } else if (!exword() && !suffix())
152 digram();
153 *hyp++ = 0;
154 if (*hyptr)
155 for (j = 1; j; ) {
156 j = 0;
157 for (hyp = hyptr + 1; *hyp != 0; hyp++) {
158 if (*(hyp - 1) > *hyp) {
159 j++;
160 i = *hyp;
161 *hyp = *(hyp - 1);
162 *(hyp - 1) = i;
163 }
164 }
165 }
166 }
167
168
169 int
punct(tchar i)170 punct(tchar i)
171 {
172 if (!cbits(i) || alph(i))
173 return(0);
174 else
175 return(1);
176 }
177
178
179 int
alph(tchar j)180 alph(tchar j)
181 {
182 int i;
183 int f;
184 int h;
185
186 while (isxfunc(j, CHAR))
187 j = charout[sbits(j)].ch;
188 i = cbits(j);
189 f = fbits(j);
190 if (!ismot(j) && i < nhcode && (h = hcode[i]) != 0) {
191 if (h & ~0177)
192 h = tr2un(h, f);
193 #ifdef EUC
194 return hyext ? iswalnum(h) : iswalpha(h);
195 } else
196 #else /* !EUC */
197 i = h;
198 }
199 #endif /* !EUC */
200 #ifdef EUC
201 if (!ismot(j) && i & ~0177) {
202 int u;
203 #ifndef NROFF
204 if (islig(j) && hyext &&
205 lgrevtab && lgrevtab[f] && lgrevtab[f][i])
206 return 1;
207 #endif /* !NROFF */
208 u = tr2un(i, f);
209 if (u == 0x017F) /* longs */
210 u = 's';
211 return hyext ? iswalnum(u) : iswalpha(u);
212 } else
213 #endif /* EUC */
214 if ((!ismot(j) && i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z') ||
215 (hyext && i >= '0' && i <= '9'))
216 return(1);
217 else
218 return(0);
219 }
220
221
222 void
caseht(void)223 caseht(void)
224 {
225 thresh = THRESH;
226 if (skip(0))
227 return;
228 noscale++;
229 thresh = hatoi();
230 noscale = 0;
231 }
232
233
234 void
casehw(void)235 casehw(void)
236 {
237 register int i, k;
238 int *j;
239 tchar t;
240 int cnt = 0;
241
242 lgf++;
243 if (nexth == NULL)
244 growhbuf(NULL);
245 k = 0;
246 while (!skip(!cnt++)) {
247 if ((j = nexth) >= (hbuf + NHEX - 2) && growhbuf(&j) == NULL)
248 goto full;
249 for (; ; ) {
250 if (ismot(t = getch()))
251 continue;
252 i = cbits(t);
253 if (i == ' ' || i == '\n') {
254 *j++ = 0;
255 nexth = j;
256 *j = 0;
257 if (i == ' ')
258 break;
259 else
260 return;
261 }
262 if (i == '-') {
263 k = HY_BIT2;
264 continue;
265 }
266 *j++ = maplow(t) | k;
267 k = 0;
268 if (j >= (hbuf + NHEX - 2) && growhbuf(&j) == NULL)
269 goto full;
270 }
271 }
272 return;
273 full:
274 errprint("exception word list full.");
275 *nexth = 0;
276 }
277
278
279 int
exword(void)280 exword(void)
281 {
282 register tchar *w;
283 register int *e;
284 int *save;
285
286 e = hbuf;
287 while (1) {
288 save = e;
289 if (e == NULL || *e == 0)
290 return(0);
291 w = wdstart;
292 while (*e && w <= hyend) {
293 #ifndef NROFF
294 int i, m, f;
295 m = cbits(*w);
296 f = fbits(*w);
297 if (islig(*w) && lgrevtab && lgrevtab[f] &&
298 lgrevtab[f][m]) {
299 for (i = 0; lgrevtab[f][m][i]; i++) {
300 if ((*e&~HY_BIT2) ==
301 maplow(lgrevtab[f][m][i])) {
302 e++;
303 } else
304 goto end;
305 }
306 w++;
307 } else
308 #endif
309 {
310 if ((*e&~HY_BIT2) == maplow(*w)) {
311 e++;
312 w++;
313 } else
314 goto end;
315 }
316 }
317 end: if (!*e) {
318 if (w-1 == hyend || (w == wdend && maplow(*w) == 's')) {
319 w = wdstart;
320 for (e = save; *e; e++) {
321 #ifndef NROFF
322 int i, m, f;
323 m = cbits(*w);
324 f = fbits(*w);
325 if (islig(*w) && lgrevtab &&
326 lgrevtab[f] &&
327 lgrevtab[f][m]) {
328 for (i = 0; lgrevtab[f][m][i];
329 i++) {
330 if (*e++ & HY_BIT2) {
331 *hyp = (void *)
332 ((intptr_t)w |
333 i);
334 hyp++;
335 }
336 }
337 e--;
338 } else
339 #endif
340 {
341 if (*e & HY_BIT2)
342 *hyp++ = w;
343 }
344 w++;
345 if (hyp > (hyptr + NHYP - 1))
346 hyp = hyptr + NHYP - 1;
347 }
348 return(1);
349 } else {
350 e++;
351 continue;
352 }
353 } else
354 while (*e++)
355 ;
356 }
357 }
358
359
360 int
suffix(void)361 suffix(void)
362 {
363 register tchar *w;
364 register const char *s, *s0;
365 tchar i;
366 extern const char *suftab[];
367
368 again:
369 i = cbits(*hyend);
370 if (i >= 128 || !alph(*hyend))
371 return(0);
372 if (i < 'a')
373 i -= 'A' - 'a';
374 if ((s0 = suftab[i-'a']) == 0)
375 return(0);
376 for (; ; ) {
377 if ((i = *s0 & 017) == 0)
378 return(0);
379 s = s0 + i - 1;
380 w = hyend - 1;
381 while (s > s0 && w >= wdstart && (*s & 0177) == maplow(*w)) {
382 s--;
383 w--;
384 }
385 if (s == s0)
386 break;
387 s0 += i;
388 }
389 s = s0 + i - 1;
390 w = hyend;
391 if (*s0 & HY_BIT)
392 goto mark;
393 while (s > s0) {
394 w--;
395 if (*s-- & HY_BIT) {
396 mark:
397 hyend = w - 1;
398 if (*s0 & 0100)
399 continue;
400 if (!chkvow(w))
401 return(0);
402 *hyp++ = w;
403 }
404 }
405 if (*s0 & 040)
406 return(0);
407 if (exword())
408 return(1);
409 goto again;
410 }
411
412
413 int
maplow(tchar t)414 maplow(tchar t)
415 {
416 int h, i, f;
417
418 while (isxfunc(t, CHAR))
419 t = charout[sbits(t)].ch;
420 i = cbits(t);
421 f = fbits(t);
422 if (!ismot(t) && i < nhcode && (h = hcode[i]) != 0) {
423 if (h & ~0177)
424 h = tr2un(h, f);
425 h = tr2un(h, f);
426 return(h);
427 } else
428 #ifdef EUC
429 if (!ismot(t) && i & ~0177) {
430 i = tr2un(i, f);
431 if (i == 0x017F) /* longs */
432 i = 's';
433 if (iswupper(i))
434 i = towlower(i);
435 } else
436 #endif /* EUC */
437 if (ischar(i) && isupper(i))
438 i = tolower(i);
439 return(i);
440 }
441
442
443 int
vowel(tchar i)444 vowel(tchar i)
445 {
446 switch (maplow(i)) {
447 case 'a':
448 case 'e':
449 case 'i':
450 case 'o':
451 case 'u':
452 case 'y':
453 return(1);
454 default:
455 return(0);
456 }
457 }
458
459
460 tchar *
chkvow(tchar * w)461 chkvow(tchar *w)
462 {
463 while (--w >= wdstart)
464 if (vowel(*w))
465 return(w);
466 return(0);
467 }
468
469
470 void
digram(void)471 digram(void)
472 {
473 register tchar *w;
474 register int val;
475 tchar * nhyend, *maxw = 0;
476 int maxval;
477 extern const char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
478
479 for (w = wdstart; w <= wdend; w++)
480 if (cbits(*w) & ~0177)
481 return;
482
483 again:
484 if (!(w = chkvow(hyend + 1)))
485 return;
486 hyend = w;
487 if (!(w = chkvow(hyend)))
488 return;
489 nhyend = w;
490 maxval = 0;
491 w--;
492 while ((++w < hyend) && (w < (wdend - 1))) {
493 val = 1;
494 if (w == wdstart)
495 val *= dilook('a', *w, bxh);
496 else if (w == wdstart + 1)
497 val *= dilook(*(w-1), *w, bxxh);
498 else
499 val *= dilook(*(w-1), *w, xxh);
500 val *= dilook(*w, *(w+1), xhx);
501 val *= dilook(*(w+1), *(w+2), hxx);
502 if (val > maxval) {
503 maxval = val;
504 maxw = w + 1;
505 }
506 }
507 hyend = nhyend;
508 if (maxval > thresh)
509 *hyp++ = maxw;
510 goto again;
511 }
512
513
514 int
dilook(tchar a,tchar b,const char t[26][13])515 dilook(tchar a, tchar b, const char t[26][13])
516 {
517 register int i, j;
518
519 i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
520 if (!(j & 01))
521 i >>= 4;
522 return(i & 017);
523 }
524
525 void
casehylang(void)526 casehylang(void)
527 {
528 int c, i = 0, sz = 0;
529 char *path = NULL;
530 size_t l;
531
532 dicthnj = NULL;
533 free(hylang);
534 hylang = NULL;
535 hyext = 0;
536 skip(0);
537 do {
538 c = getach();
539 if (i >= sz)
540 hylang = realloc(hylang, (sz += 8) * sizeof *hylang);
541 hylang[i++] = c;
542 } while (c);
543 if (i == 1) {
544 free(hylang);
545 hylang = NULL;
546 return;
547 }
548 if (strchr(hylang, '/') == NULL) {
549 l = strlen(hylang) + strlen(HYPDIR) + 12;
550 path = malloc(l);
551 snprintf(path, l, "%s/hyph_%s.dic", HYPDIR, hylang);
552 } else {
553 l = strlen(hylang) + 1;
554 path = malloc(l);
555 n_strcpy(path, hylang, l);
556 }
557 if ((dicthnj = hnj_hyphen_load(path)) == NULL) {
558 errprint("Can't load %s", path);
559 free(hylang);
560 hylang = NULL;
561 free(path);
562 return;
563 }
564 free(path);
565 hyext = 1;
566 }
567
568 static int
addc(int m,char ** cp,tchar ** wp,int ** wpp,int distance)569 addc(int m, char **cp, tchar **wp, int **wpp, int distance)
570 {
571 tchar t;
572
573 t = m ? m | sfmask(**wp) : **wp;
574 m = maplow(t);
575 if (m > 0 && m <= 0x7f) {
576 *(*cp)++ = m;
577 *(*wpp)++ = distance;
578 } else if (m >= 0x80 && m <= 0x7ff) {
579 *(*cp)++ = (m >> 6 & 037) | 0300;
580 *(*wpp)++ = distance;
581 *(*cp)++ = (m & 077) | 0200;
582 *(*wpp)++ = -1000;
583 } else if (m >= 0x800 && m <= 0xffff) {
584 *(*cp)++ = (m >> 12 & 017) | 0340;
585 *(*wpp)++ = distance;
586 *(*cp)++ = (m >> 6 & 077) | 0200;
587 *(*wpp)++ = -1000;
588 *(*cp)++ = (m & 077) | 0200;
589 *(*wpp)++ = -1000;
590 } else
591 return 0;
592 return 1;
593 }
594
595 static void
hyphenhnj(void)596 hyphenhnj(void)
597 {
598 tchar *wp;
599 char *cb, *cp, *hb;
600 int *wpos, *wpp;
601 int i, j, k;
602
603 i = 12 * (wdend - wdstart) + 1;
604 cb = malloc(i * sizeof *cb);
605 hb = malloc(i * sizeof *hb);
606 wpos = malloc(i * sizeof *wpos);
607 cp = cb;
608 wpp = wpos;
609 for (wp = wdstart; wp <= wdend; wp++) {
610 #ifndef NROFF
611 int m = cbits(*wp);
612 int f = fbits(*wp);
613 if (islig(*wp) && lgrevtab && lgrevtab[f] && lgrevtab[f][m]) {
614 for (i = 0; lgrevtab[f][m][i]; i++) {
615 if (addc(lgrevtab[f][m][i], &cp, &wp, &wpp,
616 i ? -i : wp-wdstart) == 0)
617 goto retn;
618 }
619 } else
620 #endif
621 {
622 if (addc(0, &cp, &wp, &wpp, wp - wdstart) == 0)
623 goto retn;
624 }
625 }
626 *cp = '\0';
627 j = cp - cb;
628 while (wpp <= &wpos[j])
629 *wpp++ = -1000;
630 hnj_hyphen_hyphenate(dicthnj, cb, j, hb);
631 k = 0;
632 for (i = 0; i < j; i++) {
633 if (wpos[i+1] >= 0)
634 k = wpos[i+1];
635 if ((hb[i] - '0') & 1 && wpos[i+1] >= -3) {
636 if (wpos[i+1] >= 0)
637 *hyp = &wdstart[wpos[i+1]];
638 else {
639 *hyp = &wdstart[k];
640 *hyp = (void *)((intptr_t)*hyp | -wpos[i+1]);
641 }
642 if (++hyp > (hyptr + NHYP - 1))
643 hyp = hyptr + NHYP - 1;
644 }
645 }
646 retn:
647 free(cb);
648 free(hb);
649 free(wpos);
650 }
651