1 /*
2 * Copyright (C) NEC Corporation 1991,1992
3 */
4 #ifndef lint
5 static char rcsid[] =
6 "$Id: analyze.c,v 2.13 1994/04/19 10:16:17 uchida Exp $ (NEC)";
7 #endif
8
9 #include <stdio.h>
10 #include <ctype.h>
11 #include "plain2.h"
12 #include "picture.h"
13 #include "kanji.h"
14
15 /*
16 * Decide whether each line has the traits of one of these document structures:
17 * 1) list item
18 * 2) section number
19 * 3) figure/table number
20 */
21 struct strVal appendixPat[] = {
22 /*
23 * ��Ͽ/Appendix
24 */
25 {"��Ͽ", 0},
26 {"Appendix", 0},
27 {"APPENDIX", 0},
28 {"����������", 0},
29 {"���УУţΣģɣ�", 0},
30 {"Appendix", 0},
31 {"APPENDIX", 0},
32 {"����������", 0},
33 {"���УУţΣģɣ�", 0},
34 {"", -1}
35 };
/*
 * Letter table searched first by checkIfZenkakuAlpha(); a hit there is
 * reported as L_LALPHA.  Values run from 0 to 25 in table order and the
 * entry with an empty pattern (value -1) terminates the table.
 * NOTE(review): presumably the full-width capitals A-Z; the literals are
 * mis-encoded in this copy, so this cannot be confirmed here.
 */
36 struct strVal zenkakuAlpUp[] = {
37 {"��", 0}, {"��", 1}, {"��", 2}, {"��", 3},
38 {"��", 4}, {"��", 5}, {"��", 6}, {"��", 7},
39 {"��", 8}, {"��", 9}, {"��", 10}, {"��", 11},
40 {"��", 12}, {"��", 13}, {"��", 14}, {"��", 15},
41 {"��", 16}, {"��", 17}, {"��", 18}, {"��", 19},
42 {"��", 20}, {"��", 21}, {"��", 22}, {"��", 23},
43 {"��", 24}, {"��", 25}, { "", -1}};
/*
 * Letter table searched second by checkIfZenkakuAlpha(); a hit there is
 * reported as L_SALPHA.  Values run from 0 to 25 in table order and the
 * entry with an empty pattern (value -1) terminates the table.
 * NOTE(review): presumably the full-width small letters a-z; the literals
 * are mis-encoded in this copy, so this cannot be confirmed here.
 */
44 struct strVal zenkakuAlpLow[] = {
45 {"��", 0}, {"��", 1}, {"��", 2}, {"��", 3},
46 {"��", 4}, {"��", 5}, {"��", 6}, {"��", 7},
47 {"��", 8}, {"��", 9}, {"��", 10}, {"��", 11},
48 {"��", 12}, {"��", 13}, {"��", 14}, {"��", 15},
49 {"��", 16}, {"��", 17}, {"��", 18}, {"��", 19},
50 {"��", 20}, {"��", 21}, {"��", 22}, {"��", 23},
51 {"��", 24}, {"��", 25}, { "", -1}};
52 struct strVal bullet[] = {
53 /*
54 * Marks that open a bullet-list item.  lineAtrListSec() treats a line
 * starting with one of them (and having more text after it) as a
 * definite L_BULLET list head.  The ASCII marks include the following
 * space.  The entry with an empty pattern terminates the table.
 * NOTE(review): the first two literals are mis-encoded in this copy.
55 */
56 { "��", 0}, { "��", 0},
57 { "o ", 0}, { "* ", 0},
58 { "+ ", 0},
59 { "", -1}
60 };
61 struct strVal dash[] = {
62 /*
63 * Marks that open a dash-list item (used by lineAtrListSec()).
 * kstrMatch() returns the first entry that matches, so "- " has to
 * stay ahead of the bare "-".  The entry with an empty pattern
 * terminates the table.
 * NOTE(review): the first literal is mis-encoded in this copy.
64 */
65 { "��", 0},
66 { "- ", 0},
67 { "-", 0},
68 { "", -1}
69 };
70 struct strVal listSpecial[] = {
71 /*
72 * Special marks used as list heads; lineAtrDlist() records a line that
 * starts with one of them as a description list (L_DLIST).  The entry
 * with an empty pattern terminates the table.
 * NOTE(review): every literal here is mis-encoded in this copy, so the
 * individual marks cannot be identified from this file.
73 */
74 { "��", 0}, { "��", 0},
75 { "��", 0}, { "��", 0},
76 { "��", 0}, { "��", 0},
77 { "��", 0}, { "��", 0},
78 { "��", 0}, { "��", 0},
79 { "��", 0}, { "��", 0},
80 { "��", 0}, { "��", 0},
81 { "��", 0}, { "��", 0},
82 { "��", 0},
83 { "", -1}
84 };
85 /*
86 * Full-width digits.  Each entry's value is the digit it stands for
 * (0-9); checkIfNumber() returns that value for a match.  The entry
 * with an empty pattern terminates the table.
 * NOTE(review): the literals are mis-encoded in this copy.
87 */
88 struct strVal zenkakuNum[] = {
89 { "��", 0}, { "��", 1},
90 { "��", 2}, { "��", 3},
91 { "��", 4}, { "��", 5},
92 { "��", 6}, { "��", 7},
93 { "��", 8}, { "��", 9},
94 { "", -1}
95 };
96 /*
97 * Upper-case Roman numerals for 1 through 10; the value of each entry is
 * the number it denotes.  checkIfRoman() reports a hit in this table as
 * L_LROMAN.  kstrMatch() returns the first entry that matches, which is
 * why longer numerals are listed ahead of the numerals that are their
 * prefixes (VIII before VII before VI before V, and so on).
 * NOTE(review): the first ten literals are mis-encoded in this copy;
 * presumably they are the full-width forms of the ASCII entries below.
98 */
99 struct strVal lroman[] = {
100 {"��", 10 }, {"�ɣ�", 9 },
101 {"�֣ɣɣ�", 8 }, {"�֣ɣ�", 7 },
102 {"�֣�", 6 }, {"��", 5 },
103 {"�ɣ�", 4 }, {"�ɣɣ�", 3 },
104 {"�ɣ�", 2 }, {"��", 1 },
105 {"X", 10 }, {"IX", 9 },
106 {"VIII",8 }, {"VII", 7 },
107 {"VI", 6 }, {"V", 5 },
108 {"IV", 4 }, {"III", 3 },
109 {"II", 2 }, {"I", 1 },
110 { "", -1}
111 };
/*
 * Lower-case Roman numerals for 1 through 10; the value of each entry is
 * the number it denotes.  checkIfRoman() consults this table after lroman
 * and reports a hit as L_SROMAN.  Longer numerals are listed ahead of
 * their prefixes because kstrMatch() returns the first entry that matches.
 * NOTE(review): the first ten literals are mis-encoded in this copy;
 * presumably they are the full-width forms of the ASCII entries below.
 */
112 struct strVal sroman[] = {
113 {"��", 10 }, {"���", 9 },
114 {"������", 8 }, {"�����", 7 },
115 {"����", 6 }, {"��", 5 },
116 {"���", 4 }, {"����", 3 },
117 {"���", 2 }, {"��", 1 },
118 {"x", 10 }, {"ix", 9 },
119 {"viii",8 }, {"vii", 7 },
120 {"vi", 6 }, {"v", 5 },
121 {"iv", 4 }, {"iii", 3 },
122 {"ii", 2 }, {"i", 1 },
123 { "", -1}
124 };
125 /*
126 * Opening parentheses/brackets that may precede an enumeration label
 * (see checkIfEnumList()).  The entry with an empty pattern terminates
 * the table.
 * NOTE(review): the first two literals are mis-encoded in this copy.
127 */
128 struct strVal lparenpat[] = {
129 {"��", 0},
130 {"��", 0},
131 {"(", 0},
132 {"[", 0},
133 { "", -1}
134 };
135 /*
136 * Closing round parentheses that may follow an enumeration label;
 * checkIfEnumList() records LH_PAREN or LH_RPAREN for a match.
 * NOTE(review): the first literal is mis-encoded in this copy.
137 */
138 struct strVal rparenpat[] = {
139 {"��", 0},
140 {")", 0},
141 { "", -1}
142 };
143 /*
144 * Closing square brackets that may follow an enumeration label;
 * checkIfEnumList() records LH_BRACKET or LH_RBRACKET for a match.
 * NOTE(review): the first literal is mis-encoded in this copy.
145 */
146 struct strVal rbracketpat[] = {
147 {"��", 0},
148 {"]", 0},
149 { "", -1}
150 };
151 /*
152 * Full-width dot/point characters accepted wherever an ASCII '.' is
 * (checkIfAppendix() and checkIfSecNumber() consult this table).
 * NOTE(review): both literals are mis-encoded in this copy.
153 */
154 struct strVal kdot[] = {
155 {"��", 0},
156 {"��", 0},
157 { "", -1}
158 };
159
160 /*
161 * ʸ������ͤ��Ȥߤι�¤�������ʸ����ΰ���
162 * �ޥå�����ʸ�����ޤ¤�Τ��֤�
163 */
164 struct strVal *
kstrMatch(str,kpat)165 kstrMatch(str, kpat)
166 register char *str;
167 register struct strVal *kpat;
168 {
169 register char *patstr;
170 register char *s;
171 for (; *kpat->pattern; kpat++) {
172 patstr = kpat->pattern;
173 s = str;
174 while (*patstr && *s) {
175 if (*patstr++ != *s++)
176 goto nextPat;
177 }
178 DBG2(9,"<%s> matched with <%s>\n", str, kpat->pattern);
179 return kpat;
180 nextPat:
181 ;
182 }
183 return NULL;
184 }
185 /*
186 * Ⱦ�Ѥޤ������Ѥο���
187 * �ޥå�����ʸ�����Ĺ��(1 or 2)���ޤ�������(0)���֤���
188 */
checkIfNumber(str,rval)189 checkIfNumber(str, rval)
190 char *str;
191 int *rval;
192 {
193 struct strVal *svp;
194 if (*str == '\0')
195 return 0;
196 if (index("0123456789", *str)) {
197 *rval = *str - '0';
198 return 1;
199 }
200 if (svp = kstrMatch(str, zenkakuNum)) {
201 *rval = svp->value;
202 return 2;
203 }
204 return 0;
205 }
206 /*
207 * Ⱦ�Ѥޤ������ѤΥ�����
208 * �ޥå�����ʸ�����Ĺ�����ޤ�������(0)���֤���
209 */
checkIfRoman(str,rval,typep)210 checkIfRoman(str, rval, typep)
211 register char *str;
212 int *rval;
213 int *typep;
214 {
215 struct strVal *svp;
216 if (svp = kstrMatch(str, lroman))
217 *typep = L_LROMAN;
218 else if (svp = kstrMatch(str, sroman))
219 *typep = L_SROMAN;
220 else
221 return 0;
222 *rval = svp->value;
223 return strlen(svp->pattern);
224 }
checkIfAppendix(str,nump)225 checkIfAppendix(str, nump)
226 char *str;
227 int *nump;
228 {
229 struct strVal *svp;
230 int len, len2;
231 int ttt;
232 #define skipSpaces() while (*str == ' ') {str++;len++;}
233 if ((svp = kstrMatch(str, appendixPat)) == NULL)
234 return 0;
235 len = strlen(svp->pattern);
236 str += len;
237 skipSpaces();
238 while (svp = kstrMatch(str, appendixPat)) {
239 len2 = strlen(svp->pattern);
240 len += len2;
241 str += len2;
242 }
243 skipSpaces();
244 if ((len2 = checkIfRoman(str, nump, &ttt))
245 || (len2 = checkIfNumStr(str, nump))
246 || (len2 = checkIfAlpha(str, nump, &ttt))
247 || (len2 = checkIfZenkakuAlpha(str, nump, &ttt))) {
248 len += len2;
249 str += len2;
250 }
251 else
252 return 0;
253 skipSpaces();
254 if (svp = kstrMatch(str, kdot))
255 len += strlen(svp->pattern);
256 else if (*str == '.')
257 len++;
258 skipSpaces();
259 return len;
260 }
261 /*
262 * ����������ֹ�Ȥ��Ƥ���ħ�������Ƥ��뤫
263 *
264 * ����(����/Ⱦ��) + �ɥå�(����/Ⱦ��) �� ������(�ꥹ�Ȥ��⤷��ʤ�)
265 * ���� + �ɥå� + ���� �� OK
266 * ���� + �ɥå� + ���� + �ɥå� �� OK
267 * ���� + �ɥå� + ���� + �ɥå� + �� �� OK
268 *
269 */
checkIfSecNumber(str,dots)270 checkIfSecNumber(str, dots)
271 register char *str;
272 int *dots;
273 {
274 struct strVal *svp;
275 int ofst, tofst, len;
276 int intermDots;
277 int dummy;
278 intermDots = 0;
279 ofst = 0;
280 while (1) {
281 tofst = 0;
282 while (len = checkIfNumber(str+ofst+tofst, &dummy))
283 tofst += len;
284 ofst += tofst;
285 if (tofst == 0) {
286 *dots = intermDots;
287 return ofst;
288 }
289 if ((len = (index(".", *(str + ofst)) != NULL))
290 || (svp = kstrMatch(str+ofst, kdot))
291 && (len = strlen(svp->pattern))) {
292 ofst += len;
293 intermDots++;
294 }
295 else {
296 if (intermDots && index(" ", *(str+ofst))) {
297 *dots = intermDots + 1;
298 return ofst + 1;
299 }
300 #ifdef KANJI
301 else if (isZenkaku(str)
302 && index(" ", *(str + ofst))) {
303 *dots = intermDots + 1;
304 return ofst + 1;
305 }
306 #endif
307 else
308 return 0;
309 }
310 }
311 }
/*
 * Read a run of decimal digits (anything checkIfNumber() accepts) from
 * the start of str.
 *
 *   str    NUL-terminated text
 *   lnump  receives the decimal value of the run; set to 0 when there
 *          are no digits
 *
 * Returns the number of bytes the run occupies (0 when str does not
 * start with a digit).
 */
int
checkIfNumStr(str, lnump)
char	*str;
int	*lnump;
{
	int	consumed = 0;
	int	digit;
	int	step;

	*lnump = 0;
	for (;;) {
		step = checkIfNumber(str + consumed, &digit);
		if (step == 0)
			break;
		*lnump = *lnump * 10 + digit;
		consumed += step;
	}
	return consumed;
}
331 /*
332 * Ⱦ�ѥ���ե��٥åȤ��ݤ�
333 * ���� lnump �ˤϡ�����ե��٥åȤΰ���(a=1, b=2, ��)���֤�
334 * typep����ʸ��/��ʸ���ζ��̤��֤�
335 * �ޥå�����ʸ����Ĺ��(0 or 1)���֤�
336 */
checkIfZenkakuAlpha(str,lnump,typep)337 checkIfZenkakuAlpha(str, lnump, typep)
338 char *str;
339 int *lnump;
340 int *typep;
341 {
342 struct strVal *svp;
343 if (svp = kstrMatch(str, zenkakuAlpUp)) {
344 *lnump = svp->value;
345 *typep = L_LALPHA;
346 return 2;
347 }
348 else if (svp = kstrMatch(str, zenkakuAlpLow)) {
349 *lnump = svp->value;
350 *typep = L_SALPHA;
351 return 2;
352 }
353 return 0;
354 }
355 /*
356 * Ⱦ�ѥ���ե��٥åȤ��ݤ�
357 * ���� lnump �ˤϡ�����ե��٥åȤΰ���(a=1, b=2, ��)���֤�
358 * typep����ʸ��/��ʸ���ζ��̤��֤�
359 * �ޥå�����ʸ����Ĺ��(0 or 1)���֤�
360 */
checkIfAlpha(str,lnump,typep)361 checkIfAlpha(str, lnump, typep)
362 char *str;
363 int *lnump;
364 int *typep;
365 {
366 if (isupper(*str)) {
367 *lnump = alphaVal(tolower(*str));
368 *typep = L_LALPHA;
369 return 1;
370 }
371 else if (islower(*str)) {
372 *lnump = alphaVal(*str);
373 *typep = L_SALPHA;
374 return 1;
375 }
376 return 0;
377 }
378 /*
379 * ��ꥹ�ȤȤ��Ƥ���ħ�������Ƥ��뤫
380 *
381 * {��������������Ⱦ�ѥ���ե��٥å�} + �ɥå�
382 * {��������������Ⱦ�ѥ���ե��٥å�} + �����
383 * ����� + {��������������Ⱦ�ѥ���ե��٥å�} + �����
384 *
385 * �ꥹ���ֹ���ʬ��Ĺ�����֤�
386 */
checkIfEnumList(str,textp)387 checkIfEnumList(str, textp)
388 register char *str;
389 register struct text *textp;
390 {
391 register int len;
392 struct strVal *svp;
393 int ofst, lparen, num, type;
394 int alphabetic = 0;
395 lparen = 0;
396 if (svp = kstrMatch(str, lparenpat)) {
397 lparen = strlen(svp->pattern);
398 }
399 /* Enumeration body */
400 if (len = checkIfRoman(str + lparen, &num, &type)) {
401 textp->listType = type;
402 textp->listNum = num;
403 }
404 else if (len = checkIfNumStr(str + lparen, &num)) {
405 textp->listType = L_NUMBER;
406 textp->listNum = num;
407 }
408 else if (len = checkIfAlpha(str + lparen, &num, &type)) {
409 textp->listType = type;
410 textp->listNum = num;
411 alphabetic = 1;
412 }
413 if (len == 0)
414 return 0;
415 ofst = lparen + len;
416 if (lparen == 0 && (len = (index(".", *(str + ofst))) != NULL)) {
417 if (alphabetic && !japaneseText)
418 /* Special rule for non-japanese text */
419 return 0;
420 textp->listHint = LH_DOTTED;
421 }
422 else if (svp = kstrMatch(str+ofst, rparenpat)) {
423 len = strlen(svp->pattern);
424 if (lparen)
425 textp->listHint = LH_PAREN;
426 else
427 textp->listHint = LH_RPAREN;
428 }
429 else if (svp = kstrMatch(str+ofst, rbracketpat)) {
430 len = strlen(svp->pattern);
431 if (lparen)
432 textp->listHint = LH_BRACKET;
433 else
434 textp->listHint = LH_RBRACKET;
435 }
436 else return 0;
437 ofst += len;
438 if (textp->listType == L_NUMBER && checkIfNumStr(str + ofst, &num))
439 return 0;
440 return ofst;
441 }
/*
 * Recognise the title part of a description-list item at the start of
 * str.  Two forms are accepted:
 *
 *   "[" title "]"   str starts with '[' and the first ']' is followed by
 *                   a space or by the end of the text
 *   title ":"       the first ':' is followed by a space or by the end
 *                   of the text
 *
 * Only the first ':' and the first ']' are examined.  When both forms
 * apply, the bracket form wins, so "[a: b] c" yields the length through
 * the ']'.
 *
 * Returns the byte length of the title including its closing ']' or ':',
 * or 0 when neither form applies.
 *
 * The character after the mark is tested directly; searching the rest of
 * the line for a space just to compare its position with mark + 1 gives
 * the same answer at the cost of a scan.
 */
int
checkIfDscrList(str)
register char	*str;
{
	register char	*markp;
	int	colonLen = 0, bracketLen = 0;

	markp = index(str, ':');
	if (markp != NULL && (markp[1] == ' ' || markp[1] == '\0'))
		colonLen = markp - str + 1;

	if (*str == '[') {
		markp = index(str, ']');
		if (markp != NULL && (markp[1] == ' ' || markp[1] == '\0'))
			bracketLen = markp - str + 1;
	}
	return bracketLen ? bracketLen : colonLen;
}
478 /*
479 * �ꥹ�Ȥޤ��ϥ���������ֹ�Ȥ��Ƥ���ħ�������Ƥ��뤫
480 * ���������Ƥ���С��Ԥ�°���Ȥ�����Ͽ���롥
481 * ���ꥹ�Ȥȥ���������ֹ�ξ�郎������������Ʊ����Ĵ�٤Ƥ���
482 * �㤨�� "1. xxx"�Ȥ����Ԥϡ��ɤ���ξ���������
483 */
lineAtrListSec(textp)484 lineAtrListSec(textp)
485 register struct text *textp;
486 {
487 register char *str;
488 struct strVal *svp;
489 int len;
490 int dots;
491 str = textp->body + textp->indent;
492 if (svp = kstrMatch(str, bullet)) {
493 len = strlen(svp->pattern);
494 if (*(str + len)) {
495 DBG1(7,"Matched with BULLET %s\n", str);
496 textp->pListHead = DEFINITELY;
497 textp->listType = L_BULLET;
498 textp->headLen = len;
499 }
500 }
501 else if (svp = kstrMatch(str, dash)) {
502 len = strlen(svp->pattern);
503 if (kstrMatch(str + len, dash) == 0
504 && *(str + len)
505 && (len != 1 || textp->japanese)) {
506 /* "-" AMBIGUOUS list head. but "------" is not */
507 DBG1(7,"Matched with DASH %s\n", str);
508 if (len == 1)
509 textp->pListHead = AMBIGUOUS;
510 else
511 textp->pListHead = DEFINITELY;
512 textp->listType = L_DASH;
513 textp->headLen = len;
514 }
515 }
516 else if (len = checkIfEnumList(str, textp)) {
517 DBG1(7,"Matched with ENUM_LIST=%s\n", str);
518 textp->pListHead = AMBIGUOUS;
519 textp->headLen = len;
520 }
521 if (indentedSecnum || (textp->indent == 0)) {
522 if (len = checkIfSecNumber(str, &dots)) {
523 DBG3(7,"Matched with SEC_NUM(%d,%d) %s\n",
524 len, dots, str);
525 if (dots > 1
526 #ifdef KANJI
527 || maybeZenkakuNum(textp->body)
528 #endif
529 ){
530 /* x.x or (Zenkaku Number...) */
531 textp->pSecNum = DEFINITELY;
532 }
533 else {
534 textp->pSecNum = AMBIGUOUS;
535 }
536 textp->headLen = len;
537 textp->secDepth = dots;
538 }
539 }
540 }
541 /*
542 * ���ҷ��ꥹ�ȤȤ��Ƥ���ħ�������Ƥ���С��Ԥ�°���Ȥ�����Ͽ���롥
543 */
lineAtrDlist(textp)544 lineAtrDlist(textp)
545 register struct text *textp;
546 {
547 int ret;
548 if (textp->pListHead)
549 return;
550 if (ret = checkIfDscrList(textp->body + textp->indent)) {
551 DBG1(7,"Matched with DSCR_LIST %s\n", textp->body);
552 if (ret < MAX_DSCRLEN) {
553 textp->pListHead = DEFINITELY;
554 textp->listType = L_DLIST;
555 textp->headLen = ret;
556 }
557 }
558 {
559 struct strVal *svp;
560 if (svp = kstrMatch(textp->body + textp->indent, listSpecial)){
561 char *s;
562 ret = strlen(svp->pattern);
563 s = textp->body + textp->indent + ret;
564
565 if (*s
566 #ifdef PICTURE
567 && picLineMatch(s) == NULL
568 && picMiscMatch(s) == NULL
569 #endif
570 ) {
571 DBG1(7,"Matched with DSCR_LIST(special) %s\n",
572 textp->body);
573 textp->pListHead = DEFINITELY;
574 textp->listType = L_DLIST;
575 textp->headLen = ret;
576 }
577 }
578 }
579 }
580 /*
581 * ʸ�������®�����뤿�ᡤ�����оݤȤʤ�ʸ��������Х��Ȥ�
582 * ��Ͽ���Ƥ���
583 */
byteRegister(reg,kpat)584 byteRegister(reg, kpat)
585 register unsigned char reg[];
586 struct strVal *kpat;
587 {
588 for (; *kpat->pattern; kpat++) {
589 reg[(unsigned char)*kpat->pattern]++;
590 }
591 }
592 /*
593 * �ƹԤ���ħ�餫���ḡ�Ф��Ƥ���
594 */
analyzeLines(begin,end)595 analyzeLines(begin, end)
596 int begin;
597 int end;
598 {
599 int i, l;
600 unsigned char firstByteListSec[256];
601 short lengthAccum[MAX_LINE_LEN];
602 register struct text *textp;
603 DBG0(5,"checkLineCharacter\n");
604 bzero((char *)lengthAccum, sizeof(lengthAccum));
605 /*
606 * �ꥹ�ȡ��ޤ��ϥ���������ֹ�Ȥ��Ƥθ���ܤ뤿��
607 * ���Х��Ȥ���Ͽ���Ƥ���
608 */
609 bzero((char *)firstByteListSec, sizeof(firstByteListSec));
610 byteRegister(firstByteListSec, bullet);
611 byteRegister(firstByteListSec, dash);
612 byteRegister(firstByteListSec, zenkakuNum);
613 byteRegister(firstByteListSec, sroman);
614 byteRegister(firstByteListSec, lroman);
615 byteRegister(firstByteListSec, lparenpat);
616 for (i = '0'; i <= '9'; i++)
617 firstByteListSec[i]++;
618 for (i = 'a'; i <= 'z'; i++)
619 firstByteListSec[i]++;
620 for (i = 'A'; i <= 'Z'; i++)
621 firstByteListSec[i]++;
622 for (l = begin; l < end; l++) {
623 textp = texts[l];
624 if (textp->blank)
625 continue;
626 lengthAccum[textp->length]++;
627 if (firstByteListSec[(unsigned char)*(textp->body + textp->indent)]) {
628 lineAtrListSec(textp);
629 }
630 if (textp->pSecNum == NEVER)
631 lineAtrDlist(textp);
632 lineAtrFtitle(textp);
633 #ifdef PICTURE
634 picCharCount(textp);
635 #endif
636 }
637 /*
638 * �饤�ȥޡ�����Ф��Ƥ���
639 * ���Τ�90%�ιԤ����ޤ���֤�饤�ȥޡ�����Ȥ��Ƥ���
640 */
641 for (i = 1; i < MAX_LINE_LEN; i++)
642 lengthAccum[i] += lengthAccum[i - 1];
643 for (i = 0; i < MAX_LINE_LEN; i++)
644 if ((long)lengthAccum[i] * 100
645 > (long)lengthAccum[MAX_LINE_LEN - 1] * 90){
646 rightMargin = i;
647 break;
648 }
649 MSG1("Right Margin = %d\n", rightMargin);
650 }
651
652