1 /* catdvi - get text from DVI files
2 Copyright (C) 1999 Antti-Juhani Kaijanaho <gaia@iki.fi>
3 Copyright (C) 2000-02 Bjoern Brill <brill@fs.math.uni-frankfurt.de>
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20 #include <stdlib.h>
21 #include <assert.h>
22 #include "layout.h"
23 #include "util.h"
24 #include "page2.h"
25 #include "outenc.h"
26 #include "linebuf.h"
27 #include "density.h"
28 #include "canvas.h"
29 #include "vlist.h"
30 #include "fontinfo.h"
31
32 /* page margins */
33 static sint32 left_margin, right_margin;
34 static sint32 top_margin, bottom_margin;
35
36 /* other page metrics */
37 int linecount = 0;
38 uint32 boxcount = 0;
39 double average_box_width = 0;
40 double average_box_totheight = 0;
41
42 /* layout parameters */
43 sint32 max_col_width, max_row_height;
44 double min_col_density, min_line_density;
45 scdf_t col_density, line_density;
46 scdf_t * x2col, * y2line;
47
48
49 /* some obvious stuff about rectangles */
50 typedef struct rect_t rect_t;
51 struct rect_t {
52 sint32 xmin;
53 sint32 xmax;
54 sint32 ymin;
55 sint32 ymax;
56 };
57
58 int intervals_intersect(
59 sint32 a, sint32 b,
60 sint32 c, sint32 d
61 );
62
63 int rect_intersects(const rect_t * this, const rect_t * that);
64
65 int rect_contains_point(const rect_t * this, sint32 x, sint32 y);
66
67 /* create rect */
68 void rect_set(
69 rect_t * this,
70 sint32 xmin,
71 sint32 xmax,
72 sint32 ymin,
73 sint32 ymax
74 );
75
76 void rect_set_empty(rect_t * this);
77
78 int rect_is_empty(const rect_t * this);
79
80 /* *this = smallest rect containing *this and *that */
81 void rect_union_with(rect_t * this, const rect_t * that);
82
83
84 /* a page_word is a sequence of contiguous box_t which should remain
85 * contiguous in printout.
86 */
87 typedef struct page_word_t page_word_t;
88 struct page_word_t {
89 list_node_t * alpha; /* node of first box in word */
90 list_node_t * omega; /* node of last box in word */
91 linebuf_t * text;
92 rect_t brect; /* bounding rectangle: the smallest rectangle
93 * containing the part _avove the baseline_
94 * of all boxes in the word
95 */
96 sint32 ceiling; /* stuff with y <= ceiling should appear top of
97 * our row in printout.
98 */
99 sint32 right_wall; /* stuff with x >= right_wall should appear right
100 * from our rightmost glyph in printout.
101 */
102 sint32 right_corridor; /* stuff on the same row with x >= right_corridor
103 * should appear trailing_spaces columns right from
104 * our rightmost glyph in printout.
105 */
106 int output_width;
107 int trailing_spaces; /* usually 1, somtetimes 0 */
108 int row;
109 int column;
110 };
111
112 /* The page has one global list of page_words */
113 vlist_t page_words;
114
115 /* constructor */
116 static void page_word_init(
117 page_word_t * this,
118 list_node_t * alpha,
119 list_node_t * omega,
120 int trailing_spaces
121 );
122
123 /* destructor */
124 static void page_word_done(page_word_t * this);
125
126 /* Try to resolve collisions (i.e. situations where the bounding rects of
127 * two words intersect).
128 */
129 static void page_words_collide(vitor_t vi, vitor_t vj);
130
131 /* Split *this before where; i.e. :
132 * - create a new page_word containing the boxes from where to this->omega
133 * on the heap ("right half")
134 * - shrink *this to the boxes from this->alpha to where->prev ("left half")
135 * - return pointer to new page_word
136 * Both the left and right half are brought into shape by constructor calls,
137 * so any information about ceilings, right walls, etc. gets lost.
138 */
139 static page_word_t * page_word_split_at(
140 page_word_t * this,
141 struct list_node_t * where
142 );
143
144
145 /* Estimate number of spaces from distance between boxes */
146 static int decide_space_count(
147 struct list_node_t * prev_p,
148 struct list_node_t * p
149 );
150
151 /* Look at the midpoints of two boxes. If either midpoint lies in the other
152 * box, the boxes compare "equal". Otherwise, we compare x of midpoints.
153 */
154 sint32 box_sloppy_x_cmp(const struct box_t * p1, const struct box_t * p2);
155
156
157 static void page_collect_statistics(void);
158
159 /* Make an initial pass at cutting the page into words, based on the distance
160 * between adjacent boxes. This will be refined in a later pass.
161 */
162 static void page_cut_words(void);
163
164
165 /* If the bounding rectangles of two words on the page collide, we have
166 * likely hit a word breaking problem (example: math subscripts are
167 * too thin to cause a word break between adjacent variables). If a
168 * collission occurs, we call a function that tries to resolve it
169 * by introducing additional word breaks.
170 */
171 static void page_check_word_collisions(void);
172
173 /* The idea: words with x coordinates close to each other and different
174 * y coordinates should appear on different rows in printout.
175 */
176 static void page_find_ceilings(void);
177
178 /* If two words end up on the same line in printout, then the left one
179 * needs to put its trailing_spaces before the right one starts.
180 * Obviously, this has to be called _after_ we know our rows.
181 */
182 static void page_find_right_corridors(void);
183
184 /* If the y intervals of the bounding rectangles of two words intersect,
185 * then the left one should end in printout before the right one starts.
186 * This should be called _after_ page_find_right_corridors() because we want
187 * to ensure right_wall <= right_corridor .
188 */
189 static void page_find_right_walls(void);
190
191 /* Determine on which row in printout words end up. The main vehicle here
192 * is a line density function; we force it to have integral >=1 on the
193 * interval [word->ceiling, word->brect->ymax].
194 * The words are then positioned so that their baseline is in the correct row.
195 */
196 static void page_determine_lines(void);
197
198 /* Determine on which column in printout words end up. The main vehicle here
199 * is a column density function; we force it to have large enough integrals
200 * on the intervals [word->brect->xmin, word->right_wall] resp.
201 * [word->brect->xmin, word->right_wall] to accomodate word->output_width
202 * resp. word->output_width + word->trailing_spaces columns.
203 * The words are then positioned so that their left boundary is in the correct
204 * column.
205 */
206 static void page_determine_cols(void);
207
208
page_print_formatted(void)209 void page_print_formatted(void)
210 {
211 canv_t canvas;
212 vitor_t pwi;
213
214 pmesg(50, "BEGIN page_print_formatted\n");
215
216 /* Life is much easier if we don't have to care for the case
217 * of an empty page all the time.
218 */
219 if(list_head == NULL) {
220 puts("\f\n");
221 pmesg(80, "This page is empty.\n");
222 pmesg(50, "END page_print_formatted\n");
223 return;
224 }
225
226 page_adjust_diacritics();
227 page_adjust_texmext();
228 page_adjust_radicals();
229
230 page_collect_statistics();
231
232 max_col_width = (sint32) (4.0 * average_box_width);
233 max_row_height = (sint32) (3.0 * average_box_totheight);
234 min_col_density = 1.0 / max_col_width;
235 min_line_density = 1.0 / max_row_height;
236 /*
237 * The minimal densities should ensure that large amounts of white space
238 * do not completely get lost. The factors 3.0, 4.0 have been determined
239 * by rules of thumb and experiments.
240 * The (unverified) expectation is that a printout column will be
241 * about average_box_width DVI units wide and a printout row will be
242 * about 1.5 * average_box_totheight DVI units high.
243 */
244 scdf_init(&col_density, left_margin, right_margin, min_col_density);
245 /* col_density measures the column density at every point between
246 * left and right margin which is required to do proper positioning
247 * of every word.
248 */
249 scdf_init(&line_density, top_margin, bottom_margin, min_line_density);
250 /* Similar for line_density and lines */
251
252 vlist_init(&page_words);
253 page_cut_words();
254 page_check_word_collisions();
255
256 page_find_ceilings();
257 page_determine_lines();
258
259 page_find_right_corridors();
260 page_find_right_walls();
261 page_determine_cols();
262
263 /* Now print out everything */
264 canv_init(
265 &canvas,
266 (sint32) scdf_eval(x2col, right_margin) + 1,
267 (sint32) scdf_eval(y2line, bottom_margin) + 1
268 );
269 for(
270 pwi = vlist_begin(&page_words);
271 pwi != vlist_end(&page_words);
272 pwi = pwi->next
273 ) {
274 page_word_t * pw;
275
276 pw = vitor2ptr(pwi, page_word_t);
277 canv_put_linebuf(
278 &canvas,
279 pw->text,
280 pw->column,
281 pw->row,
282 pw->output_width
283 );
284 pw->text = NULL; /* text is now owned by the canvas */
285 }
286 canv_write(&canvas, stdout);
287 puts("\f\n");
288
289 /* clean up */
290
291 canv_done(&canvas);
292
293 for(
294 pwi = vlist_begin(&page_words);
295 pwi != vlist_end(&page_words);
296 pwi = pwi->next
297 ) {
298 page_word_done(pwi->data);
299 free(pwi->data);
300 }
301 vlist_done(&page_words);
302
303 scdf_done(y2line); free(y2line);
304 scdf_done(x2col); free(x2col);
305 scdf_done(&col_density);
306 scdf_done(&line_density);
307
308 pmesg(50, "END page_print_formatted\n");
309 }
310
311
page_print_sequential(void)312 void page_print_sequential(void)
313 {
314 struct list_node_t * p;
315 struct box_t prev_box, curr_box;
316
317 sint32 delta_x = 0, delta_y = 0;
318 sint32 dist_x = 0;
319 sint32 epsilon_x = 0, epsilon_y = 0;
320
321 /* begin-of-page flag */
322 int bop = 1;
323 /* begin-of-line flag */
324 int bol = 1;
325
326 int spaces, spaces_by_prev, spaces_by_curr;
327 struct linebuf_t lb;
328
329 pmesg(50, "BEGIN page_print_sequential\n");
330
331 page_adjust_diacritics();
332
333 linebuf_init(&lb, 0);
334 prev_box.width = 1;
335 prev_box.height = 1;
336
337 for (p = list_head; p != 0; p = p->next) {
338
339 pmesg(80, "node: X = %li, Y = %li, glyph = %li (%c), font = %li\n",
340 p->b.x, p->b.y, p->b.glyph, (unsigned char) p->b.glyph, p->b.font);
341
342 curr_box = p->b;
343 if (curr_box.width == 0) curr_box.width = prev_box.width;
344 if (curr_box.height == 0) curr_box.height = prev_box.height;
345
346 if (!bop) {
347 delta_x = curr_box.x - prev_box.x;
348 delta_y = curr_box.y - prev_box.y;
349 dist_x = delta_x - prev_box.width;
350
351 /* (arithmetic mean of current and previous) / 20 */
352 epsilon_x = (curr_box.width + prev_box.width + 39) / 40;
353 epsilon_y = (curr_box.height + prev_box.height + 39) / 40;
354
355 /* check for new line */
356 if (
357 /* new line */
358 delta_y >= 22*epsilon_y ||
359 /* new column */
360 delta_y <= -60*epsilon_y ||
361 /* weird step back */
362 dist_x <= -100*epsilon_x
363 ) {
364 pmesg(80, "end of line\n");
365 outenc_write(stdout, &lb);
366 linebuf_clear(&lb);
367 puts("");
368 bol = 1;
369 }
370
371 /* check for word breaks, but not at the beginning of a line */
372 if ((dist_x > 0) && (!bol)) {
373 spaces_by_prev =
374 font_w_to_space(prev_box.font, dist_x);
375 spaces_by_curr =
376 font_w_to_space(curr_box.font, dist_x);
377 spaces = (spaces_by_prev + spaces_by_curr + 1) / 2;
378
379 if (spaces > 0) {
380 pmesg(80, "setting space\n");
381 linebuf_putg(&lb, ' ');
382 }
383 }
384
385 } /* end if (!bop) */
386
387 linebuf_putg(&lb, curr_box.glyph);
388 prev_box = curr_box;
389 bop = 0;
390 bol = 0;
391 }
392 /* flush line buffer and end page */
393 outenc_write(stdout, &lb);
394 puts("\n\f\n");
395
396 linebuf_done(&lb);
397
398 pmesg(50, "END page_print_sequential\n");
399 }
400
decide_space_count(struct list_node_t * prev_p,struct list_node_t * p)401 static int decide_space_count(
402 struct list_node_t * prev_p,
403 struct list_node_t * p
404 )
405 {
406 font_t prev_font, curr_font;
407 sint32 prev_x, prev_width;
408 sint32 curr_x;
409 sint32 delta;
410 int (* const f2spc)(sint32, sint32) = font_w_to_space;
411
412 prev_font = prev_p->b.font;
413 prev_x = prev_p->b.x;
414 prev_width = prev_p->b.width;
415
416 curr_font = p->b.font;
417 curr_x = p->b.x;
418
419 delta = curr_x - (prev_x + prev_width);
420 return (1 + f2spc(curr_font, delta) + f2spc(prev_font, delta)) / 2;
421 }
422
423
424 /*************** page_word_t method implementations ****************/
page_word_init(page_word_t * this,list_node_t * alpha,list_node_t * omega,int trailing_spaces)425 void page_word_init(
426 page_word_t * this,
427 list_node_t * alpha,
428 list_node_t * omega,
429 int trailing_spaces
430 )
431 {
432 list_node_t * p, * q;
433 rect_t r;
434
435 this->alpha = alpha;
436 this->omega = omega;
437 this->text = xmalloc(sizeof(linebuf_t));
438 linebuf_init(this->text, 20);
439 /* 20 seems a good tradeoff between memory usage and
440 * avoiding reallocs.
441 */
442 rect_set_empty(&(this->brect));
443 this->ceiling = SINT32_MIN;
444 /* The top line has no ceiling -- it fits automatically */
445 this->right_wall = right_margin;
446 this->right_corridor = right_margin;
447 /* this->output_width will be set below */
448 this->trailing_spaces = trailing_spaces;
449 this->row = -1; /* will be filled in later */
450 this->column = -1; /* will be filled in later */
451
452 q = omega->next;
453 for(p = alpha; p != q; p = p->next) {
454 /* add p->b to bounding rect */
455 rect_set(
456 &r,
457 p->b.x,
458 p->b.x + p->b.width,
459 p->b.y - p->b.height,
460 p->b.y
461 );
462 rect_union_with(&(this->brect), &r);
463
464 /* add p->b.glyph to text */
465 linebuf_putg(this->text, p->b.glyph);
466 }
467
468 this->output_width = outenc_get_width(this->text);
469 }
470
471
page_word_done(page_word_t * this)472 void page_word_done(page_word_t * this)
473 {
474 this->alpha = NULL;
475 this->omega = NULL;
476 /* this->text will be NULL after it has been passed to canvas */
477 if(this->text != NULL) {
478 linebuf_done(this->text);
479 free(this->text);
480 this->text = NULL;
481 }
482 }
483
page_words_collide(vitor_t vi,vitor_t vj)484 static void page_words_collide(vitor_t vi, vitor_t vj)
485 {
486 page_word_t * wp, * wq;
487 list_node_t * bp, * bq;
488 sint32 c;
489
490 pmesg(60, "BEGIN page_words_collide\n");
491
492 wp = vi->data;
493 wq = vj->data;
494
495 bp = wp->alpha;
496 bq = wq->alpha;
497 if(box_sloppy_x_cmp(&bp->b, &bq->b) > 0) {
498 /* We want to assume that *wp starts left from *wq */
499 pmesg(60, "calling myself with exchanged arguments\n");
500 page_words_collide(vj, vi);
501 pmesg(60, "END page_words_collide\n");
502 return;
503 }
504
505 /* Skip all letters of *wp left of *wq. We allow a certain margin overlap
506 * to accomodate italics correction, kerning, etc.
507 */
508 while(c = box_sloppy_x_cmp(&bp->b, &bq->b), c < 0) {
509 if(bp == wp->omega) {
510 /* We're at the end of *wp, so either really no collision
511 * at all, or only the last box collides. Anyway, nothing
512 * to do.
513 */
514 pmesg(60, "nothing to do\n");
515 pmesg(60, "END page_words_collide\n");
516 return;
517 }
518 bp = bp->next;
519 }
520
521 if(c == 0) {
522 /* Ouch, full crash */
523 pmesg(60, "unresolvable collision\n");
524 pmesg(60, "END page_words_collide\n");
525 return;
526 }
527
528 /* We've moved bp across the beginning of *wq */
529 pmesg(60, "resolving collision by splitting word\n");
530 vlist_insert_after(&page_words, vi, page_word_split_at(wp, bp));
531
532 /* Check for further collisions between *wq and the right piece of *wp */
533 pmesg(60, "checking for further collisions\n");
534 page_words_collide(vj, vi->next);
535 pmesg(60, "END page_words_collide\n");
536 }
537
page_word_split_at(page_word_t * this,struct list_node_t * where)538 static page_word_t * page_word_split_at(
539 page_word_t * this,
540 struct list_node_t * where
541 )
542 {
543 page_word_t * righthalf;
544 list_node_t * alpha;
545
546 /* No splitting with one half empty, please. */
547 assert(where != this->alpha);
548 assert(where != this->omega->next);
549
550 /* create right half */
551 righthalf = xmalloc(sizeof(page_word_t));
552 page_word_init(righthalf, where, this->omega, this->trailing_spaces);
553
554 /* reinit left half (ourselves) */
555 alpha = this->alpha;
556 page_word_done(this);
557 page_word_init(this, alpha, where->prev, 0);
558
559 return righthalf;
560 }
561
562
box_sloppy_x_cmp(const struct box_t * p1,const struct box_t * p2)563 sint32 box_sloppy_x_cmp(const struct box_t * p1, const struct box_t * p2)
564 {
565 sint32 m1, m2; /* midpoints */
566
567 m1 = p1->x + p1->width / 2;
568 m2 = p2->x + p2->width / 2;
569
570 /* If the x midpoint of one box is in the x interval of the other,
571 * the two are considered close together (and compare "equal").
572 * Otherwise, the x midpoints are compared.
573 */
574 if(p1->x <= m2 && m2 <= p1->x + p1->width) return 0;
575 if(p2->x <= m1 && m1 <= p2->x + p2->width) return 0;
576 return m1 - m2;
577 }
578
579
580 /*************** page layout passes ********************************/
page_collect_statistics(void)581 void page_collect_statistics(void)
582 {
583 struct list_node_t * p;
584 sint32 prev_y;
585
586 pmesg(80, "BEGIN page_collect_statistics\n");
587
588 left_margin = SINT32_MAX;
589 right_margin = SINT32_MIN;
590 top_margin = SINT32_MAX;
591 bottom_margin = SINT32_MIN;
592
593 average_box_width = 0;
594 average_box_totheight = 0;
595 linecount = 0;
596 boxcount = 0;
597
598 prev_y = list_head->b.y;
599 for (p = list_head; p != 0; p = p->next) {
600 sint32 x, y, width, height, depth;
601
602 x = p->b.x;
603 y = p->b.y;
604 width = p->b.width;
605 height = p->b.height;
606 depth = p->b.depth;
607
608 left_margin = min(left_margin, x);
609 right_margin = max(right_margin, x + width);
610 top_margin = min(top_margin, y - height);
611 bottom_margin = max(bottom_margin, y + depth);
612
613 if (y > prev_y) {
614 prev_y = y;
615 linecount++;
616 }
617
618 average_box_width += width;
619 average_box_totheight += (height + depth);
620 ++boxcount;
621 }
622 ++linecount; /* The last line hasn't been counted */
623 average_box_width = average_box_width / boxcount;
624 average_box_totheight = average_box_totheight / boxcount;
625
626 pmesg(80, "left margin: %ld, right margin: %ld.\n", \
627 left_margin, right_margin);
628 pmesg(80, "top margin: %ld, bottom margin: %ld.\n", \
629 top_margin, bottom_margin);
630
631 pmesg(80, "END page_collect_statistics\n");
632 }
633
634
page_cut_words(void)635 void page_cut_words(void)
636 {
637 struct list_node_t * p;
638
639 int bop, eop, bol, eol, bow, eow;
640 /* {begin,end} of {page, line, word} flags */
641
642 pmesg(50, "BEGIN page_cut_words\n");
643
644 bop = 1;
645 bol = 1;
646 bow = 1;
647 for (p = list_head; p != NULL; p = p->next) {
648 sint32 y;
649 list_node_t * curr_alpha;
650
651 pmesg(80, "node: X = %li, Y = %li, glyph = %li (%c), font = %li\n",
652 p->b.x, p->b.y, p->b.glyph, (unsigned char) p->b.glyph, p->b.font);
653
654 y = p->b.y;
655 eop = (p->next == NULL);
656 eol = eop || (p->next->b.y != y);
657 eow = eol || (decide_space_count(p, p->next) > 0);
658
659 if(bow) {
660 curr_alpha = p;
661 }
662
663 if(eow) {
664 int trailing_space;
665 page_word_t * new_word;
666
667 if(eol) {
668 trailing_space = 0;
669 }
670 else {
671 trailing_space = 1;
672 }
673
674 /* Make up a page_word_t from the collected data and insert
675 * it into the page_words list
676 */
677 new_word = xmalloc(sizeof(page_word_t));
678 page_word_init(new_word, curr_alpha, p, trailing_space);
679 vlist_push_back(&page_words, new_word);
680 }
681
682 if(eol) {
683 pmesg(80, "end of line\n");
684 }
685
686 bop = 0;
687 bol = eol;
688 bow = eow;
689
690 } /* for p */
691
692 pmesg(50, "END page_cut_words\n");
693
694 }
695
696
page_check_word_collisions(void)697 static void page_check_word_collisions(void)
698 {
699 vitor_t vi, nvi, vj, llvi;
700 page_word_t * p, * np, * q;
701 int eop, eol;
702
703 pmesg(50, "BEGIN page_check_word_collisions\n");
704
705 llvi = vlist_rend(&page_words);
706
707 for(
708 vi = vlist_begin(&page_words);
709 vi != vlist_end(&page_words);
710 vi = nvi
711 ) {
712 p = vi->data;
713 nvi = vi->next;
714 np = nvi->data;
715
716 eop = (nvi == vlist_end(&page_words));
717 eol = eop || np->brect.ymax != p->brect.ymax;
718
719 /* We only need to look upwards, a collision will then always be caught
720 * by the lower one of the colliding words.
721 * Collisions between words on the same baseline cannot occur
722 * (by design of the basic word break algorithm) and shouldn't
723 * result in new word breaks anyway.
724 *
725 * We keep an iterator to the last word in the previous line (upwards)
726 * to avoid repeated skipping backwards over preceding words
727 * on the same baseline.
728 */
729 for(vj = llvi; vj != vlist_rend(&page_words); vj = vj->prev) {
730 q = vj->data;
731 if(q->brect.ymax < p->brect.ymin) break;
732 /* We are too far up to find any more collisions. Note
733 * this happens _immediately_ in the standard (one column
734 * text without intersecting lines) case.
735 */
736 if(!intervals_intersect(
737 p->brect.xmin, p->brect.xmax,
738 q->brect.xmin, q->brect.xmax
739 )) continue;
740 page_words_collide(vi, vj);
741 /* We do _not_ break if q->brect.xmax < p->brect.xmin
742 * because there may be more than one line with
743 * possible collisions.
744 */
745 }
746
747 if(eol) llvi = vi;
748 }
749
750 pmesg(50, "END page_check_word_collisions\n");
751 }
752
753
page_find_ceilings(void)754 static void page_find_ceilings(void)
755 {
756 vitor_t vi, nvi, vj, llvi;
757 page_word_t * p, * np, * q;
758 int eop, eol;
759
760 pmesg(50, "BEGIN page_find_ceilings\n");
761
762 llvi = vlist_rend(&page_words);
763
764 /* Our policy is the following:
765 * a. Stuff that has its baseline above our head (read p->brect.ymin) must
766 * end up on a higher line.
767 * b. Stuff that is near us in x direction and has its baseline above
768 * ours must end up on a higher line.
769 */
770
771 for(
772 vi = vlist_begin(&page_words);
773 vi != vlist_end(&page_words);
774 vi = nvi
775 ) {
776 sint32 xmin, xmax; /* the x interval we're interested in */
777 sint32 ymin; /* we need not care for stuff higher that that */
778
779 p = vi->data;
780 nvi = vi->next;
781 np = nvi->data;
782
783 xmin = p->brect.xmin - 2 * average_box_width;
784 xmax = p->brect.xmax + 2 * average_box_width;
785 ymin = p->brect.ymax - max_row_height;
786
787 eop = (nvi == vlist_end(&page_words));
788 eol = eop || np->brect.ymax != p->brect.ymax;
789
790 /* We only need to look upwards, and no further than max_row_hight
791 *
792 * We keep an iterator to the last word in the previous line (upwards)
793 * to avoid repeated skipping backwards over preceding words
794 * on the same baseline.
795 * Since the list is traversed bottom-to-top, we can stop looking
796 * as soon as we've found the first ceiling candidate.
797 */
798 for(vj = llvi; vj != vlist_rend(&page_words); vj = vj->prev) {
799 q = vj->data;
800 if(q->brect.ymax < ymin) {
801 /* stuff so high will end up on another line automatically */
802 break;
803 }
804 if(q->brect.ymax < p->brect.ymin) {
805 p->ceiling = q->brect.ymax;
806 break;
807 }
808 if(intervals_intersect(
809 xmin, xmax,
810 q->brect.xmin, q->brect.xmax
811 )) {
812 p->ceiling = q->brect.ymax;
813 break;
814 }
815 }
816
817 if(eol) llvi = vi;
818 }
819
820 pmesg(50, "END page_find_ceilings\n");
821 }
822
823
page_find_right_corridors(void)824 static void page_find_right_corridors(void)
825 {
826 vitor_t vi, nvi, vj, llvi;
827 page_word_t * p, * np, * q;
828 int eop, eol;
829
830 pmesg(50, "BEGIN page_find_right_corridors\n");
831
832 llvi = vlist_rend(&page_words);
833
834 for(
835 vi = vlist_begin(&page_words);
836 vi != vlist_end(&page_words);
837 vi = nvi
838 ) {
839 p = vi->data;
840 nvi = vi->next;
841 np = nvi->data;
842
843 eop = (nvi == vlist_end(&page_words));
844 eol = eop || np->brect.ymax != p->brect.ymax;
845
846 /* first we look directly right */
847 if(!eol) {
848 p->right_corridor = min(p->right_corridor, np->brect.xmin);
849 }
850
851 /* Now we look upwards (left and right).
852 *
853 * We keep an iterator to the last word in the previous line (upwards)
854 * to avoid repeated skipping backwards over preceding words
855 * on the same baseline.
856 */
857 for(vj = llvi; vj != vlist_rend(&page_words); vj = vj->prev) {
858 q = vj->data;
859 if(q->row < p->row) break;
860 /* We need not look further up */
861 if(q->brect.xmin < p->brect.xmin) {
862 q->right_corridor = min(q->right_corridor, p->brect.xmin);
863 }
864 else if(p->brect.xmin < q->brect.xmin) {
865 p->right_corridor = min(p->right_corridor, q->brect.xmin);
866 }
867 }
868
869 if(eol) llvi = vi;
870 }
871
872 pmesg(50, "END page_find_right_corridors\n");
873 }
874
875
page_find_right_walls(void)876 static void page_find_right_walls(void)
877 {
878 vitor_t vi, nvi, vj, llvi;
879 page_word_t * p, * np, * q;
880 int eop, eol;
881
882 pmesg(50, "BEGIN page_find_right_walls\n");
883
884 llvi = vlist_rend(&page_words);
885
886 for(
887 vi = vlist_begin(&page_words);
888 vi != vlist_end(&page_words);
889 vi = nvi
890 ) {
891 p = vi->data;
892 nvi = vi->next;
893 np = nvi->data;
894
895 eop = (nvi == vlist_end(&page_words));
896 eol = eop || np->brect.ymax != p->brect.ymax;
897
898 /* This has to be done somewhere. The most natural place is here. */
899 p->right_wall = min(p->right_wall, p->right_corridor);
900
901 /* We need not look directly right, we've already done so to
902 * find right_corridor. But if we are at the end of the line,
903 * we want stuff on other lines starting clearly right from where we
904 * end to appear right of us.
905 */
906 if(eol) {
907 p->right_wall = min(
908 p->right_wall,
909 p->brect.xmax + average_box_width
910 );
911 /* FIXME: is average_box_width the right amount of slack? Or
912 * use no slack at all ?
913 */
914 }
915
916 /* Now we look upwards (left and right).
917 *
918 * We keep an iterator to the last word in the previous line (upwards)
919 * to avoid repeated skipping backwards over preceding words
920 * on the same baseline.
921 */
922 for(vj = llvi; vj != vlist_rend(&page_words); vj = vj->prev) {
923 q = vj->data;
924 if(q->brect.ymax < p->brect.ymin) break;
925 /* We need not look further up */
926 if(box_sloppy_x_cmp(&(q->omega->b), &(p->alpha->b)) < 0) {
927 /* *q ends left from *p */
928 q->right_wall = min(q->right_wall, p->brect.xmin);
929 }
930 else if(box_sloppy_x_cmp(&(p->omega->b), &(q->alpha->b)) < 0) {
931 /* *q starts right from *p */
932 p->right_wall = min(p->right_wall, q->brect.xmin);
933 }
934 /* FIXME: what about crashing words? Should we do something here?
935 * And is box_sloppy_x_cmp() really the right criterion?
936 */
937 }
938
939 if(eol) llvi = vi;
940 }
941
942 pmesg(50, "END page_find_right_walls\n");
943 }
944
945
page_determine_lines(void)946 static void page_determine_lines(void)
947 {
948 vitor_t vi;
949 page_word_t * p;
950
951 pmesg(50, "BEGIN page_determine_lines\n");
952
953 for(
954 vi = vlist_begin(&page_words);
955 vi != vlist_end(&page_words);
956 vi = vi->next
957 ) {
958 p = vi->data;
959 pmesg(
960 100,
961 "x = %ld, y = %ld, ceiling = %ld\n",
962 p->brect.xmin,
963 p->brect.ymax,
964 p->ceiling
965 );
966
967 if(p->ceiling >= top_margin) {
968 scdf_force_min_integral(
969 &line_density,
970 p->ceiling,
971 p->brect.ymax,
972 1
973 );
974 }
975 }
976
977 scdf_normalize(&line_density);
978 y2line = scdf_floor_of_integral(&line_density);
979
980 for(
981 vi = vlist_begin(&page_words);
982 vi != vlist_end(&page_words);
983 vi = vi->next
984 ) {
985 p = vi->data;
986 p->row = (sint32) scdf_eval(y2line, p->brect.ymax);
987 pmesg(100, "word at row %i\n", p->row);
988 }
989
990 pmesg(50, "END page_determine_lines\n");
991 }
992
993
page_determine_cols(void)994 static void page_determine_cols(void)
995 {
996 vitor_t vi, vj;
997 page_word_t * p;
998 int eop, eol;
999
1000 pmesg(50, "BEGIN page_determine_cols\n");
1001
1002 for(
1003 vi = vlist_begin(&page_words);
1004 vi != vlist_end(&page_words);
1005 vi = vj
1006 ) {
1007 p = vi->data;
1008 pmesg(
1009 100,
1010 "x = %ld, y = %ld, right_wall = %ld, right_corridor = %ld\n",
1011 p->brect.xmin,
1012 p->brect.ymax,
1013 p->right_wall,
1014 p->right_corridor
1015 );
1016 assert(p->right_wall <= p->right_corridor);
1017 assert(p->right_corridor <= right_margin);
1018
1019 if(p->right_wall < p->right_corridor) {
1020 scdf_force_min_integral(
1021 &col_density,
1022 p->brect.xmin,
1023 p->right_wall,
1024 p->output_width
1025 );
1026 }
1027 if(p->right_wall == p->right_corridor || p->trailing_spaces > 0) {
1028 scdf_force_min_integral(
1029 &col_density,
1030 p->brect.xmin,
1031 p->right_corridor,
1032 p->output_width + p->trailing_spaces
1033 );
1034 }
1035
1036 vj = vi->next;
1037 eop = (vj == vlist_end(&page_words));
1038 eol = eop || (p->brect.ymax != vitor2ptr(vj, page_word_t)->brect.ymax);
1039 if(eol) scdf_normalize(&col_density);
1040 }
1041
1042 x2col = scdf_floor_of_integral(&col_density);
1043
1044 for(
1045 vi = vlist_begin(&page_words);
1046 vi != vlist_end(&page_words);
1047 vi = vi->next
1048 ) {
1049 p = vi->data;
1050 p->column = (sint32) scdf_eval(x2col, p->brect.xmin);
1051 pmesg(100, "word at column %i\n", p->column);
1052 }
1053
1054 pmesg(50, "END page_determine_cols\n");
1055 }
1056
1057
1058 /***************** rect_t method implementations **********************/
intervals_intersect(sint32 a,sint32 b,sint32 c,sint32 d)1059 int intervals_intersect(
1060 sint32 a, sint32 b,
1061 sint32 c, sint32 d
1062 )
1063 {
1064 if(a > b || c > d) return 0; /* One of the intervals is empty */
1065 return c <= b && a <= d; /* yes, this works in ALL remaining cases :) */
1066 }
1067
rect_intersects(const rect_t * this,const rect_t * that)1068 int rect_intersects(const rect_t * this, const rect_t * that)
1069 {
1070 return (
1071 intervals_intersect(this->xmin, this->xmax, that->xmin, that->xmax)
1072 && intervals_intersect(this->ymin, this->ymax, that->ymin, that->ymax)
1073 );
1074 }
1075
rect_contains_point(const rect_t * this,sint32 x,sint32 y)1076 int rect_contains_point(const rect_t * this, sint32 x, sint32 y)
1077 {
1078 return (
1079 this->xmin <= x && x <= this->xmax
1080 && this->ymin <= y && y <= this->ymax
1081 );
1082 }
1083
rect_set(rect_t * this,sint32 xmin,sint32 xmax,sint32 ymin,sint32 ymax)1084 void rect_set(
1085 rect_t * this,
1086 sint32 xmin,
1087 sint32 xmax,
1088 sint32 ymin,
1089 sint32 ymax
1090 )
1091 {
1092 this->xmin = xmin;
1093 this->xmax = xmax;
1094 this->ymin = ymin;
1095 this->ymax = ymax;
1096 }
1097
rect_set_empty(rect_t * this)1098 void rect_set_empty(rect_t * this)
1099 {
1100 rect_set(this, 1, 0, 1, 0);
1101 }
1102
rect_is_empty(const rect_t * this)1103 int rect_is_empty(const rect_t * this)
1104 {
1105 return this->xmin > this->xmax || this->ymin > this->ymax;
1106 }
1107
rect_union_with(rect_t * this,const rect_t * that)1108 void rect_union_with(rect_t * this, const rect_t * that)
1109 {
1110 if(rect_is_empty(that)) return;
1111
1112 if(rect_is_empty(this)) *this = *that;
1113 else {
1114 this->xmin = min(this->xmin, that->xmin);
1115 this->xmax = max(this->xmax, that->xmax);
1116 this->ymin = min(this->ymin, that->ymin);
1117 this->ymax = max(this->ymax, that->ymax);
1118 }
1119 }
1120