1 /* catdvi - get text from DVI files
2    Copyright (C) 1999 Antti-Juhani Kaijanaho <gaia@iki.fi>
3    Copyright (C) 2000-02 Bjoern Brill <brill@fs.math.uni-frankfurt.de>
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 2 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software
17    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19 
20 #include <stdlib.h>
21 #include <assert.h>
22 #include "layout.h"
23 #include "util.h"
24 #include "page2.h"
25 #include "outenc.h"
26 #include "linebuf.h"
27 #include "density.h"
28 #include "canvas.h"
29 #include "vlist.h"
30 #include "fontinfo.h"
31 
32 /* page margins */
33 static sint32 left_margin, right_margin;
34 static sint32 top_margin, bottom_margin;
35 
36 /* other page metrics */
37 int linecount = 0;
38 uint32 boxcount = 0;
39 double average_box_width = 0;
40 double average_box_totheight = 0;
41 
42 /* layout parameters */
43 sint32 max_col_width, max_row_height;
44 double min_col_density, min_line_density;
45 scdf_t col_density, line_density;
46 scdf_t * x2col, * y2line;
47 
48 
49 /* some obvious stuff about rectangles */
50 typedef struct rect_t rect_t;
51 struct rect_t {
52     sint32 xmin;
53     sint32 xmax;
54     sint32 ymin;
55     sint32 ymax;
56 };
57 
58 int intervals_intersect(
59     sint32 a, sint32 b,
60     sint32 c, sint32 d
61 );
62 
63 int rect_intersects(const rect_t * this, const rect_t * that);
64 
65 int rect_contains_point(const rect_t * this, sint32 x, sint32 y);
66 
67 /* create rect */
68 void rect_set(
69     rect_t * this,
70     sint32 xmin,
71     sint32 xmax,
72     sint32 ymin,
73     sint32 ymax
74 );
75 
76 void rect_set_empty(rect_t * this);
77 
78 int rect_is_empty(const rect_t * this);
79 
80 /* *this = smallest rect containing *this and *that */
81 void rect_union_with(rect_t * this, const rect_t * that);
82 
83 
84 /* a page_word is a sequence of contiguous box_t which should remain
85  * contiguous in printout.
86  */
87 typedef struct page_word_t page_word_t;
88 struct page_word_t {
89     list_node_t * alpha;    /* node of first box in word */
90     list_node_t * omega;    /* node of last box in word */
91     linebuf_t * text;
92     rect_t brect;   	    /* bounding rectangle: the smallest rectangle
93     	    	    	     * containing the part _avove the baseline_
94 			     * of all boxes in the word
95 			     */
96     sint32 ceiling; 	    /* stuff with y <= ceiling should appear top of
97     	    	    	     * our row in printout.
98 			     */
99     sint32 right_wall;	    /* stuff with x >= right_wall should appear right
100     	    	    	     * from our rightmost glyph in printout.
101 			     */
102     sint32 right_corridor;  /* stuff on the same row with x >= right_corridor
103     	    	    	     * should appear trailing_spaces columns right from
104 			     * our rightmost glyph in printout.
105 			     */
106     int output_width;
107     int trailing_spaces;    /* usually 1, somtetimes 0 */
108     int row;
109     int column;
110 };
111 
112 /* The page has one global list of page_words */
113 vlist_t page_words;
114 
115 /* constructor */
116 static void page_word_init(
117     page_word_t * this,
118     list_node_t * alpha,
119     list_node_t * omega,
120     int trailing_spaces
121 );
122 
123 /* destructor */
124 static void page_word_done(page_word_t * this);
125 
126 /* Try to resolve collisions (i.e. situations where the bounding rects of
127  * two words intersect).
128  */
129 static void page_words_collide(vitor_t vi, vitor_t vj);
130 
131 /* Split *this before where; i.e. :
132  *   - create a new page_word containing the boxes from where to this->omega
133  *     on the heap ("right half")
134  *   - shrink *this to the boxes from this->alpha to where->prev ("left half")
135  *   - return pointer to new page_word
136  * Both the left and right half are brought into shape by constructor calls,
137  * so any information about ceilings, right walls, etc. gets lost.
138  */
139 static page_word_t * page_word_split_at(
140     page_word_t * this,
141     struct list_node_t * where
142 );
143 
144 
145 /* Estimate number of spaces from distance between boxes */
146 static int decide_space_count(
147     struct list_node_t * prev_p,
148     struct list_node_t * p
149 );
150 
151 /* Look at the midpoints of two boxes. If either midpoint lies in the other
152  * box, the boxes compare "equal". Otherwise, we compare x of midpoints.
153  */
154 sint32 box_sloppy_x_cmp(const struct box_t * p1, const struct box_t * p2);
155 
156 
157 static void page_collect_statistics(void);
158 
159 /* Make an initial pass at cutting the page into words, based on the distance
160  * between adjacent boxes. This will be refined in a later pass.
161  */
162 static void page_cut_words(void);
163 
164 
165 /* If the bounding rectangles of two words on the page collide, we have
166  * likely hit a word breaking problem (example: math subscripts are
167  * too thin to cause a word break between adjacent variables). If a
168  * collission occurs, we call a function that tries to resolve it
169  * by introducing additional word breaks.
170  */
171 static void page_check_word_collisions(void);
172 
173 /* The idea: words with x coordinates close to each other and different
174  * y coordinates should appear on different rows in printout.
175  */
176 static void page_find_ceilings(void);
177 
178 /* If two words end up on the same line in printout, then the left one
179  * needs to put its trailing_spaces before the right one starts.
180  * Obviously, this has to be called _after_ we know our rows.
181  */
182 static void page_find_right_corridors(void);
183 
184 /* If the y intervals of the bounding rectangles of two words intersect,
185  * then the left one should end in printout before the right one starts.
186  * This should be called _after_ page_find_right_corridors() because we want
187  * to ensure right_wall <= right_corridor .
188  */
189 static void page_find_right_walls(void);
190 
191 /* Determine on which row in printout words end up. The main vehicle here
192  * is a line density function; we force it to have integral >=1 on the
193  * interval [word->ceiling, word->brect->ymax].
194  * The words are then positioned so that their baseline is in the correct row.
195  */
196 static void page_determine_lines(void);
197 
198 /* Determine on which column in printout words end up. The main vehicle here
199  * is a column density function; we force it to have large enough integrals
200  * on the intervals [word->brect->xmin, word->right_wall] resp.
201  * [word->brect->xmin, word->right_wall] to accomodate word->output_width
202  * resp. word->output_width + word->trailing_spaces columns.
203  * The words are then positioned so that their left boundary is in the correct
204  * column.
205  */
206 static void page_determine_cols(void);
207 
208 
page_print_formatted(void)209 void page_print_formatted(void)
210 {
211     canv_t canvas;
212     vitor_t pwi;
213 
214     pmesg(50, "BEGIN page_print_formatted\n");
215 
216     /* Life is much easier if we don't have to care for the case
217      * of an empty page all the time.
218      */
219     if(list_head == NULL) {
220 	puts("\f\n");
221 	pmesg(80, "This page is empty.\n");
222 	pmesg(50, "END page_print_formatted\n");
223 	return;
224     }
225 
226     page_adjust_diacritics();
227     page_adjust_texmext();
228     page_adjust_radicals();
229 
230     page_collect_statistics();
231 
232     max_col_width = (sint32) (4.0 * average_box_width);
233     max_row_height = (sint32) (3.0 * average_box_totheight);
234     min_col_density = 1.0 / max_col_width;
235     min_line_density = 1.0 / max_row_height;
236     /*
237      * The minimal densities should ensure that large amounts of white space
238      * do not completely get lost. The factors 3.0, 4.0 have been determined
239      * by rules of thumb and experiments.
240      * The (unverified) expectation is that a printout column will be
241      * about average_box_width DVI units wide and a printout row will be
242      * about 1.5 * average_box_totheight DVI units high.
243      */
244     scdf_init(&col_density, left_margin, right_margin, min_col_density);
245 	/* col_density measures the column density at every point between
246 	 * left and right margin which is required to do proper positioning
247 	 * of every word.
248 	 */
249     scdf_init(&line_density, top_margin, bottom_margin, min_line_density);
250 	/* Similar for line_density and lines */
251 
252     vlist_init(&page_words);
253     page_cut_words();
254     page_check_word_collisions();
255 
256     page_find_ceilings();
257     page_determine_lines();
258 
259     page_find_right_corridors();
260     page_find_right_walls();
261     page_determine_cols();
262 
263     /* Now print out everything */
264     canv_init(
265     	&canvas,
266 	(sint32) scdf_eval(x2col, right_margin) + 1,
267 	(sint32) scdf_eval(y2line, bottom_margin) + 1
268     );
269     for(
270     	pwi = vlist_begin(&page_words);
271 	pwi != vlist_end(&page_words);
272 	pwi = pwi->next
273     ) {
274 	page_word_t * pw;
275 
276 	pw = vitor2ptr(pwi, page_word_t);
277 	canv_put_linebuf(
278 	    &canvas,
279 	    pw->text,
280 	    pw->column,
281 	    pw->row,
282 	    pw->output_width
283 	);
284 	pw->text = NULL;    /* text is now owned by the canvas */
285     }
286     canv_write(&canvas, stdout);
287     puts("\f\n");
288 
289     /* clean up */
290 
291     canv_done(&canvas);
292 
293     for(
294     	pwi = vlist_begin(&page_words);
295 	pwi != vlist_end(&page_words);
296 	pwi = pwi->next
297     ) {
298     	page_word_done(pwi->data);
299 	free(pwi->data);
300     }
301     vlist_done(&page_words);
302 
303     scdf_done(y2line); free(y2line);
304     scdf_done(x2col); free(x2col);
305     scdf_done(&col_density);
306     scdf_done(&line_density);
307 
308     pmesg(50, "END page_print_formatted\n");
309 }
310 
311 
page_print_sequential(void)312 void page_print_sequential(void)
313 {
314     struct list_node_t * p;
315     struct box_t prev_box, curr_box;
316 
317     sint32 delta_x = 0, delta_y = 0;
318     sint32 dist_x = 0;
319     sint32 epsilon_x = 0, epsilon_y = 0;
320 
321     /* begin-of-page flag */
322     int bop = 1;
323     /* begin-of-line flag */
324     int bol = 1;
325 
326     int spaces, spaces_by_prev, spaces_by_curr;
327     struct linebuf_t lb;
328 
329     pmesg(50, "BEGIN page_print_sequential\n");
330 
331     page_adjust_diacritics();
332 
333     linebuf_init(&lb, 0);
334     prev_box.width = 1;
335     prev_box.height = 1;
336 
337     for (p = list_head; p != 0; p = p->next) {
338 
339         pmesg(80, "node: X = %li, Y = %li, glyph = %li (%c), font = %li\n",
340               p->b.x, p->b.y, p->b.glyph, (unsigned char) p->b.glyph, p->b.font);
341 
342     	curr_box = p->b;
343         if (curr_box.width == 0) curr_box.width = prev_box.width;
344         if (curr_box.height == 0) curr_box.height = prev_box.height;
345 
346 	if (!bop) {
347     	    delta_x = curr_box.x - prev_box.x;
348 	    delta_y = curr_box.y - prev_box.y;
349 	    dist_x = delta_x - prev_box.width;
350 
351     	    /* (arithmetic mean of current and previous) / 20 */
352 	    epsilon_x = (curr_box.width + prev_box.width + 39) / 40;
353 	    epsilon_y = (curr_box.height + prev_box.height + 39) / 40;
354 
355             /* check for new line */
356 	    if (
357 		/* new line */
358 		delta_y >= 22*epsilon_y ||
359 		/* new column */
360 		delta_y <= -60*epsilon_y ||
361 		/* weird step back */
362 		dist_x <= -100*epsilon_x
363 	    ) {
364         	pmesg(80, "end of line\n");
365 		outenc_write(stdout, &lb);
366 		linebuf_clear(&lb);
367 		puts("");
368 		bol = 1;
369             }
370 
371 	    /* check for word breaks, but not at the beginning of a line */
372 	    if ((dist_x > 0) && (!bol)) {
373 		spaces_by_prev =
374 		    font_w_to_space(prev_box.font, dist_x);
375 		spaces_by_curr =
376 		    font_w_to_space(curr_box.font, dist_x);
377     		spaces = (spaces_by_prev + spaces_by_curr + 1) / 2;
378 
379         	if (spaces > 0) {
380                     pmesg(80, "setting space\n");
381                     linebuf_putg(&lb, ' ');
382         	}
383 	    }
384 
385 	} /* end if (!bop) */
386 
387         linebuf_putg(&lb, curr_box.glyph);
388 	prev_box = curr_box;
389 	bop = 0;
390 	bol = 0;
391     }
392     /* flush line buffer and end page */
393     outenc_write(stdout, &lb);
394     puts("\n\f\n");
395 
396     linebuf_done(&lb);
397 
398     pmesg(50, "END page_print_sequential\n");
399 }
400 
decide_space_count(struct list_node_t * prev_p,struct list_node_t * p)401 static int decide_space_count(
402     struct list_node_t * prev_p,
403     struct list_node_t * p
404 )
405 {
406         font_t prev_font, curr_font;
407         sint32 prev_x, prev_width;
408         sint32 curr_x;
409         sint32 delta;
410         int (* const f2spc)(sint32, sint32) = font_w_to_space;
411 
412         prev_font = prev_p->b.font;
413         prev_x = prev_p->b.x;
414         prev_width = prev_p->b.width;
415 
416         curr_font = p->b.font;
417         curr_x = p->b.x;
418 
419         delta = curr_x - (prev_x + prev_width);
420         return (1 + f2spc(curr_font, delta) + f2spc(prev_font, delta)) / 2;
421 }
422 
423 
424 /*************** page_word_t method implementations ****************/
page_word_init(page_word_t * this,list_node_t * alpha,list_node_t * omega,int trailing_spaces)425 void page_word_init(
426     page_word_t * this,
427     list_node_t * alpha,
428     list_node_t * omega,
429     int trailing_spaces
430 )
431 {
432     list_node_t * p, * q;
433     rect_t r;
434 
435     this->alpha = alpha;
436     this->omega = omega;
437     this->text = xmalloc(sizeof(linebuf_t));
438     linebuf_init(this->text, 20);
439     	/* 20 seems a good tradeoff between memory usage and
440 	 * avoiding reallocs.
441 	 */
442     rect_set_empty(&(this->brect));
443     this->ceiling = SINT32_MIN;
444     	/* The top line has no ceiling -- it fits automatically */
445     this->right_wall = right_margin;
446     this->right_corridor = right_margin;
447     /* this->output_width will be set below */
448     this->trailing_spaces = trailing_spaces;
449     this->row = -1; 	/* will be filled in later */
450     this->column = -1;	/* will be filled in later */
451 
452     q = omega->next;
453     for(p = alpha; p != q; p = p->next) {
454     	/* add p->b to bounding rect */
455     	rect_set(
456 	    &r,
457 	    p->b.x,
458 	    p->b.x + p->b.width,
459 	    p->b.y - p->b.height,
460 	    p->b.y
461 	);
462 	rect_union_with(&(this->brect), &r);
463 
464     	/* add p->b.glyph to text */
465     	linebuf_putg(this->text, p->b.glyph);
466     }
467 
468     this->output_width = outenc_get_width(this->text);
469 }
470 
471 
page_word_done(page_word_t * this)472 void page_word_done(page_word_t * this)
473 {
474     this->alpha = NULL;
475     this->omega = NULL;
476     /* this->text will be NULL after it has been passed to canvas */
477     if(this->text != NULL) {
478 	linebuf_done(this->text);
479 	free(this->text);
480 	this->text = NULL;
481     }
482 }
483 
page_words_collide(vitor_t vi,vitor_t vj)484 static void page_words_collide(vitor_t vi, vitor_t vj)
485 {
486     page_word_t * wp, * wq;
487     list_node_t * bp, * bq;
488     sint32 c;
489 
490     pmesg(60, "BEGIN page_words_collide\n");
491 
492     wp = vi->data;
493     wq = vj->data;
494 
495     bp = wp->alpha;
496     bq = wq->alpha;
497     if(box_sloppy_x_cmp(&bp->b, &bq->b) > 0) {
498     	/* We want to assume that *wp starts left from *wq */
499 	pmesg(60, "calling myself with exchanged arguments\n");
500     	page_words_collide(vj, vi);
501     	pmesg(60, "END page_words_collide\n");
502 	return;
503     }
504 
505     /* Skip all letters of *wp left of *wq. We allow a certain margin overlap
506      * to accomodate italics correction, kerning, etc.
507      */
508     while(c = box_sloppy_x_cmp(&bp->b, &bq->b), c < 0) {
509     	if(bp == wp->omega) {
510 	    /* We're at the end of *wp, so either really no collision
511 	     * at all, or only the last box collides. Anyway, nothing
512 	     * to do.
513 	     */
514 	    pmesg(60, "nothing to do\n");
515     	    pmesg(60, "END page_words_collide\n");
516 	    return;
517 	}
518     	bp = bp->next;
519     }
520 
521     if(c == 0) {
522 	/* Ouch, full crash */
523 	pmesg(60, "unresolvable collision\n");
524     	pmesg(60, "END page_words_collide\n");
525 	return;
526     }
527 
528     /* We've moved bp across the beginning of *wq */
529     pmesg(60, "resolving collision by splitting word\n");
530     vlist_insert_after(&page_words, vi, page_word_split_at(wp, bp));
531 
532     /* Check for further collisions between *wq and the right piece of *wp */
533     pmesg(60, "checking for further collisions\n");
534     page_words_collide(vj, vi->next);
535     pmesg(60, "END page_words_collide\n");
536 }
537 
page_word_split_at(page_word_t * this,struct list_node_t * where)538 static page_word_t * page_word_split_at(
539     page_word_t * this,
540     struct list_node_t * where
541 )
542 {
543     page_word_t * righthalf;
544     list_node_t * alpha;
545 
546     /* No splitting with one half empty, please. */
547     assert(where != this->alpha);
548     assert(where != this->omega->next);
549 
550     /* create right half */
551     righthalf = xmalloc(sizeof(page_word_t));
552     page_word_init(righthalf, where, this->omega, this->trailing_spaces);
553 
554     /* reinit left half (ourselves) */
555     alpha = this->alpha;
556     page_word_done(this);
557     page_word_init(this, alpha, where->prev, 0);
558 
559     return righthalf;
560 }
561 
562 
box_sloppy_x_cmp(const struct box_t * p1,const struct box_t * p2)563 sint32 box_sloppy_x_cmp(const struct box_t * p1, const struct box_t * p2)
564 {
565     sint32 m1, m2;  /* midpoints */
566 
567     m1 = p1->x + p1->width / 2;
568     m2 = p2->x + p2->width / 2;
569 
570     /* If the x midpoint of one box is in the x interval of the other,
571      * the two are considered close together (and compare "equal").
572      * Otherwise, the x midpoints are compared.
573      */
574     if(p1->x <= m2 && m2 <= p1->x + p1->width) return 0;
575     if(p2->x <= m1 && m1 <= p2->x + p2->width) return 0;
576     return m1 - m2;
577 }
578 
579 
580 /*************** page layout passes ********************************/
page_collect_statistics(void)581 void page_collect_statistics(void)
582 {
583     struct list_node_t * p;
584     sint32 prev_y;
585 
586     pmesg(80, "BEGIN page_collect_statistics\n");
587 
588     left_margin = SINT32_MAX;
589     right_margin = SINT32_MIN;
590     top_margin = SINT32_MAX;
591     bottom_margin = SINT32_MIN;
592 
593     average_box_width = 0;
594     average_box_totheight = 0;
595     linecount = 0;
596     boxcount = 0;
597 
598     prev_y = list_head->b.y;
599     for (p = list_head; p != 0; p = p->next) {
600     	sint32 x, y, width, height, depth;
601 
602 	x = p->b.x;
603 	y = p->b.y;
604 	width = p->b.width;
605 	height = p->b.height;
606 	depth = p->b.depth;
607 
608 	left_margin = min(left_margin, x);
609 	right_margin = max(right_margin, x + width);
610 	top_margin = min(top_margin, y - height);
611 	bottom_margin = max(bottom_margin, y + depth);
612 
613         if (y > prev_y) {
614             prev_y = y;
615             linecount++;
616         }
617 
618 	average_box_width += width;
619 	average_box_totheight += (height + depth);
620 	++boxcount;
621     }
622     ++linecount; /* The last line hasn't been counted */
623     average_box_width = average_box_width / boxcount;
624     average_box_totheight = average_box_totheight / boxcount;
625 
626     pmesg(80, "left margin: %ld, right margin: %ld.\n", \
627 	left_margin, right_margin);
628     pmesg(80, "top margin: %ld, bottom margin: %ld.\n", \
629 	top_margin, bottom_margin);
630 
631     pmesg(80, "END page_collect_statistics\n");
632 }
633 
634 
page_cut_words(void)635 void page_cut_words(void)
636 {
637     struct list_node_t * p;
638 
639     int bop, eop, bol, eol, bow, eow;
640     	/* {begin,end} of {page, line, word} flags */
641 
642     pmesg(50, "BEGIN page_cut_words\n");
643 
644     bop = 1;
645     bol = 1;
646     bow = 1;
647     for (p = list_head; p != NULL; p = p->next) {
648     	sint32 y;
649 	list_node_t * curr_alpha;
650 
651         pmesg(80, "node: X = %li, Y = %li, glyph = %li (%c), font = %li\n",
652               p->b.x, p->b.y, p->b.glyph, (unsigned char) p->b.glyph, p->b.font);
653 
654     	y = p->b.y;
655     	eop = (p->next == NULL);
656 	eol = eop || (p->next->b.y != y);
657     	eow = eol || (decide_space_count(p, p->next) > 0);
658 
659     	if(bow) {
660     	    curr_alpha = p;
661 	}
662 
663 	if(eow) {
664 	    int trailing_space;
665 	    page_word_t * new_word;
666 
667 	    if(eol) {
668 		trailing_space = 0;
669 	    }
670 	    else {
671 		trailing_space = 1;
672 	    }
673 
674     	    /* Make up a page_word_t from the collected data and insert
675 	     * it into the page_words list
676 	     */
677 	    new_word = xmalloc(sizeof(page_word_t));
678 	    page_word_init(new_word, curr_alpha, p, trailing_space);
679 	    vlist_push_back(&page_words, new_word);
680 	}
681 
682     	if(eol) {
683             pmesg(80, "end of line\n");
684 	}
685 
686     	bop = 0;
687 	bol = eol;
688     	bow = eow;
689 
690     } /* for p */
691 
692     pmesg(50, "END page_cut_words\n");
693 
694 }
695 
696 
page_check_word_collisions(void)697 static void page_check_word_collisions(void)
698 {
699     vitor_t vi, nvi, vj, llvi;
700     page_word_t * p, * np, * q;
701     int eop, eol;
702 
703     pmesg(50, "BEGIN page_check_word_collisions\n");
704 
705     llvi = vlist_rend(&page_words);
706 
707     for(
708     	vi = vlist_begin(&page_words);
709 	vi != vlist_end(&page_words);
710 	vi = nvi
711     ) {
712 	p = vi->data;
713 	nvi = vi->next;
714 	np = nvi->data;
715 
716     	eop = (nvi == vlist_end(&page_words));
717 	eol = eop || np->brect.ymax != p->brect.ymax;
718 
719 	/* We only need to look upwards, a collision will then always be caught
720 	 * by the lower one of the colliding words.
721 	 * Collisions between words on the same baseline cannot occur
722 	 * (by design of the basic word break algorithm) and shouldn't
723 	 * result in new word breaks anyway.
724 	 *
725 	 * We keep an iterator to the last word in the previous line (upwards)
726 	 * to avoid repeated skipping backwards over preceding words
727 	 * on the same baseline.
728 	 */
729     	for(vj = llvi; vj != vlist_rend(&page_words); vj = vj->prev) {
730 	    q = vj->data;
731     	    if(q->brect.ymax < p->brect.ymin) break;
732 	    	/* We are too far up to find any more collisions. Note
733 		 * this happens _immediately_ in the standard (one column
734 		 * text without intersecting lines) case.
735 		 */
736 	    if(!intervals_intersect(
737 	    	p->brect.xmin, p->brect.xmax,
738 		q->brect.xmin, q->brect.xmax
739 	    )) continue;
740 	    page_words_collide(vi, vj);
741 	    	/* We do _not_ break if q->brect.xmax < p->brect.xmin
742 		 * because there may be more than one line with
743 		 * possible collisions.
744 		 */
745 	}
746 
747 	if(eol) llvi = vi;
748     }
749 
750     pmesg(50, "END page_check_word_collisions\n");
751 }
752 
753 
page_find_ceilings(void)754 static void page_find_ceilings(void)
755 {
756     vitor_t vi, nvi, vj, llvi;
757     page_word_t * p, * np, * q;
758     int eop, eol;
759 
760     pmesg(50, "BEGIN page_find_ceilings\n");
761 
762     llvi = vlist_rend(&page_words);
763 
764     /* Our policy is the following:
765      * a. Stuff that has its baseline above our head (read p->brect.ymin) must
766      *    end up on a higher line.
767      * b. Stuff that is near us in x direction and has its baseline above
768      *    ours must end up on a higher line.
769      */
770 
771     for(
772     	vi = vlist_begin(&page_words);
773 	vi != vlist_end(&page_words);
774 	vi = nvi
775     ) {
776     	sint32 xmin, xmax;  /* the x interval we're interested in */
777     	sint32 ymin;        /* we need not care for stuff higher that that */
778 
779 	p = vi->data;
780     	nvi = vi->next;
781 	np = nvi->data;
782 
783     	xmin = p->brect.xmin - 2 * average_box_width;
784     	xmax = p->brect.xmax + 2 * average_box_width;
785     	ymin = p->brect.ymax - max_row_height;
786 
787     	eop = (nvi == vlist_end(&page_words));
788 	eol = eop || np->brect.ymax != p->brect.ymax;
789 
790 	/* We only need to look upwards, and no further than max_row_hight
791 	 *
792 	 * We keep an iterator to the last word in the previous line (upwards)
793 	 * to avoid repeated skipping backwards over preceding words
794 	 * on the same baseline.
795 	 * Since the list is traversed bottom-to-top, we can stop looking
796 	 * as soon as we've found the first ceiling candidate.
797 	 */
798     	for(vj = llvi; vj != vlist_rend(&page_words); vj = vj->prev) {
799 	    q = vj->data;
800     	    if(q->brect.ymax < ymin) {
801 	    	/* stuff so high will end up on another line automatically */
802 	    	break;
803 	    }
804     	    if(q->brect.ymax < p->brect.ymin) {
805 	    	p->ceiling = q->brect.ymax;
806 	    	break;
807     	    }
808 	    if(intervals_intersect(
809 	    	xmin, xmax,
810 		q->brect.xmin, q->brect.xmax
811 	    )) {
812     	    	p->ceiling = q->brect.ymax;
813 		break;
814 	    }
815 	}
816 
817 	if(eol) llvi = vi;
818     }
819 
820     pmesg(50, "END page_find_ceilings\n");
821 }
822 
823 
page_find_right_corridors(void)824 static void page_find_right_corridors(void)
825 {
826     vitor_t vi, nvi, vj, llvi;
827     page_word_t * p, * np, * q;
828     int eop, eol;
829 
830     pmesg(50, "BEGIN page_find_right_corridors\n");
831 
832     llvi = vlist_rend(&page_words);
833 
834     for(
835     	vi = vlist_begin(&page_words);
836 	vi != vlist_end(&page_words);
837 	vi = nvi
838     ) {
839 	p = vi->data;
840     	nvi = vi->next;
841 	np = nvi->data;
842 
843     	eop = (nvi == vlist_end(&page_words));
844 	eol = eop || np->brect.ymax != p->brect.ymax;
845 
846     	/* first we look directly right */
847 	if(!eol) {
848 	    p->right_corridor = min(p->right_corridor, np->brect.xmin);
849 	}
850 
851 	/* Now we look upwards (left and right).
852 	 *
853 	 * We keep an iterator to the last word in the previous line (upwards)
854 	 * to avoid repeated skipping backwards over preceding words
855 	 * on the same baseline.
856 	 */
857     	for(vj = llvi; vj != vlist_rend(&page_words); vj = vj->prev) {
858 	    q = vj->data;
859     	    if(q->row < p->row) break;
860 	    	/* We need not look further up */
861 	    if(q->brect.xmin < p->brect.xmin) {
862 	    	q->right_corridor = min(q->right_corridor, p->brect.xmin);
863 	    }
864 	    else if(p->brect.xmin < q->brect.xmin) {
865 	    	p->right_corridor = min(p->right_corridor, q->brect.xmin);
866 	    }
867 	}
868 
869 	if(eol) llvi = vi;
870     }
871 
872     pmesg(50, "END page_find_right_corridors\n");
873 }
874 
875 
page_find_right_walls(void)876 static void page_find_right_walls(void)
877 {
878     vitor_t vi, nvi, vj, llvi;
879     page_word_t * p, * np, * q;
880     int eop, eol;
881 
882     pmesg(50, "BEGIN page_find_right_walls\n");
883 
884     llvi = vlist_rend(&page_words);
885 
886     for(
887     	vi = vlist_begin(&page_words);
888 	vi != vlist_end(&page_words);
889 	vi = nvi
890     ) {
891 	p = vi->data;
892     	nvi = vi->next;
893 	np = nvi->data;
894 
895     	eop = (nvi == vlist_end(&page_words));
896 	eol = eop || np->brect.ymax != p->brect.ymax;
897 
898     	/* This has to be done somewhere. The most natural place is here. */
899     	p->right_wall = min(p->right_wall, p->right_corridor);
900 
901     	/* We need not look directly right, we've already done so to
902 	 * find right_corridor. But if we are at the end of the line,
903 	 * we want stuff on other lines starting clearly right from where we
904 	 * end to appear right of us.
905 	 */
906     	if(eol) {
907 	    p->right_wall = min(
908 	    	p->right_wall,
909 		p->brect.xmax + average_box_width
910 	    );
911 	    /* FIXME: is average_box_width the right amount of slack? Or
912 	     * use no slack at all ?
913 	     */
914 	}
915 
916 	/* Now we look upwards (left and right).
917 	 *
918 	 * We keep an iterator to the last word in the previous line (upwards)
919 	 * to avoid repeated skipping backwards over preceding words
920 	 * on the same baseline.
921 	 */
922     	for(vj = llvi; vj != vlist_rend(&page_words); vj = vj->prev) {
923 	    q = vj->data;
924     	    if(q->brect.ymax < p->brect.ymin) break;
925 	    	/* We need not look further up */
926 	    if(box_sloppy_x_cmp(&(q->omega->b), &(p->alpha->b)) < 0) {
927     	    	/* *q ends left from *p */
928 	    	q->right_wall = min(q->right_wall, p->brect.xmin);
929 	    }
930 	    else if(box_sloppy_x_cmp(&(p->omega->b), &(q->alpha->b)) < 0) {
931     	    	/* *q starts right from *p */
932 	    	p->right_wall = min(p->right_wall, q->brect.xmin);
933 	    }
934 	    /* FIXME: what about crashing words? Should we do something here?
935 	     * And is box_sloppy_x_cmp() really the right criterion?
936 	     */
937 	}
938 
939 	if(eol) llvi = vi;
940     }
941 
942     pmesg(50, "END page_find_right_walls\n");
943 }
944 
945 
page_determine_lines(void)946 static void page_determine_lines(void)
947 {
948     vitor_t vi;
949     page_word_t * p;
950 
951     pmesg(50, "BEGIN page_determine_lines\n");
952 
953     for(
954     	vi = vlist_begin(&page_words);
955 	vi != vlist_end(&page_words);
956 	vi = vi->next
957     ) {
958     	p = vi->data;
959     	pmesg(
960 	    100,
961 	    "x = %ld, y = %ld, ceiling = %ld\n",
962 	    p->brect.xmin,
963 	    p->brect.ymax,
964 	    p->ceiling
965 	);
966 
967 	if(p->ceiling >= top_margin) {
968 	    scdf_force_min_integral(
969 	    	&line_density,
970 		p->ceiling,
971 		p->brect.ymax,
972 		1
973 	    );
974 	}
975     }
976 
977     scdf_normalize(&line_density);
978     y2line = scdf_floor_of_integral(&line_density);
979 
980     for(
981     	vi = vlist_begin(&page_words);
982 	vi != vlist_end(&page_words);
983 	vi = vi->next
984     ) {
985     	p = vi->data;
986 	p->row = (sint32) scdf_eval(y2line, p->brect.ymax);
987 	pmesg(100, "word at row %i\n", p->row);
988     }
989 
990     pmesg(50, "END page_determine_lines\n");
991 }
992 
993 
page_determine_cols(void)994 static void page_determine_cols(void)
995 {
996     vitor_t vi, vj;
997     page_word_t * p;
998     int eop, eol;
999 
1000     pmesg(50, "BEGIN page_determine_cols\n");
1001 
1002     for(
1003     	vi = vlist_begin(&page_words);
1004 	vi != vlist_end(&page_words);
1005 	vi = vj
1006     ) {
1007     	p = vi->data;
1008     	pmesg(
1009 	    100,
1010 	    "x = %ld, y = %ld, right_wall = %ld, right_corridor = %ld\n",
1011 	    p->brect.xmin,
1012 	    p->brect.ymax,
1013 	    p->right_wall,
1014 	    p->right_corridor
1015 	);
1016     	assert(p->right_wall <= p->right_corridor);
1017     	assert(p->right_corridor <= right_margin);
1018 
1019 	if(p->right_wall < p->right_corridor) {
1020 	    scdf_force_min_integral(
1021 		&col_density,
1022 		p->brect.xmin,
1023 		p->right_wall,
1024 		p->output_width
1025 	    );
1026 	}
1027 	if(p->right_wall == p->right_corridor || p->trailing_spaces > 0) {
1028     	    scdf_force_min_integral(
1029 		&col_density,
1030 		p->brect.xmin,
1031 		p->right_corridor,
1032 		p->output_width + p->trailing_spaces
1033 	    );
1034 	}
1035 
1036 	vj = vi->next;
1037 	eop = (vj == vlist_end(&page_words));
1038 	eol = eop || (p->brect.ymax != vitor2ptr(vj, page_word_t)->brect.ymax);
1039 	if(eol) scdf_normalize(&col_density);
1040     }
1041 
1042     x2col = scdf_floor_of_integral(&col_density);
1043 
1044     for(
1045     	vi = vlist_begin(&page_words);
1046 	vi != vlist_end(&page_words);
1047 	vi = vi->next
1048     ) {
1049     	p = vi->data;
1050 	p->column = (sint32) scdf_eval(x2col, p->brect.xmin);
1051 	pmesg(100, "word at column %i\n", p->column);
1052     }
1053 
1054     pmesg(50, "END page_determine_cols\n");
1055 }
1056 
1057 
1058 /***************** rect_t method implementations **********************/
intervals_intersect(sint32 a,sint32 b,sint32 c,sint32 d)1059 int intervals_intersect(
1060     sint32 a, sint32 b,
1061     sint32 c, sint32 d
1062 )
1063 {
1064     if(a > b || c > d) return 0; /* One of the intervals is empty */
1065     return c <= b && a <= d;	/* yes, this works in ALL remaining cases :) */
1066 }
1067 
rect_intersects(const rect_t * this,const rect_t * that)1068 int rect_intersects(const rect_t * this, const rect_t * that)
1069 {
1070     return (
1071     	intervals_intersect(this->xmin, this->xmax, that->xmin, that->xmax)
1072 	&& intervals_intersect(this->ymin, this->ymax, that->ymin, that->ymax)
1073     );
1074 }
1075 
rect_contains_point(const rect_t * this,sint32 x,sint32 y)1076 int rect_contains_point(const rect_t * this, sint32 x, sint32 y)
1077 {
1078     return (
1079     	this->xmin <= x && x <= this->xmax
1080 	&& this->ymin <= y && y <= this->ymax
1081     );
1082 }
1083 
rect_set(rect_t * this,sint32 xmin,sint32 xmax,sint32 ymin,sint32 ymax)1084 void rect_set(
1085     rect_t * this,
1086     sint32 xmin,
1087     sint32 xmax,
1088     sint32 ymin,
1089     sint32 ymax
1090 )
1091 {
1092     this->xmin = xmin;
1093     this->xmax = xmax;
1094     this->ymin = ymin;
1095     this->ymax = ymax;
1096 }
1097 
rect_set_empty(rect_t * this)1098 void rect_set_empty(rect_t * this)
1099 {
1100     rect_set(this, 1, 0, 1, 0);
1101 }
1102 
rect_is_empty(const rect_t * this)1103 int rect_is_empty(const rect_t * this)
1104 {
1105     return this->xmin > this->xmax || this->ymin > this->ymax;
1106 }
1107 
rect_union_with(rect_t * this,const rect_t * that)1108 void rect_union_with(rect_t * this, const rect_t * that)
1109 {
1110     if(rect_is_empty(that)) return;
1111 
1112     if(rect_is_empty(this)) *this = *that;
1113     else {
1114     	this->xmin = min(this->xmin, that->xmin);
1115     	this->xmax = max(this->xmax, that->xmax);
1116     	this->ymin = min(this->ymin, that->ymin);
1117     	this->ymax = max(this->ymax, that->ymax);
1118     }
1119 }
1120