1 /*
2 * tables.c:
3 * Table support for htmlise.
4 *
5 * The idea here is that we look for a paragraph which looks like a table
6 * heading followed by a number of paragraphs which look like table cells.
7 * We do not support sophisticated tables or physical mark-up like borders
8 * and alignment. Tables look like this:
9 *
10 * Heading 1 Heading 2 Heading 3 <-- heading
11 * ---------- ---------- ------------- <-- `table rule'
12 * Text in More text Third cell |
13 * cell in another |<-- first row
14 * cell |
15 *
16 * New row Next cell Bottom-right |
17 * in new row cell of |<-- second row
18 * table |
19 *
20 * The table is recognised because its first paragraph contains a line of
21 * dashes interleaved with spaces, the spaces are present on every line of the
22 * table, and no line in the paragraph is longer than the line of dashes. The
23 * text above the dashes is the table heading, and is marked up as <th></th>,
24 * and that below is just data, which lives in <td></td>. Further paragraphs
25 * which have lined-up spaces and consist only of lines of length equal to or
26 * shorter than the header rule length are also considered to be part of the
27 * table, with each paragraph being an individual row. The contents of each
28 * cell are then processed recursively by htmlise.
29 *
30 * Where there are several possible overlapping tables which could be
31 * constructed by the procedure, we pick the earliest-starting one.
32 *
33 * Extensions:
34 *
35 * - should also permit horizontal tables (with the <th></th> on the left)
36 *
37 * Copyright (c) 2003 Chris Lightfoot. All rights reserved.
38 * Email: chris@ex-parrot.com; WWW: http://www.ex-parrot.com/~chris/
39 *
40 */
41
42 static const char rcsid[] = "$Id: tables.c,v 1.4 2004/01/29 22:52:28 chris Exp $";
43
44 #include <assert.h>
45
46 #include <stdlib.h>
47 #include <string.h>
48
49 #include "htmlise.h"
50
51 struct table_layout {
52 /* ruleline is the line number of the table rule in the first paragraph of
53 * the table; ruleindent the indent of the table rule and width the line
54 * length of the table rule. ntablecolumns is the number of columns in the
55 * table. */
56 size_t ruleline, ruleindent, width, ntablecolumns;
57 /* is_gap consists of width flags each of which is true if the
58 * corresponding column must be whitespace in each line of the table. */
59 char *is_gap;
60 /* col and ncols store the starting column positions and character widths
61 * of the ntablecolumns individual table columns. */
62 size_t *col, *ncols;
63 };
64
65 /* paragraph_is_start_of_table PARAGRAPH LAYOUT
66 * If PARAGRAPH is a possible start-of-table, fill in LAYOUT with the layout of
67 * the table and return 1. Otherwise, return 0. The table rule is set to the
68 * first possible candidate rule. On successful return, LAYOUT->is_gap, col and
69 * ncols are allocated on the heap and must be freed by the caller. */
paragraph_is_start_of_table(const struct paragraph * P,struct table_layout * L)70 static int paragraph_is_start_of_table(const struct paragraph *P, struct table_layout *L) {
71 size_t i, j, minindent = 1000000, maxlen = 0, *candidates, ncandidates = 0;
72 char *is_gap = NULL;
73 int ret = 0;
74
75 candidates = malloc(P->nlines * sizeof *candidates);
76
77 /* Look at each line in the paragraph and find the minimum indent and
78 * maximum length of the lines. Also identify any candidate table rules. */
79 for (i = 0; i < P->nlines; ++i) {
80 size_t indent, len, ngaps = 0;
81 indent = strspn(P->lines[i], " ");
82 len = strlen(P->lines[i]);
83
84 /* Find horizontal extent of paragraph. */
85 if (indent < minindent)
86 minindent = indent;
87
88 if (len > maxlen) {
89 is_gap = realloc(is_gap, len);
90 for (j = maxlen; j < len; ++j)
91 is_gap[j] = P->lines[i][j] == ' ' ? 1 : 0;
92 maxlen = len;
93 }
94
95 /* Find gaps. */
96 for (j = 0; j < len; ++j)
97 if (P->lines[i][j] != ' ')
98 is_gap[j] = 0;
99 else
100 ++ngaps;
101
102 /* Is this line a candidate table rule? It can't be the first line of
103 * the paragraph, obviously. */
104 if (i > 0 && strspn(P->lines[i], "- ") == len && ngaps > 0)
105 candidates[ncandidates++] = i;
106 }
107
108 /* Now find any candidate table rule which satisfies the constraints. */
109 for (i = 0; i < ncandidates; ++i) {
110 size_t len;
111 char *line;
112 line = P->lines[candidates[i]];
113 len = strlen(line);
114 for (j = 0; j < len; ++j)
115 if (line[j] == ' ' && !is_gap[j])
116 break;
117 if (j == len) {
118 int f;
119 size_t j, n;
120 struct table_layout Lz = { 0 };
121
122 *L = Lz;
123
124 /* Choose this one. */
125 ret = 1;
126 L->is_gap = is_gap;
127 is_gap = NULL; /* Don't free it. */
128 L->ruleline = candidates[i];
129 L->ruleindent = minindent;
130 L->width = maxlen;
131
132 /* Figure out how many columns we have and where in the line
133 * they are. */
134 for (j = 0, f = 0, L->ntablecolumns = 0; j < L->width; ++j) {
135 if (!L->is_gap[j] && !f) {
136 f = 1;
137 ++L->ntablecolumns;
138 } else if (L->is_gap[j])
139 f = 0;
140 }
141
142 L->col = malloc(L->ntablecolumns * sizeof *L->col);
143 L->ncols = malloc(L->ntablecolumns * sizeof *L->ncols);
144
145 for (j = 0, n = 0; j < L->width; ++j) {
146 if (!L->is_gap[j] && (j == 0 || L->is_gap[j - 1]))
147 L->col[n] = j;
148
149 if (L->is_gap[j] && j > 0 && !L->is_gap[j - 1]) {
150 L->ncols[n] = j - L->col[n];
151 ++n;
152 }
153 }
154
155 if (n < L->ntablecolumns) {
156 L->ncols[n] = j - L->col[n] + 1;
157 ++n;
158 }
159
160 assert(n == L->ntablecolumns);
161
162 break;
163 }
164 }
165
166 if (candidates)
167 free(candidates);
168 if (is_gap)
169 free(is_gap);
170
171 return ret;
172 }
173
174 /* paragraph_is_part_of_table PARAGRAPH LAYOUT FIRST
175 * If PARAGRAPH fits the given table LAYOUT startbing with FIRST, return 1.
176 * Otherwise, return 0. We permit a table to be the contents of a bullet, so if
177 * PARAGRAPH is has the same or smaller indent as FIRST and has a leader, it
178 * cannot be part of the table. */
paragraph_is_part_of_table(const struct paragraph * P,const struct table_layout * L,const struct paragraph * first)179 static int paragraph_is_part_of_table(const struct paragraph *P, const struct table_layout *L, const struct paragraph *first) {
180 size_t i;
181
182 for (i = 0; i < P->nlines; ++i) {
183 char *line;
184 size_t j, len;
185 line = P->lines[i];
186 len = strlen(line);
187 if (len > L->width)
188 return 0;
189 for (j = 0; j < len; ++j)
190 if (L->is_gap[j] && line[j] != ' ')
191 return 0;
192 }
193
194 if (P->type != none && P->indent <= first->indent)
195 return 0;
196
197 return 1;
198 }
199
200 /* extract_paragraphs PARAGRAPH STARTLINE NLINES STARTCOL NCOLS
201 * Process the rectangle defined by STARTLINE, NLINES and STARTCOL, NCOLS
202 * in the text of PARAGRAPH, breaking it into individual paragraphs and
203 * returning them as a linked list. If no paragraphs can be extracted, return
204 * NULL. */
extract_paragraphs(const struct paragraph * para,const size_t startline,const size_t nlines,const size_t startcol,const size_t ncols)205 static struct paragraph *extract_paragraphs(const struct paragraph *para, const size_t startline, const size_t nlines, const size_t startcol, const size_t ncols) {
206 size_t n;
207 struct paragraph *ret = NULL, *last = NULL, *cur = NULL;
208 char *buf;
209
210 buf = malloc(ncols + 1);
211
212 /* Walk through the lines in para, excising the bit of text defined by the
213 * startcol and ncols, and add a copy of same to the current paragraph. */
214 for (n = startline; n < startline + nlines; ++n) {
215 size_t len;
216 memset(buf, 0, ncols + 1);
217 len = strlen(para->lines[n]);
218 if (len > startcol) {
219 size_t m;
220 m = ncols;
221 if (m > len - startcol)
222 m = len - startcol;
223 memcpy(buf, para->lines[n] + startcol, m);
224 /* Strip trailing whitespace. */
225 while (m > 0) {
226 if (buf[m - 1] == ' ')
227 buf[m - 1] = 0;
228 else break;
229 --m;
230 }
231 }
232
233 /* Line ends paragraph. Classify and save it. */
234 if (!*buf && cur) {
235 classify_paragraph(cur);
236 if (last) {
237 last->next = cur;
238 cur->prev = last;
239 } else
240 last = cur;
241 if (!ret)
242 ret = last;
243 cur = NULL;
244 }
245
246 if (*buf) {
247 if (!cur) {
248 /* Line starts new paragraph. */
249 alloc_struct(paragraph, cur);
250 cur->lines = malloc((para->nlines - n) * sizeof *cur->lines);
251 }
252 cur->lines[cur->nlines++] = strdup(buf);
253 }
254 }
255
256 if (cur) {
257 classify_paragraph(cur);
258 if (last) {
259 last->next = cur;
260 cur->prev = last;
261 } else
262 last = cur;
263 if (!ret)
264 ret = last;
265 }
266
267 free(buf);
268
269 /* Recursively process these paragraphs. */
270 process_tables(ret);
271
272 return ret;
273 }
274
275 /* process_table_rows FIRST LAST LAYOUT
276 * Consume paragraphs from FIRST to LAST inclusive which are part of the given
277 * LAYOUT, replacing each paragraph with one or more rows of table cells. */
process_table_rows(struct paragraph * first,struct paragraph * last,const struct table_layout * L)278 static struct paragraph *process_table_rows(struct paragraph *first, struct paragraph *last, const struct table_layout *L) {
279 struct paragraph *P, *ret = NULL;
280 int is_first_para = 1, only_one_para;
281 size_t i;
282
283 /*
284 * Each paragraph except the first represents a single table row, so we
285 * can replace paragraphs with row containers in-place.
286 */
287
288 only_one_para = (first == last);
289
290 P = first;
291 while (1) {
292 struct paragraph *cells = NULL, *l = NULL;
293 size_t startline = 0;
294
295 if (is_first_para) {
296 /*
297 * The first row of the table is a special case, because of the
298 * rule line, above which is the header and below which may be cell
299 * data.
300 */
301 struct paragraph *newpara;
302
303 for (i = 0; i < L->ntablecolumns; ++i) {
304 /* Add a <th> to the row. */
305 struct paragraph *p_th;
306 alloc_struct(paragraph, p_th);
307 p_th->container = "th";
308 p_th->contents = extract_paragraphs(P, 0, L->ruleline, L->col[i], L->ncols[i]);
309 p_th->prev = l;
310 if (l) {
311 l->next = p_th;
312 l = l->next;
313 } else {
314 l = cells = p_th;
315 }
316 }
317
318 /* Because we don't want to move first or last, we insert a new
319 * paragraph after first and replace first with this row. */
320 alloc_struct(paragraph, newpara);
321 *newpara = *first;
322
323 newpara->prev = first;
324
325 first->next = newpara;
326 if (newpara->next)
327 newpara->next->prev = newpara;
328
329 first->container = "tr";
330 first->contents = cells;
331 first->nlines = 0;
332 first->lines = NULL;
333
334 P = newpara;
335
336 /* Bits under rule line are normal cells. */
337 startline = L->ruleline + 1;
338
339 is_first_para = 0;
340
341 }
342
343 if (startline < P->nlines) {
344 cells = l = NULL;
345 for (i = 0; i < L->ntablecolumns; ++i) {
346 /* Add a <td> to the row. */
347 struct paragraph *p_td;
348 alloc_struct(paragraph, p_td);
349 p_td->container = "td";
350 p_td->contents = extract_paragraphs(P, startline, P->nlines - startline, L->col[i], L->ncols[i]);
351 p_td->prev = l;
352 if (l) {
353 l->next = p_td;
354 l = l->next;
355 } else {
356 l = cells = p_td;
357 }
358 }
359
360 /* Replace this paragraph's contents. */
361 P->container = "tr";
362 P->contents = cells;
363 for (i = 0; i < P->nlines; ++i)
364 free(P->lines[i]);
365 free(P->lines);
366 P->nlines = 0;
367 }
368
369 if (only_one_para || P == last)
370 break;
371 P = P->next;
372 }
373
374 return ret;
375 }
376
377 /* process_tables PARAGRAPHS
378 * Go through the list of PARAGRAPHS, identifying extents which form part of
379 * a table and splitting them up into individual cells. Each cell is processed
380 * recursively by process_tables, so that tables-within-tables may be
381 * implemented (god help us). */
process_tables(struct paragraph * paras)382 void process_tables(struct paragraph *paras) {
383 struct paragraph *P;
384
385 for (P = paras; P; ) {
386 struct table_layout tl;
387 if (paragraph_is_start_of_table(P, &tl)) {
388 struct paragraph *first, *last, *rows;
389
390 first = P;
391 for (last = first; last->next && paragraph_is_part_of_table(last->next, &tl, first); last = last->next);
392
393 /* Found a table from first to last inclusive. */
394 rows = process_table_rows(first, last, &tl);
395
396 /* A one-paragraph table will have expanded to two. */
397 if (first == last)
398 last = first->next;
399
400 create_container("table", first, last, 0);
401
402 free(tl.is_gap);
403 free(tl.col);
404 free(tl.ncols);
405 }
406 P = P->next;
407 }
408 }
409
410