1 /*
2  * tables.c:
3  * Table support for htmlise.
4  *
5  * The idea here is that we look for a paragraph which looks like a table
6  * heading followed by a number of paragraphs which look like table cells.
7  * We do not support sophisticated tables or physical mark-up like borders
8  * and alignment. Tables look like this:
9  *
10  *      Heading 1  Heading 2  Heading 3             <-- heading
11  *      ---------- ---------- -------------         <-- `table rule'
12  *      Text in    More text  Third cell           |
13  *      cell       in another                      |<-- first row
14  *                 cell                            |
15  *
16  *      New row    Next cell  Bottom-right         |
17  *                 in new row cell of              |<-- second row
18  *                            table                |
19  *
20  * The table is recognised because its first paragraph contains a line of
21  * dashes interleaved with spaces, the spaces are present on every line of the
22  * table, and no line in the paragraph is longer than the line of dashes. The
23  * text above the dashes is the table heading, and is marked up as <th></th>,
24  * and that below is just data, which lives in <td></td>. Further paragraphs
25  * which have lined-up spaces and consist only of lines of length equal to or
26  * shorter than the header rule length are also considered to be part of the
27  * table, with each paragraph being an individual row. The contents of each
28  * cell are then processed recursively by htmlise.
29  *
30  * Where there are several possible overlapping tables which could be
31  * constructed by the procedure, we pick the earliest-starting one.
32  *
33  * Extensions:
34  *
35  *  - should also permit horizontal tables (with the <th></th> on the left)
36  *
37  * Copyright (c) 2003 Chris Lightfoot. All rights reserved.
38  * Email: chris@ex-parrot.com; WWW: http://www.ex-parrot.com/~chris/
39  *
40  */
41 
42 static const char rcsid[] = "$Id: tables.c,v 1.4 2004/01/29 22:52:28 chris Exp $";
43 
44 #include <assert.h>
45 
46 #include <stdlib.h>
47 #include <string.h>
48 
49 #include "htmlise.h"
50 
51 struct table_layout {
52     /* ruleline is the line number of the table rule in the first paragraph of
53      * the table; ruleindent the indent of the table rule and width the line
54      * length of the table rule. ntablecolumns is the number of columns in the
55      * table. */
56     size_t ruleline, ruleindent, width, ntablecolumns;
57     /* is_gap consists of width flags each of which is true if the
58      * corresponding column must be whitespace in each line of the table. */
59     char *is_gap;
60     /* col and ncols store the starting column positions and character widths
61      * of the ntablecolumns individual table columns. */
62     size_t *col, *ncols;
63 };
64 
65 /* paragraph_is_start_of_table PARAGRAPH LAYOUT
66  * If PARAGRAPH is a possible start-of-table, fill in LAYOUT with the layout of
67  * the table and return 1. Otherwise, return 0. The table rule is set to the
68  * first possible candidate rule. On successful return, LAYOUT->is_gap, col and
69  * ncols are allocated on the heap and must be freed by the caller. */
paragraph_is_start_of_table(const struct paragraph * P,struct table_layout * L)70 static int paragraph_is_start_of_table(const struct paragraph *P, struct table_layout *L) {
71     size_t i, j, minindent = 1000000, maxlen = 0, *candidates, ncandidates = 0;
72     char *is_gap = NULL;
73     int ret = 0;
74 
75     candidates = malloc(P->nlines * sizeof *candidates);
76 
77     /* Look at each line in the paragraph and find the minimum indent and
78      * maximum length of the lines. Also identify any candidate table rules. */
79     for (i = 0; i < P->nlines; ++i) {
80         size_t indent, len, ngaps = 0;
81         indent = strspn(P->lines[i], " ");
82         len = strlen(P->lines[i]);
83 
84         /* Find horizontal extent of paragraph. */
85         if (indent < minindent)
86             minindent = indent;
87 
88         if (len > maxlen) {
89             is_gap = realloc(is_gap, len);
90             for (j = maxlen; j < len; ++j)
91                 is_gap[j] = P->lines[i][j] == ' ' ? 1 : 0;
92             maxlen = len;
93         }
94 
95         /* Find gaps. */
96         for (j = 0; j < len; ++j)
97             if (P->lines[i][j] != ' ')
98                 is_gap[j] = 0;
99             else
100                 ++ngaps;
101 
102         /* Is this line a candidate table rule? It can't be the first line of
103          * the paragraph, obviously. */
104         if (i > 0 && strspn(P->lines[i], "- ") == len && ngaps > 0)
105             candidates[ncandidates++] = i;
106     }
107 
108     /* Now find any candidate table rule which satisfies the constraints. */
109     for (i = 0; i < ncandidates; ++i) {
110         size_t len;
111         char *line;
112         line = P->lines[candidates[i]];
113         len = strlen(line);
114         for (j = 0; j < len; ++j)
115             if (line[j] == ' ' && !is_gap[j])
116                 break;
117         if (j == len) {
118             int f;
119             size_t j, n;
120             struct table_layout Lz = { 0 };
121 
122             *L = Lz;
123 
124             /* Choose this one. */
125             ret = 1;
126             L->is_gap = is_gap;
127             is_gap = NULL; /* Don't free it. */
128             L->ruleline = candidates[i];
129             L->ruleindent = minindent;
130             L->width = maxlen;
131 
132             /* Figure out how many columns we have and where in the line
133              * they are. */
134             for (j = 0, f = 0, L->ntablecolumns = 0; j < L->width; ++j) {
135                 if (!L->is_gap[j] && !f) {
136                     f = 1;
137                     ++L->ntablecolumns;
138                 } else if (L->is_gap[j])
139                     f = 0;
140             }
141 
142             L->col   = malloc(L->ntablecolumns * sizeof *L->col);
143             L->ncols = malloc(L->ntablecolumns * sizeof *L->ncols);
144 
145             for (j = 0, n = 0; j < L->width; ++j) {
146                 if (!L->is_gap[j] && (j == 0 || L->is_gap[j - 1]))
147                     L->col[n] = j;
148 
149                 if (L->is_gap[j] && j > 0 && !L->is_gap[j - 1]) {
150                     L->ncols[n] = j - L->col[n];
151                     ++n;
152                 }
153             }
154 
155             if (n < L->ntablecolumns) {
156                 L->ncols[n] = j - L->col[n] + 1;
157                 ++n;
158             }
159 
160             assert(n == L->ntablecolumns);
161 
162             break;
163         }
164     }
165 
166     if (candidates)
167         free(candidates);
168     if (is_gap)
169         free(is_gap);
170 
171     return ret;
172 }
173 
174 /* paragraph_is_part_of_table PARAGRAPH LAYOUT FIRST
175  * If PARAGRAPH fits the given table LAYOUT startbing with FIRST, return 1.
176  * Otherwise, return 0. We permit a table to be the contents of a bullet, so if
177  * PARAGRAPH is has the same or smaller indent as FIRST and has a leader, it
178  * cannot be part of the table. */
paragraph_is_part_of_table(const struct paragraph * P,const struct table_layout * L,const struct paragraph * first)179 static int paragraph_is_part_of_table(const struct paragraph *P, const struct table_layout *L, const struct paragraph *first) {
180     size_t i;
181 
182     for (i = 0; i < P->nlines; ++i) {
183         char *line;
184         size_t j, len;
185         line = P->lines[i];
186         len = strlen(line);
187         if (len > L->width)
188             return 0;
189         for (j = 0; j < len; ++j)
190             if (L->is_gap[j] && line[j] != ' ')
191                 return 0;
192     }
193 
194     if (P->type != none && P->indent <= first->indent)
195         return 0;
196 
197     return 1;
198 }
199 
200 /* extract_paragraphs PARAGRAPH STARTLINE NLINES STARTCOL NCOLS
201  * Process the rectangle defined by STARTLINE, NLINES and STARTCOL, NCOLS
202  * in the text of PARAGRAPH, breaking it into individual paragraphs and
203  * returning them as a linked list. If no paragraphs can be extracted, return
204  * NULL. */
extract_paragraphs(const struct paragraph * para,const size_t startline,const size_t nlines,const size_t startcol,const size_t ncols)205 static struct paragraph *extract_paragraphs(const struct paragraph *para, const size_t startline, const size_t nlines, const size_t startcol, const size_t ncols) {
206     size_t n;
207     struct paragraph *ret = NULL, *last = NULL, *cur = NULL;
208     char *buf;
209 
210     buf = malloc(ncols + 1);
211 
212     /* Walk through the lines in para, excising the bit of text defined by the
213      * startcol and ncols, and add a copy of same to the current paragraph. */
214     for (n = startline; n < startline + nlines; ++n) {
215         size_t len;
216         memset(buf, 0, ncols + 1);
217         len = strlen(para->lines[n]);
218         if (len > startcol) {
219             size_t m;
220             m = ncols;
221             if (m > len - startcol)
222                 m = len - startcol;
223             memcpy(buf, para->lines[n] + startcol, m);
224             /* Strip trailing whitespace. */
225             while (m > 0) {
226                 if (buf[m - 1] == ' ')
227                     buf[m - 1] = 0;
228                 else break;
229                 --m;
230             }
231         }
232 
233         /* Line ends paragraph. Classify and save it. */
234         if (!*buf && cur) {
235             classify_paragraph(cur);
236             if (last) {
237                 last->next = cur;
238                 cur->prev = last;
239             } else
240                 last = cur;
241             if (!ret)
242                 ret = last;
243             cur = NULL;
244         }
245 
246         if (*buf) {
247             if (!cur) {
248                 /* Line starts new paragraph. */
249                 alloc_struct(paragraph, cur);
250                 cur->lines = malloc((para->nlines - n) * sizeof *cur->lines);
251             }
252             cur->lines[cur->nlines++] = strdup(buf);
253         }
254     }
255 
256     if (cur) {
257         classify_paragraph(cur);
258         if (last) {
259             last->next = cur;
260             cur->prev = last;
261         } else
262             last = cur;
263         if (!ret)
264             ret = last;
265     }
266 
267     free(buf);
268 
269     /* Recursively process these paragraphs. */
270     process_tables(ret);
271 
272     return ret;
273 }
274 
275 /* process_table_rows FIRST LAST LAYOUT
276  * Consume paragraphs from FIRST to LAST inclusive which are part of the given
277  * LAYOUT, replacing each paragraph with one or more rows of table cells. */
process_table_rows(struct paragraph * first,struct paragraph * last,const struct table_layout * L)278 static struct paragraph *process_table_rows(struct paragraph *first, struct paragraph *last, const struct table_layout *L) {
279     struct paragraph *P, *ret = NULL;
280     int is_first_para = 1, only_one_para;
281     size_t i;
282 
283     /*
284      * Each paragraph except the first represents a single table row, so we
285      * can replace paragraphs with row containers in-place.
286      */
287 
288     only_one_para = (first == last);
289 
290     P = first;
291     while (1) {
292         struct paragraph *cells = NULL, *l = NULL;
293         size_t startline = 0;
294 
295         if (is_first_para) {
296             /*
297              * The first row of the table is a special case, because of the
298              * rule line, above which is the header and below which may be cell
299              * data.
300              */
301             struct paragraph *newpara;
302 
303             for (i = 0; i < L->ntablecolumns; ++i) {
304                 /* Add a <th> to the row. */
305                 struct paragraph *p_th;
306                 alloc_struct(paragraph, p_th);
307                 p_th->container = "th";
308                 p_th->contents = extract_paragraphs(P, 0, L->ruleline, L->col[i], L->ncols[i]);
309                 p_th->prev = l;
310                 if (l) {
311                     l->next = p_th;
312                     l = l->next;
313                 } else {
314                     l = cells = p_th;
315                 }
316             }
317 
318             /* Because we don't want to move first or last, we insert a new
319              * paragraph after first and replace first with this row. */
320             alloc_struct(paragraph, newpara);
321             *newpara = *first;
322 
323             newpara->prev = first;
324 
325             first->next = newpara;
326             if (newpara->next)
327                 newpara->next->prev = newpara;
328 
329             first->container = "tr";
330             first->contents = cells;
331             first->nlines = 0;
332             first->lines = NULL;
333 
334             P = newpara;
335 
336             /* Bits under rule line are normal cells. */
337             startline = L->ruleline + 1;
338 
339             is_first_para = 0;
340 
341         }
342 
343         if (startline < P->nlines) {
344             cells = l = NULL;
345             for (i = 0; i < L->ntablecolumns; ++i) {
346                 /* Add a <td> to the row. */
347                 struct paragraph *p_td;
348                 alloc_struct(paragraph, p_td);
349                 p_td->container = "td";
350                 p_td->contents = extract_paragraphs(P, startline, P->nlines - startline, L->col[i], L->ncols[i]);
351                 p_td->prev = l;
352                 if (l) {
353                     l->next = p_td;
354                     l = l->next;
355                 } else {
356                     l = cells = p_td;
357                 }
358             }
359 
360             /* Replace this paragraph's contents. */
361             P->container = "tr";
362             P->contents = cells;
363             for (i = 0; i < P->nlines; ++i)
364                 free(P->lines[i]);
365             free(P->lines);
366             P->nlines = 0;
367         }
368 
369         if (only_one_para || P == last)
370             break;
371         P = P->next;
372     }
373 
374     return ret;
375 }
376 
377 /* process_tables PARAGRAPHS
378  * Go through the list of PARAGRAPHS, identifying extents which form part of
379  * a table and splitting them up into individual cells. Each cell is processed
380  * recursively by process_tables, so that tables-within-tables may be
381  * implemented (god help us). */
process_tables(struct paragraph * paras)382 void process_tables(struct paragraph *paras) {
383     struct paragraph *P;
384 
385     for (P = paras; P; ) {
386         struct table_layout tl;
387         if (paragraph_is_start_of_table(P, &tl)) {
388             struct paragraph *first, *last, *rows;
389 
390             first = P;
391             for (last = first; last->next && paragraph_is_part_of_table(last->next, &tl, first); last = last->next);
392 
393             /* Found a table from first to last inclusive. */
394             rows = process_table_rows(first, last, &tl);
395 
396             /* A one-paragraph table will have expanded to two. */
397             if (first == last)
398                 last = first->next;
399 
400             create_container("table", first, last, 0);
401 
402             free(tl.is_gap);
403             free(tl.col);
404             free(tl.ncols);
405         }
406         P = P->next;
407     }
408 }
409 
410