htmlise/htmlise-0.2/tables.c

/*
 * tables.c:
 * Table support for htmlise.
 *
 * The idea here is that we look for a paragraph which looks like a table
 * heading followed by a number of paragraphs which look like table cells.
 * We do not support sophisticated tables or physical mark-up like borders
 * and alignment. Tables look like this:
 *
 *      Heading 1  Heading 2  Heading 3             <-- heading
 *      ---------- ---------- -------------         <-- `table rule'
 *      Text in    More text  Third cell           |
 *      cell       in another                      |<-- first row
 *                 cell                            |
 *
 *      New row    Next cell  Bottom-right         |
 *                 in new row cell of              |<-- second row
 *                            table                |
 *
 * The table is recognised because its first paragraph contains a line of
 * dashes interleaved with spaces, the spaces are present on every line of the
 * table, and no line in the paragraph is longer than the line of dashes. The
 * text above the dashes is the table heading, and is marked up as <th></th>,
 * and that below is just data, which lives in <td></td>. Further paragraphs
 * which have lined-up spaces and consist only of lines of length equal to or
 * shorter than the header rule length are also considered to be part of the
 * table, with each paragraph being an individual row. The contents of each
 * cell are then processed recursively by htmlise.
 *
 * Where there are several possible overlapping tables which could be
 * constructed by the procedure, we pick the earliest-starting one.
 *
 * Extensions:
 *
 *  - should also permit horizontal tables (with the <th></th> on the left)
 *
 * Copyright (c) 2003 Chris Lightfoot. All rights reserved.
 * Email: chris@ex-parrot.com; WWW: http://www.ex-parrot.com/~chris/
 *
 */

/* Revision-control Id keyword string recording which revision of tables.c
 * this is. It is not referenced anywhere in the code visible in this file. */
static const char rcsid[] = "$Id: tables.c,v 1.4 2004/01/29 22:52:28 chris Exp $";

#include <assert.h>

#include <stdlib.h>
#include <string.h>

#include "htmlise.h"

/* Geometry of one detected table. Filled in by paragraph_is_start_of_table
 * and then used to test and split every paragraph belonging to the table. */
struct table_layout {
    /* Line number, within the first paragraph of the table, of the table
     * rule (the line of dashes). */
    size_t ruleline;

    /* Indent of the table; set to the smallest indent of any line in the
     * first paragraph. */
    size_t ruleindent;

    /* Width of the table in characters; set to the length of the longest
     * line in the first paragraph. */
    size_t width;

    /* Number of columns in the table. */
    size_t ntablecolumns;

    /* Array of width flags. is_gap[i] is true if character position i must
     * be whitespace in each line of the table. */
    char *is_gap;

    /* Starting character position of each of the ntablecolumns columns. */
    size_t *col;

    /* Width in characters of each of the ntablecolumns columns. */
    size_t *ncols;
};

/* paragraph_is_start_of_table PARAGRAPH LAYOUT
 * If PARAGRAPH is a possible start-of-table, fill in LAYOUT with the layout of
 * the table and return 1. Otherwise, return 0. The table rule is set to the
 * first possible candidate rule. On successful return, LAYOUT->is_gap, col and
 * ncols are allocated on the heap and must be freed by the caller.
 *
 * A line is a candidate rule if it is not the first line of the paragraph,
 * consists only of dashes and spaces, and contains at least one space. A
 * candidate is accepted if every space in it lies in a character position
 * which is blank in every line of the paragraph.
 * NOTE(review): nothing here requires the rule to be the longest line of the
 * paragraph, although the description at the top of the file says no line may
 * be longer than the rule.
 *
 * If memory cannot be allocated the paragraph is reported as not being a
 * table (return 0). On any 0 return LAYOUT owns no allocations and its
 * contents must not be used. */
static int paragraph_is_start_of_table(const struct paragraph *P, struct table_layout *L) {
    size_t i, j, minindent = (size_t)-1, maxlen = 0, *candidates, ncandidates = 0;
    char *is_gap = NULL;
    int ret = 0;

    /* An empty paragraph has no rule line, so it cannot start a table. */
    if (P->nlines == 0)
        return 0;

    candidates = malloc(P->nlines * sizeof *candidates);
    if (!candidates)
        return 0;

    /* Look at each line in the paragraph and find the minimum indent and
     * maximum length of the lines. Also identify any candidate table rules. */
    for (i = 0; i < P->nlines; ++i) {
        const char *line = P->lines[i];
        size_t indent, len, ngaps = 0;

        indent = strspn(line, " ");
        len = strlen(line);

        /* Find horizontal extent of paragraph. */
        if (indent < minindent)
            minindent = indent;

        if (len > maxlen) {
            /* Grow via a temporary so the old array is not lost if realloc
             * fails. Positions past the end of all earlier lines start out
             * as gaps exactly where this line has a space. */
            char *grown = realloc(is_gap, len);
            if (!grown)
                goto done;
            is_gap = grown;
            for (j = maxlen; j < len; ++j)
                is_gap[j] = line[j] == ' ' ? 1 : 0;
            maxlen = len;
        }

        /* Find gaps: any non-space character rules out its position. */
        for (j = 0; j < len; ++j) {
            if (line[j] != ' ')
                is_gap[j] = 0;
            else
                ++ngaps;
        }

        /* Is this line a candidate table rule? It can't be the first line of
         * the paragraph, obviously. */
        if (i > 0 && strspn(line, "- ") == len && ngaps > 0)
            candidates[ncandidates++] = i;
    }

    /* Now find any candidate table rule which satisfies the constraints. */
    for (i = 0; i < ncandidates; ++i) {
        const char *rule = P->lines[candidates[i]];
        const size_t len = strlen(rule);
        const struct table_layout Lz = { 0 };
        size_t k, n;
        int in_column;

        /* Every space in the rule must fall in a gap position. */
        for (j = 0; j < len; ++j)
            if (rule[j] == ' ' && !is_gap[j])
                break;
        if (j < len)
            continue;

        /* Choose this one. */
        *L = Lz;
        L->ruleline = candidates[i];
        L->ruleindent = minindent;
        L->width = maxlen;

        /* Figure out how many columns we have: each run of non-gap positions
         * is one column. */
        for (k = 0, in_column = 0; k < maxlen; ++k) {
            if (is_gap[k]) {
                in_column = 0;
            } else if (!in_column) {
                in_column = 1;
                ++L->ntablecolumns;
            }
        }

        /* Allocate at least one element so that a NULL result always means
         * failure rather than a zero-sized request. */
        n = L->ntablecolumns ? L->ntablecolumns : 1;
        L->col   = malloc(n * sizeof *L->col);
        L->ncols = malloc(n * sizeof *L->ncols);
        if (!L->col || !L->ncols) {
            free(L->col);
            free(L->ncols);
            *L = Lz;
            break;
        }

        /* Record where in the line each column starts and how wide it is. */
        for (k = 0, n = 0; k < maxlen; ++k) {
            if (!is_gap[k] && (k == 0 || is_gap[k - 1]))
                L->col[n] = k;

            if (is_gap[k] && k > 0 && !is_gap[k - 1]) {
                L->ncols[n] = k - L->col[n];
                ++n;
            }
        }

        /* A final column running up to the end of the line is not closed by
         * a gap; it spans positions col[n] .. maxlen - 1. */
        if (n < L->ntablecolumns) {
            L->ncols[n] = maxlen - L->col[n];
            ++n;
        }

        assert(n == L->ntablecolumns);

        /* Hand the gap map over to the caller; don't free it below. */
        L->is_gap = is_gap;
        is_gap = NULL;
        ret = 1;
        break;
    }

done:
    free(candidates);
    free(is_gap);

    return ret;
}

/* paragraph_is_part_of_table PARAGRAPH LAYOUT FIRST
 * If PARAGRAPH fits the given table LAYOUT, whose table starts with the
 * paragraph FIRST, return 1. Otherwise, return 0. To fit, no line of
 * PARAGRAPH may be longer than the table width, and no line may have a
 * non-space character in a position the layout marks as a gap. We permit a
 * table to be the contents of a bullet, so if PARAGRAPH has the same or
 * smaller indent than FIRST and has a leader (its type is not `none'), it
 * cannot be part of the table. */
static int paragraph_is_part_of_table(const struct paragraph *P, const struct table_layout *L, const struct paragraph *first) {
    size_t row;

    for (row = 0; row < P->nlines; ++row) {
        const char *text = P->lines[row];
        const size_t length = strlen(text);
        size_t pos;

        /* Too wide for the table. */
        if (length > L->width)
            return 0;

        /* Text intrudes into a column separator. */
        for (pos = 0; pos < length; ++pos) {
            if (text[pos] != ' ' && L->is_gap[pos])
                return 0;
        }
    }

    /* Plain paragraphs always pass; ones with a leader must be indented
     * further than the first paragraph of the table. */
    return P->type == none || P->indent > first->indent;
}

/* link_cell_paragraph HEAD TAIL PARA
 * Classify PARA and append it to the doubly-linked list whose first and last
 * elements are *HEAD and *TAIL (both NULL when the list is empty). */
static void link_cell_paragraph(struct paragraph **head, struct paragraph **tail, struct paragraph *para) {
    classify_paragraph(para);
    para->prev = *tail;
    if (*tail)
        (*tail)->next = para;
    else
        *head = para;
    /* Advance the tail so that the next paragraph is appended after this one
     * rather than replacing it. */
    *tail = para;
}

/* extract_paragraphs PARAGRAPH STARTLINE NLINES STARTCOL NCOLS
 * Process the rectangle defined by STARTLINE, NLINES and STARTCOL, NCOLS
 * in the text of PARAGRAPH, breaking it into individual paragraphs and
 * returning them as a linked list. Within the rectangle, a line which is
 * empty once trailing spaces are removed separates paragraphs. The extracted
 * paragraphs are themselves passed to process_tables before being returned.
 * If no paragraphs can be extracted, or the working buffer cannot be
 * allocated, return NULL. */
static struct paragraph *extract_paragraphs(const struct paragraph *para, const size_t startline, const size_t nlines, const size_t startcol, const size_t ncols) {
    size_t n;
    struct paragraph *ret = NULL, *last = NULL, *cur = NULL;
    char *buf;

    buf = malloc(ncols + 1);
    if (!buf)
        return NULL;

    /* Walk through the lines in para, excising the bit of text defined by the
     * startcol and ncols, and add a copy of same to the current paragraph. */
    for (n = startline; n < startline + nlines; ++n) {
        size_t len;

        memset(buf, 0, ncols + 1);
        len = strlen(para->lines[n]);
        if (len > startcol) {
            /* Copy at most ncols characters, fewer if the line ends first. */
            size_t m = ncols;
            if (m > len - startcol)
                m = len - startcol;
            memcpy(buf, para->lines[n] + startcol, m);
            /* Strip trailing whitespace. */
            while (m > 0 && buf[m - 1] == ' ')
                buf[--m] = 0;
        }

        if (!*buf) {
            /* Blank line ends the current paragraph, if there is one. */
            if (cur) {
                link_cell_paragraph(&ret, &last, cur);
                cur = NULL;
            }
            continue;
        }

        if (!cur) {
            /* Line starts new paragraph. Room for every remaining line of
             * para is enough for whatever this paragraph collects.
             * NOTE(review): this allocation and the strdup below are not
             * checked for failure. */
            alloc_struct(paragraph, cur);
            cur->lines = malloc((para->nlines - n) * sizeof *cur->lines);
        }
        cur->lines[cur->nlines++] = strdup(buf);
    }

    /* The rectangle may end in the middle of a paragraph. */
    if (cur)
        link_cell_paragraph(&ret, &last, cur);

    free(buf);

    /* Recursively process these paragraphs. */
    process_tables(ret);

    return ret;
}

/* make_row_cells PARAGRAPH STARTLINE NLINES LAYOUT HEADER
 * Build one cell per column of LAYOUT from lines STARTLINE .. STARTLINE +
 * NLINES - 1 of PARAGRAPH and return the cells as a linked list. The cells
 * are "th" containers if HEADER is true and "td" containers otherwise. */
static struct paragraph *make_row_cells(const struct paragraph *P, const size_t startline, const size_t nlines, const struct table_layout *L, const int header) {
    struct paragraph *cells = NULL, *tail = NULL;
    size_t i;

    for (i = 0; i < L->ntablecolumns; ++i) {
        struct paragraph *cell;
        alloc_struct(paragraph, cell);
        cell->container = header ? "th" : "td";
        cell->contents = extract_paragraphs(P, startline, nlines, L->col[i], L->ncols[i]);
        cell->prev = tail;
        if (tail)
            tail->next = cell;
        else
            cells = cell;
        tail = cell;
    }

    return cells;
}

/* process_table_rows FIRST LAST LAYOUT
 * Consume paragraphs from FIRST to LAST inclusive which are part of the given
 * LAYOUT, replacing each paragraph with a "tr" container holding its cells.
 * FIRST itself becomes the heading row, and a new paragraph holding the lines
 * of FIRST is inserted directly after it to carry whatever lies below the
 * rule line. This function always returns NULL. */
static struct paragraph *process_table_rows(struct paragraph *first, struct paragraph *last, const struct table_layout *L) {
    struct paragraph *P = first;
    const int only_one_para = (first == last);
    int is_first_para = 1;
    size_t i;

    /*
     * Each paragraph except the first represents a single table row, so we
     * can replace paragraphs with row containers in-place.
     */
    for (;;) {
        size_t startline = 0;

        if (is_first_para) {
            /*
             * The first row of the table is a special case, because of the
             * rule line, above which is the header and below which may be cell
             * data.
             */
            struct paragraph *newpara;
            struct paragraph *heading = make_row_cells(P, 0, L->ruleline, L, 1);

            /* Because we don't want to move first or last, we insert a new
             * paragraph after first and replace first with this row. */
            alloc_struct(paragraph, newpara);
            *newpara = *first;

            newpara->prev = first;

            first->next = newpara;
            if (newpara->next)
                newpara->next->prev = newpara;

            /* The text lines now belong to newpara, so first just drops its
             * reference to them. */
            first->container = "tr";
            first->contents = heading;
            first->nlines = 0;
            first->lines = NULL;

            P = newpara;

            /* Bits under rule line are normal cells. */
            startline = L->ruleline + 1;

            is_first_para = 0;
        }

        /* NOTE(review): if the rule is the last line of the first paragraph,
         * this test fails for the copy made above, which then keeps the
         * heading and rule lines as ordinary text. Confirm whether that is
         * intended. */
        if (startline < P->nlines) {
            struct paragraph *data = make_row_cells(P, startline, P->nlines - startline, L, 0);

            /* Replace this paragraph's contents. */
            P->container = "tr";
            P->contents = data;
            for (i = 0; i < P->nlines; ++i)
                free(P->lines[i]);
            free(P->lines);
            /* Don't leave a dangling pointer to the freed line array. */
            P->lines = NULL;
            P->nlines = 0;
        }

        if (only_one_para || P == last)
            break;
        P = P->next;
    }

    return NULL;
}

/* process_tables PARAGRAPHS
 * Walk the list of PARAGRAPHS looking for paragraphs which start a table.
 * For each one found, gather the following paragraphs which fit the same
 * layout, turn them into table rows and wrap the result in a "table"
 * container. The text of each cell is itself passed back through
 * process_tables while the rows are built, so tables-within-tables may be
 * implemented (god help us). */
void process_tables(struct paragraph *paras) {
    struct paragraph *P = paras;

    while (P) {
        struct table_layout tl;

        if (paragraph_is_start_of_table(P, &tl)) {
            struct paragraph *first = P, *last = P;

            /* Extend the table over every following paragraph that fits. */
            while (last->next && paragraph_is_part_of_table(last->next, &tl, first))
                last = last->next;

            /* Found a table from first to last inclusive. The return value
             * is not used. */
            (void)process_table_rows(first, last, &tl);

            /* A one-paragraph table will have expanded to two. */
            if (first == last)
                last = first->next;

            create_container("table", first, last, 0);

            /* The layout's arrays were allocated for us; release them. */
            free(tl.is_gap);
            free(tl.col);
            free(tl.ncols);
        }

        P = P->next;
    }
}