1<?php
2// csv.php -- HotCRP CSV parsing functions
3// Copyright (c) 2006-2018 Eddie Kohler; see LICENSE.
4
/**
 * Line-oriented parser for comma-, bar-, double-bar-, and tab-separated text.
 *
 * Input is held as a list of entries. Each entry is either a raw line string
 * (with its line terminator still attached) or an array of fields that has
 * already been split. `shift()` consumes one entry at a time; `next()` does
 * the same but skips blank and comment lines.
 *
 * Parsed rows are arrays keyed by column position, or by header name for
 * columns where a non-empty header has been set with `set_header()`.
 *
 * Relies on the global helper `get_s()`, which is defined elsewhere in the
 * project.
 */
class CsvParser {
    // Input entries: raw line strings or arrays of already-split fields.
    private $lines;
    // Index into $lines of the next entry that shift() will read.
    private $lpos = 0;
    // Current TYPE_* value. Any value that is not exactly one of the four
    // concrete types makes the parser guess on the first data line.
    private $type;
    // Name of the parse_* method that corresponds to $type.
    private $typefn;
    // Column names used as result keys, or false for positional keys only.
    private $header = false;
    // String of characters that start a comment line, or false for none.
    private $comment_chars = false;
    // Optional callable invoked with the text of each skipped comment line.
    private $comment_function = null;

    // Separator types. These are bit values so that a guessing type can be
    // formed by OR-ing the candidates together.
    const TYPE_COMMA = 1;
    const TYPE_PIPE = 2;
    const TYPE_BAR = 2;        // alias of TYPE_PIPE
    const TYPE_TAB = 4;
    const TYPE_DOUBLEBAR = 8;
    const TYPE_GUESS = 7;      // TYPE_COMMA | TYPE_BAR | TYPE_TAB

    /**
     * Split a string into lines, keeping each line's terminator
     * ("\r\n", "\r", or "\n") attached to the line.
     *
     * @param string $str
     * @return array list of non-empty line strings
     */
    static public function split_lines($str) {
        $b = array();
        foreach (preg_split('/([^\r\n]*(?:\z|\r\n?|\n))/', $str, 0, PREG_SPLIT_DELIM_CAPTURE) as $line)
            if ($line !== "")
                $b[] = $line;
        return $b;
    }

    /**
     * @param string|array $str  text to parse, or a prepared list of entries
     * @param int $type          one of the TYPE_* constants, or an OR of them
     */
    function __construct($str, $type = self::TYPE_COMMA) {
        $this->lines = is_array($str) ? $str : self::split_lines($str);
        $this->set_type($type);
    }

    /** Record the separator type and select the matching parse method. */
    private function set_type($type) {
        $this->type = $type;
        if ($this->type === self::TYPE_COMMA)
            $this->typefn = "parse_comma";
        else if ($this->type === self::TYPE_BAR)
            $this->typefn = "parse_bar";
        else if ($this->type === self::TYPE_TAB)
            $this->typefn = "parse_tab";
        else if ($this->type === self::TYPE_DOUBLEBAR)
            $this->typefn = "parse_doublebar";
        else
            $this->typefn = "parse_guess";
    }

    /** Set the characters that mark a comment line when they come first. */
    function set_comment_chars($s) {
        $this->comment_chars = $s;
    }

    /** Set a callable that receives each comment line as it is skipped. */
    function set_comment_function($f) {
        $this->comment_function = $f;
    }

    /** Return the current header (false when none has been set). */
    function header() {
        return $this->header;
    }

    /** Set the header used to key the fields of subsequently parsed rows. */
    function set_header($header) {
        $this->header = $header;
    }

    /**
     * Length of $line excluding one trailing "\n" and, after that, one
     * trailing "\r".
     *
     * @param string $line
     * @return int
     */
    static function linelen($line) {
        $len = strlen($line);
        if ($len > 0 && $line[$len - 1] === "\n")
            --$len;
        if ($len > 0 && $line[$len - 1] === "\r")
            --$len;
        return $len;
    }

    /** Return the number of input entries consumed so far. */
    function lineno() {
        return $this->lpos;
    }

    /**
     * Return the next parsed row, skipping blank and comment lines.
     *
     * @return array|false the row, or false when the input is exhausted
     */
    function next() {
        while (($line = $this->shift()) === null)
            /* loop */;
        return $line;
    }

    /**
     * Push an entry back so that the next shift() returns it. A null or
     * false argument is ignored. If at least one entry has been consumed,
     * the most recently consumed slot is overwritten; otherwise the entry is
     * prepended to the input.
     */
    function unshift($line) {
        if ($line === null || $line === false)
            /* do nothing */;
        else if ($this->lpos > 0) {
            $this->lines[$this->lpos - 1] = $line;
            --$this->lpos;
        } else
            array_unshift($this->lines, $line);
    }

    /**
     * Consume one input entry.
     *
     * @return array|null|false the parsed row; null for a blank or comment
     *     line; false when the input is exhausted
     */
    function shift() {
        if ($this->lpos >= count($this->lines))
            return false;
        $line = $this->lines[$this->lpos];
        ++$this->lpos;
        // already-split entries only need to be re-keyed by the header
        if (is_array($line))
            return self::reparse($line, $this->header);
        // blank lines, comments
        if ($line === "" || $line[0] === "\n" || $line[0] === "\r")
            return null;
        if ($this->comment_chars
            && strpos($this->comment_chars, $line[0]) !== false) {
            $this->comment_function && call_user_func($this->comment_function, $line);
            return null;
        }
        // split on type
        $fn = $this->typefn;
        return $this->$fn($line, $this->header);
    }

    /**
     * Choose a separator by counting candidates in $line, fix the parser to
     * that type for all later lines, and parse $line with it. Only the
     * separators whose bit is set in the current type are considered.
     */
    private function parse_guess($line, $header) {
        $pipe = $tab = $comma = $doublepipe = -1;
        if ($this->type & self::TYPE_BAR)
            $pipe = substr_count($line, "|");
        if ($this->type & self::TYPE_DOUBLEBAR)
            $doublepipe = substr_count($line, "||");
        // when nearly all bars come in pairs, rule out the single bar
        if ($doublepipe > 0 && $pipe > 0 && $doublepipe * 2.1 > $pipe)
            $pipe = -1;
        if ($this->type & self::TYPE_TAB)
            $tab = substr_count($line, "\t");
        if ($this->type & self::TYPE_COMMA)
            $comma = substr_count($line, ",");
        if ($tab > $pipe && $tab > $doublepipe && $tab > $comma)
            $this->set_type(self::TYPE_TAB);
        else if ($doublepipe > $pipe && $doublepipe > $comma)
            $this->set_type(self::TYPE_DOUBLEBAR);
        else if ($pipe > $comma)
            $this->set_type(self::TYPE_PIPE);
        else
            $this->set_type(self::TYPE_COMMA);
        $fn = $this->typefn;
        assert($fn !== "parse_guess");
        return $this->$fn($line, $header);
    }

    /**
     * Result key for column $i: the header name when $header supplies a
     * non-empty one for that column, otherwise the position itself.
     */
    private static function field_key($header, $i) {
        if ($header && get_s($header, $i) !== "")
            return $header[$i];
        else
            return $i;
    }

    /**
     * Split $line on the literal separator $sep, with no quoting. A separator
     * at the very end of the line does not produce a trailing empty field.
     */
    private static function split_plain($line, $header, $sep) {
        $seplen = strlen($sep);
        $linelen = self::linelen($line);
        $a = array();
        $i = 0;
        $pos = 0;
        while ($pos != $linelen) {
            $bpos = $pos;
            $pos = strpos($line, $sep, $pos);
            if ($pos === false)
                $pos = $linelen;
            $a[self::field_key($header, $i)] = substr($line, $bpos, $pos - $bpos);
            ++$i;
            // step over the separator that ended this field
            if ($pos != $linelen)
                $pos += $seplen;
        }
        return $a;
    }

    /**
     * Parse a comma-separated line. A field that starts with `"` is quoted:
     * it runs to the matching `"`, and `""` inside it stands for one literal
     * quote. If the closing quote is not on this line, following input
     * entries are consumed and appended until it is found or the input ends.
     * A comma at the end of the line produces a trailing empty field.
     *
     * @param string $line
     * @param array|false $header
     * @return array
     */
    function parse_comma($line, $header) {
        $i = 0;
        $a = array();
        $linelen = self::linelen($line);
        $pos = 0;
        while ($pos != $linelen) {
            if ($i && $line[$pos] === ",")
                ++$pos;
            $bpos = $pos;
            if ($pos != $linelen && $line[$pos] === "\"") {
                // $search is where the hunt for the closing quote resumes
                $search = $pos + 1;
                while (1) {
                    $quote = strpos($line, "\"", $search);
                    if ($quote === false) {
                        if ($this->lpos >= count($this->lines)) {
                            // unterminated: the field runs to end of input
                            $pos = $linelen;
                            break;
                        }
                        // Resume at the first appended byte. This offset is
                        // never past the end of $line, and it cannot miss a
                        // quote that begins the appended entry.
                        $search = strlen($line);
                        $line .= $this->lines[$this->lpos];
                        ++$this->lpos;
                        $linelen = self::linelen($line);
                    } else if ($quote + 1 < $linelen && $line[$quote + 1] === "\"")
                        $search = $quote + 2;   // doubled quote: keep looking
                    else {
                        $pos = $quote;
                        break;
                    }
                }
                $field = str_replace("\"\"", "\"", substr($line, $bpos + 1, $pos - $bpos - 1));
                if ($pos != $linelen)
                    ++$pos;                     // step over the closing quote
            } else {
                $pos = strpos($line, ",", $pos);
                if ($pos === false)
                    $pos = $linelen;
                $field = substr($line, $bpos, $pos - $bpos);
            }
            $a[self::field_key($header, $i)] = $field;
            ++$i;
        }
        return $a;
    }

    /** Parse a line whose fields are separated by `|` (no quoting). */
    function parse_bar($line, $header) {
        return self::split_plain($line, $header, "|");
    }

    /** Parse a line whose fields are separated by `||` (no quoting). */
    function parse_doublebar($line, $header) {
        return self::split_plain($line, $header, "||");
    }

    /** Parse a line whose fields are separated by tabs (no quoting). */
    function parse_tab($line, $header) {
        return self::split_plain($line, $header, "\t");
    }

    /**
     * Re-key an already-split list of fields according to $header.
     *
     * @param array $line  field values, in column order
     * @param array|false $header
     * @return array
     */
    static function reparse($line, $header) {
        $i = 0;
        $a = array();
        foreach ($line as $field) {
            $a[self::field_key($header, $i)] = $field;
            ++$i;
        }
        return $a;
    }
}
259
/**
 * Accumulates rows and emits them as comma-, pipe-, or tab-separated text.
 *
 * Fields are quoted only in comma mode; pipe and tab output joins the raw
 * values. An optional column selection (see `select()`) picks and orders
 * the fields of each row and can supply a header line.
 *
 * Relies on `is_associative_array()`, `mime_quote_string()`, and the
 * `Mimetype` class, all defined elsewhere in the project.
 */
class CsvGenerator {
    // The low two bits of the constructor flags choose the separator.
    const TYPE_COMMA = 0;
    const TYPE_PIPE = 1;
    const TYPE_TAB = 2;
    const FLAG_TYPE = 3;            // mask for the TYPE_* bits
    const FLAG_ALWAYS_QUOTE = 4;    // comma mode: quote every field
    const FLAG_CRLF = 8;            // end lines with "\r\n"
    const FLAG_CR = 16;             // end lines with "\r"
    const FLAG_LF = 0;              // end lines with "\n" (the default)
    const FLAG_ITEM_COMMENTS = 32;  // honor __precomment__/__postcomment__

    private $type;
    private $flags;
    // Emitted body lines, each already including its line terminator.
    private $lines = array();
    // Total byte length of everything in $lines.
    private $lines_length = 0;
    // Header line text (with terminator), or "" when there is none.
    public $headerline = "";
    // Ordered list of row keys to output, or null for "all fields".
    private $selection = null;
    // True when no selection entry is an integer or an all-digit string.
    private $selection_is_names = false;
    private $lf = "\n";
    private $comment = "# ";
    // Content-Disposition override: true/false, or null to ask Mimetype.
    private $inline = null;
    private $filename;

    /** Wrap $text in double quotes, doubling any embedded double quotes. */
    static function always_quote($text) {
        return '"' . str_replace('"', '""', $text) . '"';
    }

    /**
     * Quote $text only when needed. Text is left bare when it consists of
     * letters, digits, and `-_@$+.`, optionally with spaces or tabs in the
     * interior (the last character may not be `@`, a space, or a tab).
     *
     * @param string $text
     * @param bool $quote_empty  emit `""` for the empty string
     * @return string
     */
    static function quote($text, $quote_empty = false) {
        if ($text === "")
            return $quote_empty ? '""' : $text;
        else if (preg_match('/\A[-_@\$+A-Za-z0-9.](?:[-_@\$+A-Za-z0-9. \t]*[-_\$+A-Za-z0-9.]|)\z/', $text))
            return $text;
        else
            return self::always_quote($text);
    }


    /**
     * @param int $flags  a TYPE_* value optionally OR-ed with FLAG_* values
     */
    function __construct($flags = self::TYPE_COMMA) {
        $this->type = $flags & self::FLAG_TYPE;
        $this->flags = $flags;
        if ($this->flags & self::FLAG_CRLF)
            $this->lf = "\r\n";
        else if ($this->flags & self::FLAG_CR)
            $this->lf = "\r";
    }

    /**
     * Choose which row keys are output, and in what order. Must be called
     * before any row has been added.
     *
     * - $header false or []: use $selection, emit no header line.
     * - $header a list: use $selection, emit $header as the header line.
     * - $header null, $selection associative: its keys are the selection
     *   and its values form the header line.
     * - $header null otherwise: $selection is also the header line.
     *
     * @param array $selection
     * @param array|false|null $header
     * @return $this
     */
    function select($selection, $header = null) {
        assert(empty($this->lines) && $this->headerline === "");
        if ($header === false || $header === []) {
            $this->selection = $selection;
        } else if ($header !== null) {
            assert(is_array($selection) && !is_associative_array($selection)
                   && is_array($header) && !is_associative_array($header)
                   && count($selection) === count($header));
            $this->add($header);
            $this->selection = $selection;
        } else if (is_associative_array($selection)) {
            $this->add(array_values($selection));
            $this->selection = array_keys($selection);
        } else {
            $this->add($selection);
            $this->selection = $selection;
        }
        $this->selection_is_names = true;
        foreach ($this->selection as $s) {
            // ctype_digit() treats small integers as ASCII codes, so real
            // integers are tested separately from digit strings.
            if (is_int($s) || (is_string($s) && ctype_digit($s)))
                $this->selection_is_names = false;
        }
        // the header row written by add() above becomes the header line
        if (!empty($this->lines)) {
            $this->headerline = $this->lines[0];
            $this->lines = [];
            $this->lines_length = 0;
        }
        return $this;
    }

    /** Set the filename sent by download_headers(). */
    function set_filename($filename) {
        $this->filename = $filename;
    }

    /** Force inline (true) or attachment (false) disposition. */
    function set_inline($inline) {
        $this->inline = $inline;
    }


    /** True when no body line has been added (the header is not counted). */
    function is_empty() {
        return empty($this->lines);
    }

    /** True when the output is comma-separated. */
    function is_csv() {
        return $this->type == self::TYPE_COMMA;
    }

    /** Suggested file extension: ".csv" for comma mode, ".txt" otherwise. */
    function extension() {
        return $this->type == self::TYPE_COMMA ? ".csv" : ".txt";
    }

    /**
     * Reduce $row to the selected columns, in selection order.
     *
     * The row is returned untouched when there is no selection, when it is
     * empty, or when the selection is by name and the row is a plain list no
     * longer than the selection. Keys absent from the row become empty
     * strings, but only if a later selected key is present. If nothing
     * matches and the row was an array, its leading positional values are
     * used instead.
     *
     * @param array $row
     * @param bool $is_array  whether the caller's row was an array (as
     *     opposed to an object cast to one)
     * @return array
     */
    private function apply_selection($row, $is_array) {
        if (!$this->selection
            || empty($row)
            || ($this->selection_is_names
                && $is_array
                && !is_associative_array($row)
                && count($row) <= count($this->selection))) {
            return $row;
        }
        $selected = array();
        $i = 0;
        foreach ($this->selection as $key) {
            if (isset($row[$key])) {
                // pad the columns skipped since the last present key
                while (count($selected) < $i)
                    $selected[] = "";
                $selected[] = $row[$key];
            }
            ++$i;
        }
        if (empty($selected) && $is_array) {
            for ($i = 0;
                 array_key_exists($i, $row) && $i != count($this->selection);
                 ++$i)
                $selected[] = $row[$i];
        }
        return $selected;
    }

    /**
     * Append $text verbatim as a body line. The caller supplies any line
     * terminator.
     *
     * @return $this
     */
    function add_string($text) {
        $this->lines[] = $text;
        $this->lines_length += strlen($text);
        return $this;
    }

    /**
     * Append $text as comment lines. Each line of $text is prefixed with
     * "# " and ended with this generator's line terminator.
     *
     * @return $this
     */
    function add_comment($text) {
        preg_match_all('/([^\r\n]*)(?:\r\n?|\n|\z)/', $text, $m);
        // the pattern also matches an empty string at the end of $text
        if ($m[1][count($m[1]) - 1] === "")
            array_pop($m[1]);
        foreach ($m[1] as $x)
            $this->add_string($this->comment . $x . $this->lf);
        return $this;
    }

    /**
     * Append one row, or a collection of rows.
     *
     * If the first element of $row is itself an array or object, every
     * element is added as a row. Otherwise $row is a single row (an array,
     * or an object that is cast to one). A string argument is logged as
     * unexpected and appended verbatim. Empty input is ignored.
     *
     * With FLAG_ITEM_COMMENTS and an active selection, the row's
     * "__precomment__" and "__postcomment__" entries are emitted as comment
     * lines before and after the row; the post-comment is followed by a
     * blank line.
     *
     * @return $this
     */
    function add($row) {
        if (is_string($row)) {
            error_log("unexpected CsvGenerator::add(string): " . json_encode(debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS)));
            $this->add_string($row);
            return $this;
        } else if (empty($row))
            return $this;
        reset($row);
        if (is_array(current($row)) || is_object(current($row))) {
            foreach ($row as $x)
                $this->add($x);
        } else {
            $is_array = is_array($row);
            if (!$is_array)
                $row = (array) $row;
            $item_comments = ($this->flags & self::FLAG_ITEM_COMMENTS)
                && $this->selection;
            if ($item_comments
                && isset($row["__precomment__"])
                && ($cmt = (string) $row["__precomment__"]) !== "")
                $this->add_comment($cmt);
            $srow = $row;
            if ($this->selection)
                $srow = $this->apply_selection($srow, $is_array);
            if ($this->type == self::TYPE_COMMA) {
                $always = ($this->flags & self::FLAG_ALWAYS_QUOTE) !== 0;
                // assign by key rather than by reference, so that no
                // dangling reference into $srow is left behind
                foreach ($srow as $k => $v)
                    $srow[$k] = $always ? self::always_quote($v) : self::quote($v);
                $this->add_string(join(",", $srow) . $this->lf);
            } else if ($this->type == self::TYPE_TAB)
                $this->add_string(join("\t", $srow) . $this->lf);
            else
                $this->add_string(join("|", $srow) . $this->lf);
            if ($item_comments
                && isset($row["__postcomment__"])
                && ($cmt = (string) $row["__postcomment__"]) !== "") {
                $this->add_comment($cmt);
                $this->add_string($this->lf);
            }
        }
        return $this;
    }

    /**
     * Sort the body lines in place; the header line is not affected.
     *
     * @param int $flags  sort flags passed to PHP's sort()
     * @return $this
     */
    function sort($flags = SORT_NORMAL) {
        sort($this->lines, $flags);
        return $this;
    }


    /** Return the complete output: the header line followed by the body. */
    function unparse() {
        return $this->headerline . join("", $this->lines);
    }

    /**
     * Send the Content-Type, Content-Disposition, and
     * X-Content-Type-Options headers for a download. The filename defaults
     * to "data" plus extension().
     */
    function download_headers() {
        if ($this->is_csv())
            header("Content-Type: text/csv; charset=utf-8; header=" . ($this->headerline !== "" ? "present" : "absent"));
        else
            header("Content-Type: text/plain; charset=utf-8");
        $inline = $this->inline;
        if ($inline === null)
            $inline = Mimetype::disposition_inline($this->is_csv() ? "text/csv" : "text/plain");
        $filename = $this->filename;
        if (!$filename)
            $filename = "data" . $this->extension();
        header("Content-Disposition: " . ($inline ? "inline" : "attachment") . "; filename=" . mime_quote_string($filename));
        // reduce likelihood of XSS attacks in IE
        header("X-Content-Type-Options: nosniff");
    }

    /**
     * Echo the header line and body. A Content-Length header is sent first
     * unless the global $zlib_output_compression is truthy.
     */
    function download() {
        global $zlib_output_compression;
        if (!$zlib_output_compression)
            header("Content-Length: " . (strlen($this->headerline) + $this->lines_length));
        echo $this->headerline;
        // try to avoid out-of-memory
        if ($this->lines_length <= 10000000)
            echo join("", $this->lines);
        else
            foreach ($this->lines as $line)
                echo $line;
    }
}
486