<?php
// csv.php -- HotCRP CSV parsing functions
// Copyright (c) 2006-2018 Eddie Kohler; see LICENSE.

/**
 * Line-oriented parser for comma-, bar-, double-bar-, and tab-separated text.
 *
 * Input is held as a list of lines (each normally keeping its own line
 * terminator) plus a cursor. Each call to shift()/next() consumes one or more
 * lines and returns one row as an array of fields. When a header has been set,
 * fields are keyed by the corresponding header entry instead of by position.
 */
class CsvParser {
    // Input lines. Elements are normally strings; an element that is already
    // an array is treated as a pre-split row (see shift()).
    private $lines;
    // Index of the next element of $lines to consume.
    private $lpos = 0;
    // Current TYPE_* value (a bitmask of candidates while still guessing).
    private $type;
    // Name of the method used to split a line for the current type.
    private $typefn;
    private $header = false;
    // String of characters that mark a comment line when they appear first,
    // or false to disable comment handling.
    private $comment_chars = false;
    // Optional callable invoked with each skipped comment line.
    private $comment_function = null;

    const TYPE_COMMA = 1;
    const TYPE_PIPE = 2;
    const TYPE_BAR = 2;
    const TYPE_TAB = 4;
    const TYPE_DOUBLEBAR = 8;
    // COMMA | BAR | TAB: the candidates parse_guess() chooses among.
    const TYPE_GUESS = 7;

    /**
     * Split a string into lines, keeping each line's terminator
     * ("\r\n", "\r", or "\n") attached. Empty pieces are discarded.
     *
     * @param string $str
     * @return array list of line strings
     */
    static public function split_lines($str) {
        $b = array();
        foreach (preg_split('/([^\r\n]*(?:\z|\r\n?|\n))/', $str, 0, PREG_SPLIT_DELIM_CAPTURE) as $line)
            if ($line !== "")
                $b[] = $line;
        return $b;
    }

    /**
     * @param string|array $str  text to parse, or an already-split list of lines
     * @param int $type          one of the TYPE_* constants
     */
    function __construct($str, $type = self::TYPE_COMMA) {
        $this->lines = is_array($str) ? $str : self::split_lines($str);
        $this->set_type($type);
    }

    /** Record the separator type and select the matching split method. */
    private function set_type($type) {
        $this->type = $type;
        if ($this->type === self::TYPE_COMMA)
            $this->typefn = "parse_comma";
        else if ($this->type === self::TYPE_BAR)
            $this->typefn = "parse_bar";
        else if ($this->type === self::TYPE_TAB)
            $this->typefn = "parse_tab";
        else if ($this->type === self::TYPE_DOUBLEBAR)
            $this->typefn = "parse_doublebar";
        else
            $this->typefn = "parse_guess";
    }

    /** Set the characters that, as the first character of a line, mark a comment. */
    function set_comment_chars($s) {
        $this->comment_chars = $s;
    }

    /** Set a callable that receives each comment line skipped by shift(). */
    function set_comment_function($f) {
        $this->comment_function = $f;
    }

    /** Return the current header (false when none has been set). */
    function header() {
        return $this->header;
    }

    /** Set the header used to key the fields of subsequently parsed rows. */
    function set_header($header) {
        $this->header = $header;
    }

    /**
     * Length of $line excluding one trailing "\n" and then one trailing "\r".
     *
     * @param string $line
     * @return int
     */
    static function linelen($line) {
        $len = strlen($line);
        if ($len > 0 && $line[$len - 1] === "\n")
            --$len;
        if ($len > 0 && $line[$len - 1] === "\r")
            --$len;
        return $len;
    }

    /** Return the number of input lines consumed so far. */
    function lineno() {
        return $this->lpos;
    }

    /**
     * Return the next parsed row, skipping blank and comment lines.
     *
     * @return array|false the row, or false when the input is exhausted
     */
    function next() {
        while (($line = $this->shift()) === null)
            /* loop */;
        return $line;
    }

    /**
     * Push $line back so the next shift() returns it. Null and false are
     * ignored. If lines have been consumed, the slot just before the cursor
     * is overwritten; otherwise the line is prepended.
     */
    function unshift($line) {
        if ($line === null || $line === false)
            /* do nothing */;
        else if ($this->lpos > 0) {
            $this->lines[$this->lpos - 1] = $line;
            --$this->lpos;
        } else
            array_unshift($this->lines, $line);
    }

    /**
     * Consume one input element and parse it.
     *
     * @return array|null|false the parsed row; null for a blank or comment
     *                          line; false when the input is exhausted
     */
    function shift() {
        if ($this->lpos >= count($this->lines))
            return false;
        $line = $this->lines[$this->lpos];
        ++$this->lpos;
        // pre-split rows only need header keying
        if (is_array($line))
            return self::reparse($line, $this->header);
        // blank lines, comments
        if ($line === "" || $line[0] === "\n" || $line[0] === "\r")
            return null;
        if ($this->comment_chars
            && strpos($this->comment_chars, $line[0]) !== false) {
            if ($this->comment_function)
                call_user_func($this->comment_function, $line);
            return null;
        }
        // split on type
        $fn = $this->typefn;
        return $this->$fn($line, $this->header);
    }

    /**
     * Choose a separator by counting candidate separators in $line, fix the
     * parser to that type for all later lines, and parse $line with it.
     * Only types whose bit is set in the current type are considered.
     */
    private function parse_guess($line, $header) {
        $pipe = $tab = $comma = $doublepipe = -1;
        if ($this->type & self::TYPE_BAR)
            $pipe = substr_count($line, "|");
        if ($this->type & self::TYPE_DOUBLEBAR)
            $doublepipe = substr_count($line, "||");
        // if (nearly) every bar is part of a "||", rule out single-bar
        if ($doublepipe > 0 && $pipe > 0 && $doublepipe * 2.1 > $pipe)
            $pipe = -1;
        if ($this->type & self::TYPE_TAB)
            $tab = substr_count($line, "\t");
        if ($this->type & self::TYPE_COMMA)
            $comma = substr_count($line, ",");
        if ($tab > $pipe && $tab > $doublepipe && $tab > $comma)
            $this->set_type(self::TYPE_TAB);
        else if ($doublepipe > $pipe && $doublepipe > $comma)
            $this->set_type(self::TYPE_DOUBLEBAR);
        else if ($pipe > $comma)
            $this->set_type(self::TYPE_PIPE);
        else
            $this->set_type(self::TYPE_COMMA);
        $fn = $this->typefn;
        assert($fn !== "parse_guess");
        return $this->$fn($line, $header);
    }

    /**
     * Parse one comma-separated record. A field that starts with a double
     * quote runs to the matching closing quote, with "" standing for a
     * literal quote; if the line ends first, following input lines are
     * appended (and consumed) until the quote closes or input runs out.
     *
     * @param string $line
     * @param mixed $header  header used to key fields, or a falsy value
     * @return array
     */
    function parse_comma($line, $header) {
        $fields = array();
        $linelen = self::linelen($line);
        $pos = 0;
        while ($pos != $linelen) {
            // every field after the first is preceded by a separator
            if (!empty($fields) && $line[$pos] === ",")
                ++$pos;
            $bpos = $pos;
            if ($pos != $linelen && $line[$pos] === "\"") {
                while (1) {
                    $pos = strpos($line, "\"", $pos + 1);
                    if ($pos === false) {
                        // unterminated quote: pull in the next input line
                        $pos = $linelen;
                        if ($this->lpos == count($this->lines))
                            break;
                        $line .= $this->lines[$this->lpos];
                        ++$this->lpos;
                        $linelen = self::linelen($line);
                    } else if ($pos + 1 < $linelen && $line[$pos + 1] === "\"")
                        ++$pos; // doubled quote: skip it and keep scanning
                    else
                        break;
                }
                $fields[] = str_replace("\"\"", "\"", substr($line, $bpos + 1, $pos - $bpos - 1));
                if ($pos != $linelen)
                    ++$pos; // step over the closing quote
            } else {
                $pos = strpos($line, ",", $pos);
                if ($pos === false)
                    $pos = $linelen;
                $fields[] = substr($line, $bpos, $pos - $bpos);
            }
        }
        return self::reparse($fields, $header);
    }

    /**
     * Split $line (minus its line terminator) on the literal separator $sep.
     * No quoting is recognized. A separator at the very end of the line does
     * not produce a trailing empty field.
     *
     * @param string $line
     * @param string $sep  non-empty separator containing no "\r" or "\n"
     * @return array list of field strings
     */
    private static function split_on($line, $sep) {
        $fields = array();
        $linelen = self::linelen($line);
        $seplen = strlen($sep);
        $pos = 0;
        while ($pos != $linelen) {
            $bpos = $pos;
            $pos = strpos($line, $sep, $pos);
            if ($pos === false)
                $pos = $linelen;
            $fields[] = substr($line, $bpos, $pos - $bpos);
            // $pos != $linelen means strpos() found $sep at $pos
            if ($pos != $linelen)
                $pos += $seplen;
        }
        return $fields;
    }

    /** Parse one "|"-separated line; see split_on() for the splitting rules. */
    function parse_bar($line, $header) {
        return self::reparse(self::split_on($line, "|"), $header);
    }

    /** Parse one "||"-separated line; see split_on() for the splitting rules. */
    function parse_doublebar($line, $header) {
        return self::reparse(self::split_on($line, "||"), $header);
    }

    /** Parse one tab-separated line; see split_on() for the splitting rules. */
    function parse_tab($line, $header) {
        return self::reparse(self::split_on($line, "\t"), $header);
    }

    /**
     * Key a list of fields by header. Field number $i is stored under
     * $header[$i] when a header is set and get_s($header, $i) is not the
     * empty string; otherwise it is stored under the integer $i.
     * (get_s() is a project helper defined outside this file.)
     *
     * @param array $line    list of field values
     * @param mixed $header  header, or a falsy value for positional keys
     * @return array
     */
    static function reparse($line, $header) {
        $i = 0;
        $a = array();
        foreach ($line as $field) {
            if ($header && get_s($header, $i) !== "")
                $a[$header[$i]] = $field;
            else
                $a[$i] = $field;
            ++$i;
        }
        return $a;
    }
}

/**
 * Accumulates rows and renders them as comma-, bar-, or tab-separated text,
 * optionally restricted to and ordered by a column selection.
 */
class CsvGenerator {
    const TYPE_COMMA = 0;
    const TYPE_PIPE = 1;
    const TYPE_TAB = 2;
    // Mask extracting the TYPE_* value from the constructor flags.
    const FLAG_TYPE = 3;
    const FLAG_ALWAYS_QUOTE = 4;
    const FLAG_CRLF = 8;
    const FLAG_CR = 16;
    const FLAG_LF = 0;
    // Emit "__precomment__"/"__postcomment__" row entries as comment lines.
    const FLAG_ITEM_COMMENTS = 32;

    private $type;
    private $flags;
    // Rendered output lines, each including its line terminator.
    private $lines = array();
    // Total byte length of $lines.
    private $lines_length = 0;
    public $headerline = "";
    // List of row keys to output, in order; null outputs rows unchanged.
    private $selection = null;
    // True when no selection key is numeric (see select()).
    private $selection_is_names = false;
    private $lf = "\n";
    private $comment = "# ";
    private $inline = null;
    private $filename;

    /** Wrap $text in double quotes, doubling any embedded double quote. */
    static function always_quote($text) {
        return '"' . str_replace('"', '""', $text) . '"';
    }

    /**
     * Quote $text only if needed: text matching the safe-character pattern
     * below is returned unchanged, anything else goes through always_quote().
     *
     * @param string $text
     * @param bool $quote_empty  render the empty string as "" rather than nothing
     * @return string
     */
    static function quote($text, $quote_empty = false) {
        if ($text === "")
            return $quote_empty ? '""' : $text;
        else if (preg_match('/\A[-_@\$+A-Za-z0-9.](?:[-_@\$+A-Za-z0-9. \t]*[-_\$+A-Za-z0-9.]|)\z/', $text))
            return $text;
        else
            return self::always_quote($text);
    }


    /**
     * @param int $flags  a TYPE_* value optionally OR'ed with FLAG_* bits
     */
    function __construct($flags = self::TYPE_COMMA) {
        $this->type = $flags & self::FLAG_TYPE;
        $this->flags = $flags;
        if ($this->flags & self::FLAG_CRLF)
            $this->lf = "\r\n";
        else if ($this->flags & self::FLAG_CR)
            $this->lf = "\r";
    }

    /**
     * Choose which row keys are output, and in what order. Must be called
     * before any row is added.
     *
     * - $header false or []: use $selection as the keys; no header line.
     * - $header an array: use $selection as the keys and $header as the
     *   header line.
     * - $header null: an associative $selection maps key => header text;
     *   otherwise $selection serves as both keys and header text.
     *
     * @param array $selection
     * @param array|false|null $header
     * @return $this
     */
    function select($selection, $header = null) {
        assert(empty($this->lines) && $this->headerline === "");
        // NB: the header row is added while $this->selection is still unset,
        // so it is rendered without being filtered.
        if ($header === false || $header === []) {
            $this->selection = $selection;
        } else if ($header !== null) {
            assert(is_array($selection) && !is_associative_array($selection)
                   && is_array($header) && !is_associative_array($header)
                   && count($selection) === count($header));
            $this->add($header);
            $this->selection = $selection;
        } else if (is_associative_array($selection)) {
            $this->add(array_values($selection));
            $this->selection = array_keys($selection);
        } else {
            $this->add($selection);
            $this->selection = $selection;
        }
        $this->selection_is_names = true;
        foreach ($this->selection as $s) {
            // Integer keys are numeric by definition. ctype_digit() must only
            // see strings: given an int in -128..255 it tests the ASCII code
            // point instead of the decimal digits (and non-string arguments
            // are deprecated as of PHP 8.1).
            if (is_int($s) || (is_string($s) && ctype_digit($s)))
                $this->selection_is_names = false;
        }
        // move the header row rendered above out of the data lines
        if (!empty($this->lines)) {
            $this->headerline = $this->lines[0];
            $this->lines = [];
            $this->lines_length = 0;
        }
        return $this;
    }

    /** Set the filename advertised by download_headers(). */
    function set_filename($filename) {
        $this->filename = $filename;
    }

    /** Force inline (true) or attachment (false) disposition; null leaves the choice to download_headers(). */
    function set_inline($inline) {
        $this->inline = $inline;
    }


    /** True when no data lines have been added (the header line is not counted). */
    function is_empty() {
        return empty($this->lines);
    }

    function is_csv() {
        return $this->type == self::TYPE_COMMA;
    }

    /** File extension for the output type: ".csv" for comma, ".txt" otherwise. */
    function extension() {
        return $this->type == self::TYPE_COMMA ? ".csv" : ".txt";
    }

    /**
     * Reduce $row to the selected keys, in selection order.
     *
     * A list row no longer than the selection is returned unchanged when the
     * selection consists of names (it is taken to be in selection order
     * already). Otherwise, values are picked by key; keys missing from the
     * row (or holding null) become "" when a later selected key is present
     * and are dropped at the end of the row. If no key matched and the row
     * was an array, its leading positional values are used instead.
     *
     * @param array $row
     * @param bool $is_array  whether the caller's row was an array (not an object)
     * @return array
     */
    private function apply_selection($row, $is_array) {
        if (!$this->selection
            || empty($row)
            || ($this->selection_is_names
                && $is_array
                && !is_associative_array($row)
                && count($row) <= count($this->selection))) {
            return $row;
        }
        $selected = array();
        $i = 0;
        foreach ($this->selection as $key) {
            if (isset($row[$key])) {
                // pad skipped columns so this value lands in column $i
                while (count($selected) < $i)
                    $selected[] = "";
                $selected[] = $row[$key];
            }
            ++$i;
        }
        if (empty($selected) && $is_array) {
            for ($i = 0;
                 array_key_exists($i, $row) && $i != count($this->selection);
                 ++$i)
                $selected[] = $row[$i];
        }
        return $selected;
    }

    /**
     * Append $text verbatim as an output line (no quoting, no terminator added).
     *
     * @return $this
     */
    function add_string($text) {
        $this->lines[] = $text;
        $this->lines_length += strlen($text);
        return $this;
    }

    /**
     * Append $text as comment lines: each line of $text is prefixed with
     * "# " and terminated with the generator's line ending.
     *
     * @return $this
     */
    function add_comment($text) {
        preg_match_all('/([^\r\n]*)(?:\r\n?|\n|\z)/', $text, $m);
        // the pattern always yields a final empty match at end of text
        if ($m[1][count($m[1]) - 1] === "")
            array_pop($m[1]);
        foreach ($m[1] as $x)
            $this->add_string($this->comment . $x . $this->lf);
        return $this;
    }

    /**
     * Append a row, or a list of rows.
     *
     * $row may be an array or object of field values, or an array whose
     * first element is itself an array or object, in which case each element
     * is added as a row. A string is logged as unexpected and appended
     * verbatim. Empty rows are ignored. Fields are quoted only for comma
     * output.
     *
     * @param array|object|string $row
     * @return $this
     */
    function add($row) {
        if (is_string($row)) {
            error_log("unexpected CsvGenerator::add(string): " . json_encode(debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS)));
            $this->add_string($row);
            return $this;
        } else if (empty($row))
            return $this;
        reset($row);
        if (is_array(current($row)) || is_object(current($row))) {
            foreach ($row as $x)
                $this->add($x);
        } else {
            $is_array = is_array($row);
            if (!$is_array)
                $row = (array) $row;
            if (($this->flags & self::FLAG_ITEM_COMMENTS)
                && $this->selection
                && isset($row["__precomment__"])
                && ($cmt = (string) $row["__precomment__"]) !== "")
                $this->add_comment($cmt);
            $srow = $row;
            if ($this->selection)
                $srow = $this->apply_selection($srow, $is_array);
            if ($this->type == self::TYPE_COMMA) {
                // assign by key rather than iterating by reference, so no
                // dangling reference outlives the loop
                $always = ($this->flags & self::FLAG_ALWAYS_QUOTE) !== 0;
                foreach ($srow as $k => $v)
                    $srow[$k] = $always ? self::always_quote($v) : self::quote($v);
                $this->add_string(join(",", $srow) . $this->lf);
            } else if ($this->type == self::TYPE_TAB)
                $this->add_string(join("\t", $srow) . $this->lf);
            else
                $this->add_string(join("|", $srow) . $this->lf);
            if (($this->flags & self::FLAG_ITEM_COMMENTS)
                && $this->selection
                && isset($row["__postcomment__"])
                && ($cmt = (string) $row["__postcomment__"]) !== "") {
                $this->add_comment($cmt);
                $this->add_string($this->lf);
            }
        }
        return $this;
    }

    /**
     * Sort the accumulated data lines (the header line is stored separately
     * and is unaffected).
     *
     * @param int $flags  sort() flags
     * @return $this
     */
    function sort($flags = SORT_NORMAL) {
        sort($this->lines, $flags);
        return $this;
    }


    /** Return the full output: header line followed by all data lines. */
    function unparse() {
        return $this->headerline . join("", $this->lines);
    }

    /**
     * Send Content-Type, Content-Disposition, and nosniff headers for the
     * output. When no inline preference was set, it is taken from
     * Mimetype::disposition_inline(); the filename defaults to "data" plus
     * extension(). (Mimetype and mime_quote_string() are defined outside
     * this file.)
     */
    function download_headers() {
        if ($this->is_csv())
            header("Content-Type: text/csv; charset=utf-8; header=" . ($this->headerline !== "" ? "present" : "absent"));
        else
            header("Content-Type: text/plain; charset=utf-8");
        $inline = $this->inline;
        if ($inline === null)
            $inline = Mimetype::disposition_inline($this->is_csv() ? "text/csv" : "text/plain");
        $filename = $this->filename;
        if (!$filename)
            $filename = "data" . $this->extension();
        header("Content-Disposition: " . ($inline ? "inline" : "attachment") . "; filename=" . mime_quote_string($filename));
        // reduce likelihood of XSS attacks in IE
        header("X-Content-Type-Options: nosniff");
    }

    /**
     * Echo the output. A Content-Length header is sent first unless the
     * global $zlib_output_compression is truthy.
     */
    function download() {
        global $zlib_output_compression;
        if (!$zlib_output_compression)
            header("Content-Length: " . (strlen($this->headerline) + $this->lines_length));
        echo $this->headerline;
        // try to avoid out-of-memory
        if ($this->lines_length <= 10000000)
            echo join("", $this->lines);
        else
            foreach ($this->lines as $line)
                echo $line;
    }
}