1<?php 2 3namespace PhpOffice\PhpSpreadsheet\Reader; 4 5use PhpOffice\PhpSpreadsheet\Cell\Coordinate; 6use PhpOffice\PhpSpreadsheet\Shared\StringHelper; 7use PhpOffice\PhpSpreadsheet\Spreadsheet; 8 9class Csv extends BaseReader 10{ 11 /** 12 * Input encoding. 13 * 14 * @var string 15 */ 16 private $inputEncoding = 'UTF-8'; 17 18 /** 19 * Delimiter. 20 * 21 * @var string 22 */ 23 private $delimiter; 24 25 /** 26 * Enclosure. 27 * 28 * @var string 29 */ 30 private $enclosure = '"'; 31 32 /** 33 * Sheet index to read. 34 * 35 * @var int 36 */ 37 private $sheetIndex = 0; 38 39 /** 40 * Load rows contiguously. 41 * 42 * @var bool 43 */ 44 private $contiguous = false; 45 46 /** 47 * Row counter for loading rows contiguously. 48 * 49 * @var int 50 */ 51 private $contiguousRow = -1; 52 53 /** 54 * The character that can escape the enclosure. 55 * 56 * @var string 57 */ 58 private $escapeCharacter = '\\'; 59 60 /** 61 * Create a new CSV Reader instance. 62 */ 63 public function __construct() 64 { 65 $this->readFilter = new DefaultReadFilter(); 66 } 67 68 /** 69 * Set input encoding. 70 * 71 * @param string $pValue Input encoding, eg: 'UTF-8' 72 * 73 * @return Csv 74 */ 75 public function setInputEncoding($pValue) 76 { 77 $this->inputEncoding = $pValue; 78 79 return $this; 80 } 81 82 /** 83 * Get input encoding. 84 * 85 * @return string 86 */ 87 public function getInputEncoding() 88 { 89 return $this->inputEncoding; 90 } 91 92 /** 93 * Move filepointer past any BOM marker. 94 */ 95 protected function skipBOM() 96 { 97 rewind($this->fileHandle); 98 99 switch ($this->inputEncoding) { 100 case 'UTF-8': 101 fgets($this->fileHandle, 4) == "\xEF\xBB\xBF" ? 102 fseek($this->fileHandle, 3) : fseek($this->fileHandle, 0); 103 104 break; 105 case 'UTF-16LE': 106 fgets($this->fileHandle, 3) == "\xFF\xFE" ? 107 fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0); 108 109 break; 110 case 'UTF-16BE': 111 fgets($this->fileHandle, 3) == "\xFE\xFF" ? 112 fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0); 113 114 break; 115 case 'UTF-32LE': 116 fgets($this->fileHandle, 5) == "\xFF\xFE\x00\x00" ? 117 fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0); 118 119 break; 120 case 'UTF-32BE': 121 fgets($this->fileHandle, 5) == "\x00\x00\xFE\xFF" ? 122 fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0); 123 124 break; 125 default: 126 break; 127 } 128 } 129 130 /** 131 * Identify any separator that is explicitly set in the file. 132 */ 133 protected function checkSeparator() 134 { 135 $line = fgets($this->fileHandle); 136 if ($line === false) { 137 return; 138 } 139 140 if ((strlen(trim($line, "\r\n")) == 5) && (stripos($line, 'sep=') === 0)) { 141 $this->delimiter = substr($line, 4, 1); 142 143 return; 144 } 145 146 return $this->skipBOM(); 147 } 148 149 /** 150 * Infer the separator if it isn't explicitly set in the file or specified by the user. 151 */ 152 protected function inferSeparator() 153 { 154 if ($this->delimiter !== null) { 155 return; 156 } 157 158 $potentialDelimiters = [',', ';', "\t", '|', ':', ' ']; 159 $counts = []; 160 foreach ($potentialDelimiters as $delimiter) { 161 $counts[$delimiter] = []; 162 } 163 164 // Count how many times each of the potential delimiters appears in each line 165 $numberLines = 0; 166 while (($line = $this->getNextLine()) !== false && (++$numberLines < 1000)) { 167 $countLine = []; 168 for ($i = strlen($line) - 1; $i >= 0; --$i) { 169 $char = $line[$i]; 170 if (isset($counts[$char])) { 171 if (!isset($countLine[$char])) { 172 $countLine[$char] = 0; 173 } 174 ++$countLine[$char]; 175 } 176 } 177 foreach ($potentialDelimiters as $delimiter) { 178 $counts[$delimiter][] = isset($countLine[$delimiter]) 179 ? $countLine[$delimiter] 180 : 0; 181 } 182 } 183 184 // If number of lines is 0, nothing to infer : fall back to the default 185 if ($numberLines === 0) { 186 $this->delimiter = reset($potentialDelimiters); 187 188 return $this->skipBOM(); 189 } 190 191 // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently) 192 $meanSquareDeviations = []; 193 $middleIdx = floor(($numberLines - 1) / 2); 194 195 foreach ($potentialDelimiters as $delimiter) { 196 $series = $counts[$delimiter]; 197 sort($series); 198 199 $median = ($numberLines % 2) 200 ? $series[$middleIdx] 201 : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2; 202 203 if ($median === 0) { 204 continue; 205 } 206 207 $meanSquareDeviations[$delimiter] = array_reduce( 208 $series, 209 function ($sum, $value) use ($median) { 210 return $sum + pow($value - $median, 2); 211 } 212 ) / count($series); 213 } 214 215 // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected) 216 $min = INF; 217 foreach ($potentialDelimiters as $delimiter) { 218 if (!isset($meanSquareDeviations[$delimiter])) { 219 continue; 220 } 221 222 if ($meanSquareDeviations[$delimiter] < $min) { 223 $min = $meanSquareDeviations[$delimiter]; 224 $this->delimiter = $delimiter; 225 } 226 } 227 228 // If no delimiter could be detected, fall back to the default 229 if ($this->delimiter === null) { 230 $this->delimiter = reset($potentialDelimiters); 231 } 232 233 return $this->skipBOM(); 234 } 235 236 /** 237 * Get the next full line from the file. 238 * 239 * @param string $line 240 * 241 * @return bool|string 242 */ 243 private function getNextLine($line = '') 244 { 245 // Get the next line in the file 246 $newLine = fgets($this->fileHandle); 247 248 // Return false if there is no next line 249 if ($newLine === false) { 250 return false; 251 } 252 253 // Add the new line to the line passed in 254 $line = $line . $newLine; 255 256 // Drop everything that is enclosed to avoid counting false positives in enclosures 257 $enclosure = preg_quote($this->enclosure, '/'); 258 $line = preg_replace('/(' . $enclosure . '.*' . $enclosure . ')/U', '', $line); 259 260 // See if we have any enclosures left in the line 261 $matches = []; 262 preg_match('/(' . $enclosure . ')/', $line, $matches); 263 264 // if we still have an enclosure then we need to read the next line aswell 265 if (count($matches) > 0) { 266 $line = $this->getNextLine($line); 267 } 268 269 return $line; 270 } 271 272 /** 273 * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns). 274 * 275 * @param string $pFilename 276 * 277 * @throws Exception 278 * 279 * @return array 280 */ 281 public function listWorksheetInfo($pFilename) 282 { 283 // Open file 284 if (!$this->canRead($pFilename)) { 285 throw new Exception($pFilename . ' is an Invalid Spreadsheet file.'); 286 } 287 $this->openFile($pFilename); 288 $fileHandle = $this->fileHandle; 289 290 // Skip BOM, if any 291 $this->skipBOM(); 292 $this->checkSeparator(); 293 $this->inferSeparator(); 294 295 $worksheetInfo = []; 296 $worksheetInfo[0]['worksheetName'] = 'Worksheet'; 297 $worksheetInfo[0]['lastColumnLetter'] = 'A'; 298 $worksheetInfo[0]['lastColumnIndex'] = 0; 299 $worksheetInfo[0]['totalRows'] = 0; 300 $worksheetInfo[0]['totalColumns'] = 0; 301 302 // Loop through each line of the file in turn 303 while (($rowData = fgetcsv($fileHandle, 0, $this->delimiter, $this->enclosure, $this->escapeCharacter)) !== false) { 304 ++$worksheetInfo[0]['totalRows']; 305 $worksheetInfo[0]['lastColumnIndex'] = max($worksheetInfo[0]['lastColumnIndex'], count($rowData) - 1); 306 } 307 308 $worksheetInfo[0]['lastColumnLetter'] = Coordinate::stringFromColumnIndex($worksheetInfo[0]['lastColumnIndex'] + 1); 309 $worksheetInfo[0]['totalColumns'] = $worksheetInfo[0]['lastColumnIndex'] + 1; 310 311 // Close file 312 fclose($fileHandle); 313 314 return $worksheetInfo; 315 } 316 317 /** 318 * Loads Spreadsheet from file. 319 * 320 * @param string $pFilename 321 * 322 * @throws Exception 323 * 324 * @return Spreadsheet 325 */ 326 public function load($pFilename) 327 { 328 // Create new Spreadsheet 329 $spreadsheet = new Spreadsheet(); 330 331 // Load into this instance 332 return $this->loadIntoExisting($pFilename, $spreadsheet); 333 } 334 335 /** 336 * Loads PhpSpreadsheet from file into PhpSpreadsheet instance. 337 * 338 * @param string $pFilename 339 * @param Spreadsheet $spreadsheet 340 * 341 * @throws Exception 342 * 343 * @return Spreadsheet 344 */ 345 public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet) 346 { 347 $lineEnding = ini_get('auto_detect_line_endings'); 348 ini_set('auto_detect_line_endings', true); 349 350 // Open file 351 if (!$this->canRead($pFilename)) { 352 throw new Exception($pFilename . ' is an Invalid Spreadsheet file.'); 353 } 354 $this->openFile($pFilename); 355 $fileHandle = $this->fileHandle; 356 357 // Skip BOM, if any 358 $this->skipBOM(); 359 $this->checkSeparator(); 360 $this->inferSeparator(); 361 362 // Create new PhpSpreadsheet object 363 while ($spreadsheet->getSheetCount() <= $this->sheetIndex) { 364 $spreadsheet->createSheet(); 365 } 366 $sheet = $spreadsheet->setActiveSheetIndex($this->sheetIndex); 367 368 // Set our starting row based on whether we're in contiguous mode or not 369 $currentRow = 1; 370 if ($this->contiguous) { 371 $currentRow = ($this->contiguousRow == -1) ? $sheet->getHighestRow() : $this->contiguousRow; 372 } 373 374 // Loop through each line of the file in turn 375 while (($rowData = fgetcsv($fileHandle, 0, $this->delimiter, $this->enclosure, $this->escapeCharacter)) !== false) { 376 $columnLetter = 'A'; 377 foreach ($rowData as $rowDatum) { 378 if ($rowDatum != '' && $this->readFilter->readCell($columnLetter, $currentRow)) { 379 // Convert encoding if necessary 380 if ($this->inputEncoding !== 'UTF-8') { 381 $rowDatum = StringHelper::convertEncoding($rowDatum, 'UTF-8', $this->inputEncoding); 382 } 383 384 // Set cell value 385 $sheet->getCell($columnLetter . $currentRow)->setValue($rowDatum); 386 } 387 ++$columnLetter; 388 } 389 ++$currentRow; 390 } 391 392 // Close file 393 fclose($fileHandle); 394 395 if ($this->contiguous) { 396 $this->contiguousRow = $currentRow; 397 } 398 399 ini_set('auto_detect_line_endings', $lineEnding); 400 401 // Return 402 return $spreadsheet; 403 } 404 405 /** 406 * Get delimiter. 407 * 408 * @return string 409 */ 410 public function getDelimiter() 411 { 412 return $this->delimiter; 413 } 414 415 /** 416 * Set delimiter. 417 * 418 * @param string $delimiter Delimiter, eg: ',' 419 * 420 * @return CSV 421 */ 422 public function setDelimiter($delimiter) 423 { 424 $this->delimiter = $delimiter; 425 426 return $this; 427 } 428 429 /** 430 * Get enclosure. 431 * 432 * @return string 433 */ 434 public function getEnclosure() 435 { 436 return $this->enclosure; 437 } 438 439 /** 440 * Set enclosure. 441 * 442 * @param string $enclosure Enclosure, defaults to " 443 * 444 * @return CSV 445 */ 446 public function setEnclosure($enclosure) 447 { 448 if ($enclosure == '') { 449 $enclosure = '"'; 450 } 451 $this->enclosure = $enclosure; 452 453 return $this; 454 } 455 456 /** 457 * Get sheet index. 458 * 459 * @return int 460 */ 461 public function getSheetIndex() 462 { 463 return $this->sheetIndex; 464 } 465 466 /** 467 * Set sheet index. 468 * 469 * @param int $pValue Sheet index 470 * 471 * @return CSV 472 */ 473 public function setSheetIndex($pValue) 474 { 475 $this->sheetIndex = $pValue; 476 477 return $this; 478 } 479 480 /** 481 * Set Contiguous. 482 * 483 * @param bool $contiguous 484 * 485 * @return Csv 486 */ 487 public function setContiguous($contiguous) 488 { 489 $this->contiguous = (bool) $contiguous; 490 if (!$contiguous) { 491 $this->contiguousRow = -1; 492 } 493 494 return $this; 495 } 496 497 /** 498 * Get Contiguous. 499 * 500 * @return bool 501 */ 502 public function getContiguous() 503 { 504 return $this->contiguous; 505 } 506 507 /** 508 * Set escape backslashes. 509 * 510 * @param string $escapeCharacter 511 * 512 * @return $this 513 */ 514 public function setEscapeCharacter($escapeCharacter) 515 { 516 $this->escapeCharacter = $escapeCharacter; 517 518 return $this; 519 } 520 521 /** 522 * Get escape backslashes. 523 * 524 * @return string 525 */ 526 public function getEscapeCharacter() 527 { 528 return $this->escapeCharacter; 529 } 530 531 /** 532 * Can the current IReader read the file? 533 * 534 * @param string $pFilename 535 * 536 * @return bool 537 */ 538 public function canRead($pFilename) 539 { 540 // Check if file exists 541 try { 542 $this->openFile($pFilename); 543 } catch (Exception $e) { 544 return false; 545 } 546 547 fclose($this->fileHandle); 548 549 // Trust file extension if any 550 if (strtolower(pathinfo($pFilename, PATHINFO_EXTENSION)) === 'csv') { 551 return true; 552 } 553 554 // Attempt to guess mimetype 555 $type = mime_content_type($pFilename); 556 $supportedTypes = [ 557 'text/csv', 558 'text/plain', 559 'inode/x-empty', 560 ]; 561 562 return in_array($type, $supportedTypes, true); 563 } 564} 565