1<?php
2
3namespace PhpOffice\PhpSpreadsheet\Reader;
4
5use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
6use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
7use PhpOffice\PhpSpreadsheet\Spreadsheet;
8
9class Csv extends BaseReader
10{
11    /**
12     * Input encoding.
13     *
14     * @var string
15     */
16    private $inputEncoding = 'UTF-8';
17
18    /**
19     * Delimiter.
20     *
21     * @var string
22     */
23    private $delimiter;
24
25    /**
26     * Enclosure.
27     *
28     * @var string
29     */
30    private $enclosure = '"';
31
32    /**
33     * Sheet index to read.
34     *
35     * @var int
36     */
37    private $sheetIndex = 0;
38
39    /**
40     * Load rows contiguously.
41     *
42     * @var bool
43     */
44    private $contiguous = false;
45
46    /**
47     * Row counter for loading rows contiguously.
48     *
49     * @var int
50     */
51    private $contiguousRow = -1;
52
53    /**
54     * The character that can escape the enclosure.
55     *
56     * @var string
57     */
58    private $escapeCharacter = '\\';
59
60    /**
61     * Create a new CSV Reader instance.
62     */
63    public function __construct()
64    {
65        $this->readFilter = new DefaultReadFilter();
66    }
67
68    /**
69     * Set input encoding.
70     *
71     * @param string $pValue Input encoding, eg: 'UTF-8'
72     *
73     * @return Csv
74     */
75    public function setInputEncoding($pValue)
76    {
77        $this->inputEncoding = $pValue;
78
79        return $this;
80    }
81
82    /**
83     * Get input encoding.
84     *
85     * @return string
86     */
87    public function getInputEncoding()
88    {
89        return $this->inputEncoding;
90    }
91
92    /**
93     * Move filepointer past any BOM marker.
94     */
95    protected function skipBOM()
96    {
97        rewind($this->fileHandle);
98
99        switch ($this->inputEncoding) {
100            case 'UTF-8':
101                fgets($this->fileHandle, 4) == "\xEF\xBB\xBF" ?
102                    fseek($this->fileHandle, 3) : fseek($this->fileHandle, 0);
103
104                break;
105            case 'UTF-16LE':
106                fgets($this->fileHandle, 3) == "\xFF\xFE" ?
107                    fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0);
108
109                break;
110            case 'UTF-16BE':
111                fgets($this->fileHandle, 3) == "\xFE\xFF" ?
112                    fseek($this->fileHandle, 2) : fseek($this->fileHandle, 0);
113
114                break;
115            case 'UTF-32LE':
116                fgets($this->fileHandle, 5) == "\xFF\xFE\x00\x00" ?
117                    fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0);
118
119                break;
120            case 'UTF-32BE':
121                fgets($this->fileHandle, 5) == "\x00\x00\xFE\xFF" ?
122                    fseek($this->fileHandle, 4) : fseek($this->fileHandle, 0);
123
124                break;
125            default:
126                break;
127        }
128    }
129
130    /**
131     * Identify any separator that is explicitly set in the file.
132     */
133    protected function checkSeparator()
134    {
135        $line = fgets($this->fileHandle);
136        if ($line === false) {
137            return;
138        }
139
140        if ((strlen(trim($line, "\r\n")) == 5) && (stripos($line, 'sep=') === 0)) {
141            $this->delimiter = substr($line, 4, 1);
142
143            return;
144        }
145
146        return $this->skipBOM();
147    }
148
149    /**
150     * Infer the separator if it isn't explicitly set in the file or specified by the user.
151     */
152    protected function inferSeparator()
153    {
154        if ($this->delimiter !== null) {
155            return;
156        }
157
158        $potentialDelimiters = [',', ';', "\t", '|', ':', ' '];
159        $counts = [];
160        foreach ($potentialDelimiters as $delimiter) {
161            $counts[$delimiter] = [];
162        }
163
164        // Count how many times each of the potential delimiters appears in each line
165        $numberLines = 0;
166        while (($line = $this->getNextLine()) !== false && (++$numberLines < 1000)) {
167            $countLine = [];
168            for ($i = strlen($line) - 1; $i >= 0; --$i) {
169                $char = $line[$i];
170                if (isset($counts[$char])) {
171                    if (!isset($countLine[$char])) {
172                        $countLine[$char] = 0;
173                    }
174                    ++$countLine[$char];
175                }
176            }
177            foreach ($potentialDelimiters as $delimiter) {
178                $counts[$delimiter][] = isset($countLine[$delimiter])
179                    ? $countLine[$delimiter]
180                    : 0;
181            }
182        }
183
184        // If number of lines is 0, nothing to infer : fall back to the default
185        if ($numberLines === 0) {
186            $this->delimiter = reset($potentialDelimiters);
187
188            return $this->skipBOM();
189        }
190
191        // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently)
192        $meanSquareDeviations = [];
193        $middleIdx = floor(($numberLines - 1) / 2);
194
195        foreach ($potentialDelimiters as $delimiter) {
196            $series = $counts[$delimiter];
197            sort($series);
198
199            $median = ($numberLines % 2)
200                ? $series[$middleIdx]
201                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;
202
203            if ($median === 0) {
204                continue;
205            }
206
207            $meanSquareDeviations[$delimiter] = array_reduce(
208                $series,
209                function ($sum, $value) use ($median) {
210                    return $sum + pow($value - $median, 2);
211                }
212            ) / count($series);
213        }
214
215        // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
216        $min = INF;
217        foreach ($potentialDelimiters as $delimiter) {
218            if (!isset($meanSquareDeviations[$delimiter])) {
219                continue;
220            }
221
222            if ($meanSquareDeviations[$delimiter] < $min) {
223                $min = $meanSquareDeviations[$delimiter];
224                $this->delimiter = $delimiter;
225            }
226        }
227
228        // If no delimiter could be detected, fall back to the default
229        if ($this->delimiter === null) {
230            $this->delimiter = reset($potentialDelimiters);
231        }
232
233        return $this->skipBOM();
234    }
235
236    /**
237     * Get the next full line from the file.
238     *
239     * @param string $line
240     *
241     * @return bool|string
242     */
243    private function getNextLine($line = '')
244    {
245        // Get the next line in the file
246        $newLine = fgets($this->fileHandle);
247
248        // Return false if there is no next line
249        if ($newLine === false) {
250            return false;
251        }
252
253        // Add the new line to the line passed in
254        $line = $line . $newLine;
255
256        // Drop everything that is enclosed to avoid counting false positives in enclosures
257        $enclosure = preg_quote($this->enclosure, '/');
258        $line = preg_replace('/(' . $enclosure . '.*' . $enclosure . ')/U', '', $line);
259
260        // See if we have any enclosures left in the line
261        $matches = [];
262        preg_match('/(' . $enclosure . ')/', $line, $matches);
263
264        // if we still have an enclosure then we need to read the next line aswell
265        if (count($matches) > 0) {
266            $line = $this->getNextLine($line);
267        }
268
269        return $line;
270    }
271
272    /**
273     * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
274     *
275     * @param string $pFilename
276     *
277     * @throws Exception
278     *
279     * @return array
280     */
281    public function listWorksheetInfo($pFilename)
282    {
283        // Open file
284        if (!$this->canRead($pFilename)) {
285            throw new Exception($pFilename . ' is an Invalid Spreadsheet file.');
286        }
287        $this->openFile($pFilename);
288        $fileHandle = $this->fileHandle;
289
290        // Skip BOM, if any
291        $this->skipBOM();
292        $this->checkSeparator();
293        $this->inferSeparator();
294
295        $worksheetInfo = [];
296        $worksheetInfo[0]['worksheetName'] = 'Worksheet';
297        $worksheetInfo[0]['lastColumnLetter'] = 'A';
298        $worksheetInfo[0]['lastColumnIndex'] = 0;
299        $worksheetInfo[0]['totalRows'] = 0;
300        $worksheetInfo[0]['totalColumns'] = 0;
301
302        // Loop through each line of the file in turn
303        while (($rowData = fgetcsv($fileHandle, 0, $this->delimiter, $this->enclosure, $this->escapeCharacter)) !== false) {
304            ++$worksheetInfo[0]['totalRows'];
305            $worksheetInfo[0]['lastColumnIndex'] = max($worksheetInfo[0]['lastColumnIndex'], count($rowData) - 1);
306        }
307
308        $worksheetInfo[0]['lastColumnLetter'] = Coordinate::stringFromColumnIndex($worksheetInfo[0]['lastColumnIndex'] + 1);
309        $worksheetInfo[0]['totalColumns'] = $worksheetInfo[0]['lastColumnIndex'] + 1;
310
311        // Close file
312        fclose($fileHandle);
313
314        return $worksheetInfo;
315    }
316
317    /**
318     * Loads Spreadsheet from file.
319     *
320     * @param string $pFilename
321     *
322     * @throws Exception
323     *
324     * @return Spreadsheet
325     */
326    public function load($pFilename)
327    {
328        // Create new Spreadsheet
329        $spreadsheet = new Spreadsheet();
330
331        // Load into this instance
332        return $this->loadIntoExisting($pFilename, $spreadsheet);
333    }
334
335    /**
336     * Loads PhpSpreadsheet from file into PhpSpreadsheet instance.
337     *
338     * @param string $pFilename
339     * @param Spreadsheet $spreadsheet
340     *
341     * @throws Exception
342     *
343     * @return Spreadsheet
344     */
345    public function loadIntoExisting($pFilename, Spreadsheet $spreadsheet)
346    {
347        $lineEnding = ini_get('auto_detect_line_endings');
348        ini_set('auto_detect_line_endings', true);
349
350        // Open file
351        if (!$this->canRead($pFilename)) {
352            throw new Exception($pFilename . ' is an Invalid Spreadsheet file.');
353        }
354        $this->openFile($pFilename);
355        $fileHandle = $this->fileHandle;
356
357        // Skip BOM, if any
358        $this->skipBOM();
359        $this->checkSeparator();
360        $this->inferSeparator();
361
362        // Create new PhpSpreadsheet object
363        while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
364            $spreadsheet->createSheet();
365        }
366        $sheet = $spreadsheet->setActiveSheetIndex($this->sheetIndex);
367
368        // Set our starting row based on whether we're in contiguous mode or not
369        $currentRow = 1;
370        if ($this->contiguous) {
371            $currentRow = ($this->contiguousRow == -1) ? $sheet->getHighestRow() : $this->contiguousRow;
372        }
373
374        // Loop through each line of the file in turn
375        while (($rowData = fgetcsv($fileHandle, 0, $this->delimiter, $this->enclosure, $this->escapeCharacter)) !== false) {
376            $columnLetter = 'A';
377            foreach ($rowData as $rowDatum) {
378                if ($rowDatum != '' && $this->readFilter->readCell($columnLetter, $currentRow)) {
379                    // Convert encoding if necessary
380                    if ($this->inputEncoding !== 'UTF-8') {
381                        $rowDatum = StringHelper::convertEncoding($rowDatum, 'UTF-8', $this->inputEncoding);
382                    }
383
384                    // Set cell value
385                    $sheet->getCell($columnLetter . $currentRow)->setValue($rowDatum);
386                }
387                ++$columnLetter;
388            }
389            ++$currentRow;
390        }
391
392        // Close file
393        fclose($fileHandle);
394
395        if ($this->contiguous) {
396            $this->contiguousRow = $currentRow;
397        }
398
399        ini_set('auto_detect_line_endings', $lineEnding);
400
401        // Return
402        return $spreadsheet;
403    }
404
405    /**
406     * Get delimiter.
407     *
408     * @return string
409     */
410    public function getDelimiter()
411    {
412        return $this->delimiter;
413    }
414
415    /**
416     * Set delimiter.
417     *
418     * @param string $delimiter Delimiter, eg: ','
419     *
420     * @return CSV
421     */
422    public function setDelimiter($delimiter)
423    {
424        $this->delimiter = $delimiter;
425
426        return $this;
427    }
428
429    /**
430     * Get enclosure.
431     *
432     * @return string
433     */
434    public function getEnclosure()
435    {
436        return $this->enclosure;
437    }
438
439    /**
440     * Set enclosure.
441     *
442     * @param string $enclosure Enclosure, defaults to "
443     *
444     * @return CSV
445     */
446    public function setEnclosure($enclosure)
447    {
448        if ($enclosure == '') {
449            $enclosure = '"';
450        }
451        $this->enclosure = $enclosure;
452
453        return $this;
454    }
455
456    /**
457     * Get sheet index.
458     *
459     * @return int
460     */
461    public function getSheetIndex()
462    {
463        return $this->sheetIndex;
464    }
465
466    /**
467     * Set sheet index.
468     *
469     * @param int $pValue Sheet index
470     *
471     * @return CSV
472     */
473    public function setSheetIndex($pValue)
474    {
475        $this->sheetIndex = $pValue;
476
477        return $this;
478    }
479
480    /**
481     * Set Contiguous.
482     *
483     * @param bool $contiguous
484     *
485     * @return Csv
486     */
487    public function setContiguous($contiguous)
488    {
489        $this->contiguous = (bool) $contiguous;
490        if (!$contiguous) {
491            $this->contiguousRow = -1;
492        }
493
494        return $this;
495    }
496
497    /**
498     * Get Contiguous.
499     *
500     * @return bool
501     */
502    public function getContiguous()
503    {
504        return $this->contiguous;
505    }
506
507    /**
508     * Set escape backslashes.
509     *
510     * @param string $escapeCharacter
511     *
512     * @return $this
513     */
514    public function setEscapeCharacter($escapeCharacter)
515    {
516        $this->escapeCharacter = $escapeCharacter;
517
518        return $this;
519    }
520
521    /**
522     * Get escape backslashes.
523     *
524     * @return string
525     */
526    public function getEscapeCharacter()
527    {
528        return $this->escapeCharacter;
529    }
530
531    /**
532     * Can the current IReader read the file?
533     *
534     * @param string $pFilename
535     *
536     * @return bool
537     */
538    public function canRead($pFilename)
539    {
540        // Check if file exists
541        try {
542            $this->openFile($pFilename);
543        } catch (Exception $e) {
544            return false;
545        }
546
547        fclose($this->fileHandle);
548
549        // Trust file extension if any
550        if (strtolower(pathinfo($pFilename, PATHINFO_EXTENSION)) === 'csv') {
551            return true;
552        }
553
554        // Attempt to guess mimetype
555        $type = mime_content_type($pFilename);
556        $supportedTypes = [
557            'text/csv',
558            'text/plain',
559            'inode/x-empty',
560        ];
561
562        return in_array($type, $supportedTypes, true);
563    }
564}
565