1<?php
2
3namespace PhpOffice\PhpSpreadsheet\Reader;
4
5use DateTime;
6use DateTimeZone;
7use PhpOffice\PhpSpreadsheet\Cell\AddressHelper;
8use PhpOffice\PhpSpreadsheet\Cell\Coordinate;
9use PhpOffice\PhpSpreadsheet\Cell\DataType;
10use PhpOffice\PhpSpreadsheet\DefinedName;
11use PhpOffice\PhpSpreadsheet\Reader\Security\XmlScanner;
12use PhpOffice\PhpSpreadsheet\Reader\Xml\PageSettings;
13use PhpOffice\PhpSpreadsheet\Reader\Xml\Properties;
14use PhpOffice\PhpSpreadsheet\Reader\Xml\Style;
15use PhpOffice\PhpSpreadsheet\RichText\RichText;
16use PhpOffice\PhpSpreadsheet\Settings;
17use PhpOffice\PhpSpreadsheet\Shared\Date;
18use PhpOffice\PhpSpreadsheet\Shared\File;
19use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
20use PhpOffice\PhpSpreadsheet\Spreadsheet;
21use SimpleXMLElement;
22
23/**
24 * Reader for SpreadsheetML, the XML schema for Microsoft Office Excel 2003.
25 */
26class Xml extends BaseReader
27{
28    /**
29     * Formats.
30     *
31     * @var array
32     */
33    protected $styles = [];
34
35    /**
36     * Create a new Excel2003XML Reader instance.
37     */
38    public function __construct()
39    {
40        parent::__construct();
41        $this->securityScanner = XmlScanner::getInstance($this);
42    }
43
44    private $fileContents = '';
45
46    public static function xmlMappings(): array
47    {
48        return array_merge(
49            Style\Fill::FILL_MAPPINGS,
50            Style\Border::BORDER_MAPPINGS
51        );
52    }
53
54    /**
55     * Can the current IReader read the file?
56     *
57     * @param string $pFilename
58     *
59     * @return bool
60     */
61    public function canRead($pFilename)
62    {
63        //    Office                    xmlns:o="urn:schemas-microsoft-com:office:office"
64        //    Excel                    xmlns:x="urn:schemas-microsoft-com:office:excel"
65        //    XML Spreadsheet            xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet"
66        //    Spreadsheet component    xmlns:c="urn:schemas-microsoft-com:office:component:spreadsheet"
67        //    XML schema                 xmlns:s="uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882"
68        //    XML data type            xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882"
69        //    MS-persist recordset    xmlns:rs="urn:schemas-microsoft-com:rowset"
70        //    Rowset                    xmlns:z="#RowsetSchema"
71        //
72
73        $signature = [
74            '<?xml version="1.0"',
75            'xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet',
76        ];
77
78        // Open file
79        $data = file_get_contents($pFilename);
80
81        // Why?
82        //$data = str_replace("'", '"', $data); // fix headers with single quote
83
84        $valid = true;
85        foreach ($signature as $match) {
86            // every part of the signature must be present
87            if (strpos($data, $match) === false) {
88                $valid = false;
89
90                break;
91            }
92        }
93
94        //    Retrieve charset encoding
95        if (preg_match('/<?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $data, $matches)) {
96            $charSet = strtoupper($matches[1]);
97            if (1 == preg_match('/^ISO-8859-\d[\dL]?$/i', $charSet)) {
98                $data = StringHelper::convertEncoding($data, 'UTF-8', $charSet);
99                $data = preg_replace('/(<?xml.*encoding=[\'"]).*?([\'"].*?>)/um', '$1' . 'UTF-8' . '$2', $data, 1);
100            }
101        }
102        $this->fileContents = $data;
103
104        return $valid;
105    }
106
107    /**
108     * Check if the file is a valid SimpleXML.
109     *
110     * @param string $pFilename
111     *
112     * @return false|SimpleXMLElement
113     */
114    public function trySimpleXMLLoadString($pFilename)
115    {
116        try {
117            $xml = simplexml_load_string(
118                $this->securityScanner->scan($this->fileContents ?: file_get_contents($pFilename)),
119                'SimpleXMLElement',
120                Settings::getLibXmlLoaderOptions()
121            );
122        } catch (\Exception $e) {
123            throw new Exception('Cannot load invalid XML file: ' . $pFilename, 0, $e);
124        }
125        $this->fileContents = '';
126
127        return $xml;
128    }
129
130    /**
131     * Reads names of the worksheets from a file, without parsing the whole file to a Spreadsheet object.
132     *
133     * @param string $filename
134     *
135     * @return array
136     */
137    public function listWorksheetNames($filename)
138    {
139        File::assertFile($filename);
140        if (!$this->canRead($filename)) {
141            throw new Exception($filename . ' is an Invalid Spreadsheet file.');
142        }
143
144        $worksheetNames = [];
145
146        $xml = $this->trySimpleXMLLoadString($filename);
147        if ($xml === false) {
148            throw new Exception("Problem reading {$filename}");
149        }
150
151        $namespaces = $xml->getNamespaces(true);
152
153        $xml_ss = $xml->children($namespaces['ss']);
154        foreach ($xml_ss->Worksheet as $worksheet) {
155            $worksheet_ss = self::getAttributes($worksheet, $namespaces['ss']);
156            $worksheetNames[] = (string) $worksheet_ss['Name'];
157        }
158
159        return $worksheetNames;
160    }
161
162    /**
163     * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
164     *
165     * @param string $filename
166     *
167     * @return array
168     */
169    public function listWorksheetInfo($filename)
170    {
171        File::assertFile($filename);
172        if (!$this->canRead($filename)) {
173            throw new Exception($filename . ' is an Invalid Spreadsheet file.');
174        }
175
176        $worksheetInfo = [];
177
178        $xml = $this->trySimpleXMLLoadString($filename);
179        if ($xml === false) {
180            throw new Exception("Problem reading {$filename}");
181        }
182
183        $namespaces = $xml->getNamespaces(true);
184
185        $worksheetID = 1;
186        $xml_ss = $xml->children($namespaces['ss']);
187        foreach ($xml_ss->Worksheet as $worksheet) {
188            $worksheet_ss = self::getAttributes($worksheet, $namespaces['ss']);
189
190            $tmpInfo = [];
191            $tmpInfo['worksheetName'] = '';
192            $tmpInfo['lastColumnLetter'] = 'A';
193            $tmpInfo['lastColumnIndex'] = 0;
194            $tmpInfo['totalRows'] = 0;
195            $tmpInfo['totalColumns'] = 0;
196
197            $tmpInfo['worksheetName'] = "Worksheet_{$worksheetID}";
198            if (isset($worksheet_ss['Name'])) {
199                $tmpInfo['worksheetName'] = (string) $worksheet_ss['Name'];
200            }
201
202            if (isset($worksheet->Table->Row)) {
203                $rowIndex = 0;
204
205                foreach ($worksheet->Table->Row as $rowData) {
206                    $columnIndex = 0;
207                    $rowHasData = false;
208
209                    foreach ($rowData->Cell as $cell) {
210                        if (isset($cell->Data)) {
211                            $tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex);
212                            $rowHasData = true;
213                        }
214
215                        ++$columnIndex;
216                    }
217
218                    ++$rowIndex;
219
220                    if ($rowHasData) {
221                        $tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex);
222                    }
223                }
224            }
225
226            $tmpInfo['lastColumnLetter'] = Coordinate::stringFromColumnIndex($tmpInfo['lastColumnIndex'] + 1);
227            $tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1;
228
229            $worksheetInfo[] = $tmpInfo;
230            ++$worksheetID;
231        }
232
233        return $worksheetInfo;
234    }
235
236    /**
237     * Loads Spreadsheet from file.
238     *
239     * @param string $filename
240     *
241     * @return Spreadsheet
242     */
243    public function load($filename)
244    {
245        // Create new Spreadsheet
246        $spreadsheet = new Spreadsheet();
247        $spreadsheet->removeSheetByIndex(0);
248
249        // Load into this instance
250        return $this->loadIntoExisting($filename, $spreadsheet);
251    }
252
253    /**
254     * Loads from file into Spreadsheet instance.
255     *
256     * @param string $filename
257     *
258     * @return Spreadsheet
259     */
260    public function loadIntoExisting($filename, Spreadsheet $spreadsheet)
261    {
262        File::assertFile($filename);
263        if (!$this->canRead($filename)) {
264            throw new Exception($filename . ' is an Invalid Spreadsheet file.');
265        }
266
267        $xml = $this->trySimpleXMLLoadString($filename);
268        if ($xml === false) {
269            throw new Exception("Problem reading {$filename}");
270        }
271
272        $namespaces = $xml->getNamespaces(true);
273
274        (new Properties($spreadsheet))->readProperties($xml, $namespaces);
275
276        $this->styles = (new Style())->parseStyles($xml, $namespaces);
277
278        $worksheetID = 0;
279        $xml_ss = $xml->children($namespaces['ss']);
280
281        /** @var null|SimpleXMLElement $worksheetx */
282        foreach ($xml_ss->Worksheet as $worksheetx) {
283            $worksheet = $worksheetx ?? new SimpleXMLElement('<xml></xml>');
284            $worksheet_ss = self::getAttributes($worksheet, $namespaces['ss']);
285
286            if (
287                isset($this->loadSheetsOnly, $worksheet_ss['Name']) &&
288                (!in_array($worksheet_ss['Name'], $this->loadSheetsOnly))
289            ) {
290                continue;
291            }
292
293            // Create new Worksheet
294            $spreadsheet->createSheet();
295            $spreadsheet->setActiveSheetIndex($worksheetID);
296            $worksheetName = '';
297            if (isset($worksheet_ss['Name'])) {
298                $worksheetName = (string) $worksheet_ss['Name'];
299                //    Use false for $updateFormulaCellReferences to prevent adjustment of worksheet references in
300                //        formula cells... during the load, all formulae should be correct, and we're simply bringing
301                //        the worksheet name in line with the formula, not the reverse
302                $spreadsheet->getActiveSheet()->setTitle($worksheetName, false, false);
303            }
304
305            // locally scoped defined names
306            if (isset($worksheet->Names[0])) {
307                foreach ($worksheet->Names[0] as $definedName) {
308                    $definedName_ss = self::getAttributes($definedName, $namespaces['ss']);
309                    $name = (string) $definedName_ss['Name'];
310                    $definedValue = (string) $definedName_ss['RefersTo'];
311                    $convertedValue = AddressHelper::convertFormulaToA1($definedValue);
312                    if ($convertedValue[0] === '=') {
313                        $convertedValue = substr($convertedValue, 1);
314                    }
315                    $spreadsheet->addDefinedName(DefinedName::createInstance($name, $spreadsheet->getActiveSheet(), $convertedValue, true));
316                }
317            }
318
319            $columnID = 'A';
320            if (isset($worksheet->Table->Column)) {
321                foreach ($worksheet->Table->Column as $columnData) {
322                    $columnData_ss = self::getAttributes($columnData, $namespaces['ss']);
323                    if (isset($columnData_ss['Index'])) {
324                        $columnID = Coordinate::stringFromColumnIndex((int) $columnData_ss['Index']);
325                    }
326                    if (isset($columnData_ss['Width'])) {
327                        $columnWidth = $columnData_ss['Width'];
328                        $spreadsheet->getActiveSheet()->getColumnDimension($columnID)->setWidth($columnWidth / 5.4);
329                    }
330                    ++$columnID;
331                }
332            }
333
334            $rowID = 1;
335            if (isset($worksheet->Table->Row)) {
336                $additionalMergedCells = 0;
337                foreach ($worksheet->Table->Row as $rowData) {
338                    $rowHasData = false;
339                    $row_ss = self::getAttributes($rowData, $namespaces['ss']);
340                    if (isset($row_ss['Index'])) {
341                        $rowID = (int) $row_ss['Index'];
342                    }
343
344                    $columnID = 'A';
345                    foreach ($rowData->Cell as $cell) {
346                        $cell_ss = self::getAttributes($cell, $namespaces['ss']);
347                        if (isset($cell_ss['Index'])) {
348                            $columnID = Coordinate::stringFromColumnIndex((int) $cell_ss['Index']);
349                        }
350                        $cellRange = $columnID . $rowID;
351
352                        if ($this->getReadFilter() !== null) {
353                            if (!$this->getReadFilter()->readCell($columnID, $rowID, $worksheetName)) {
354                                ++$columnID;
355
356                                continue;
357                            }
358                        }
359
360                        if (isset($cell_ss['HRef'])) {
361                            $spreadsheet->getActiveSheet()->getCell($cellRange)->getHyperlink()->setUrl((string) $cell_ss['HRef']);
362                        }
363
364                        if ((isset($cell_ss['MergeAcross'])) || (isset($cell_ss['MergeDown']))) {
365                            $columnTo = $columnID;
366                            if (isset($cell_ss['MergeAcross'])) {
367                                $additionalMergedCells += (int) $cell_ss['MergeAcross'];
368                                $columnTo = Coordinate::stringFromColumnIndex(Coordinate::columnIndexFromString($columnID) + $cell_ss['MergeAcross']);
369                            }
370                            $rowTo = $rowID;
371                            if (isset($cell_ss['MergeDown'])) {
372                                $rowTo = $rowTo + $cell_ss['MergeDown'];
373                            }
374                            $cellRange .= ':' . $columnTo . $rowTo;
375                            $spreadsheet->getActiveSheet()->mergeCells($cellRange);
376                        }
377
378                        $hasCalculatedValue = false;
379                        $cellDataFormula = '';
380                        if (isset($cell_ss['Formula'])) {
381                            $cellDataFormula = $cell_ss['Formula'];
382                            $hasCalculatedValue = true;
383                        }
384                        if (isset($cell->Data)) {
385                            $cellData = $cell->Data;
386                            $cellValue = (string) $cellData;
387                            $type = DataType::TYPE_NULL;
388                            $cellData_ss = self::getAttributes($cellData, $namespaces['ss']);
389                            if (isset($cellData_ss['Type'])) {
390                                $cellDataType = $cellData_ss['Type'];
391                                switch ($cellDataType) {
392                                    /*
393                                    const TYPE_STRING        = 's';
394                                    const TYPE_FORMULA        = 'f';
395                                    const TYPE_NUMERIC        = 'n';
396                                    const TYPE_BOOL            = 'b';
397                                    const TYPE_NULL            = 'null';
398                                    const TYPE_INLINE        = 'inlineStr';
399                                    const TYPE_ERROR        = 'e';
400                                    */
401                                    case 'String':
402                                        $type = DataType::TYPE_STRING;
403
404                                        break;
405                                    case 'Number':
406                                        $type = DataType::TYPE_NUMERIC;
407                                        $cellValue = (float) $cellValue;
408                                        if (floor($cellValue) == $cellValue) {
409                                            $cellValue = (int) $cellValue;
410                                        }
411
412                                        break;
413                                    case 'Boolean':
414                                        $type = DataType::TYPE_BOOL;
415                                        $cellValue = ($cellValue != 0);
416
417                                        break;
418                                    case 'DateTime':
419                                        $type = DataType::TYPE_NUMERIC;
420                                        $dateTime = new DateTime($cellValue, new DateTimeZone('UTC'));
421                                        $cellValue = Date::PHPToExcel($dateTime);
422
423                                        break;
424                                    case 'Error':
425                                        $type = DataType::TYPE_ERROR;
426                                        $hasCalculatedValue = false;
427
428                                        break;
429                                }
430                            }
431
432                            if ($hasCalculatedValue) {
433                                $type = DataType::TYPE_FORMULA;
434                                $columnNumber = Coordinate::columnIndexFromString($columnID);
435                                $cellDataFormula = AddressHelper::convertFormulaToA1($cellDataFormula, $rowID, $columnNumber);
436                            }
437
438                            $spreadsheet->getActiveSheet()->getCell($columnID . $rowID)->setValueExplicit((($hasCalculatedValue) ? $cellDataFormula : $cellValue), $type);
439                            if ($hasCalculatedValue) {
440                                $spreadsheet->getActiveSheet()->getCell($columnID . $rowID)->setCalculatedValue($cellValue);
441                            }
442                            $rowHasData = true;
443                        }
444
445                        if (isset($cell->Comment)) {
446                            $this->parseCellComment($cell->Comment, $namespaces, $spreadsheet, $columnID, $rowID);
447                        }
448
449                        if (isset($cell_ss['StyleID'])) {
450                            $style = (string) $cell_ss['StyleID'];
451                            if ((isset($this->styles[$style])) && (!empty($this->styles[$style]))) {
452                                //if (!$spreadsheet->getActiveSheet()->cellExists($columnID . $rowID)) {
453                                //    $spreadsheet->getActiveSheet()->getCell($columnID . $rowID)->setValue(null);
454                                //}
455                                $spreadsheet->getActiveSheet()->getStyle($cellRange)
456                                    ->applyFromArray($this->styles[$style]);
457                            }
458                        }
459                        ++$columnID;
460                        while ($additionalMergedCells > 0) {
461                            ++$columnID;
462                            --$additionalMergedCells;
463                        }
464                    }
465
466                    if ($rowHasData) {
467                        if (isset($row_ss['Height'])) {
468                            $rowHeight = $row_ss['Height'];
469                            $spreadsheet->getActiveSheet()->getRowDimension($rowID)->setRowHeight((float) $rowHeight);
470                        }
471                    }
472
473                    ++$rowID;
474                }
475
476                if (isset($namespaces['x'])) {
477                    $xmlX = $worksheet->children($namespaces['x']);
478                    if (isset($xmlX->WorksheetOptions)) {
479                        (new PageSettings($xmlX, $namespaces))->loadPageSettings($spreadsheet);
480                    }
481                }
482            }
483            ++$worksheetID;
484        }
485
486        // Globally scoped defined names
487        $activeWorksheet = $spreadsheet->setActiveSheetIndex(0);
488        if (isset($xml->Names[0])) {
489            foreach ($xml->Names[0] as $definedName) {
490                $definedName_ss = self::getAttributes($definedName, $namespaces['ss']);
491                $name = (string) $definedName_ss['Name'];
492                $definedValue = (string) $definedName_ss['RefersTo'];
493                $convertedValue = AddressHelper::convertFormulaToA1($definedValue);
494                if ($convertedValue[0] === '=') {
495                    $convertedValue = substr($convertedValue, 1);
496                }
497                $spreadsheet->addDefinedName(DefinedName::createInstance($name, $activeWorksheet, $convertedValue));
498            }
499        }
500
501        // Return
502        return $spreadsheet;
503    }
504
505    protected function parseCellComment(
506        SimpleXMLElement $comment,
507        array $namespaces,
508        Spreadsheet $spreadsheet,
509        string $columnID,
510        int $rowID
511    ): void {
512        $commentAttributes = $comment->attributes($namespaces['ss']);
513        $author = 'unknown';
514        if (isset($commentAttributes->Author)) {
515            $author = (string) $commentAttributes->Author;
516        }
517
518        $node = $comment->Data->asXML();
519        $annotation = strip_tags((string) $node);
520        $spreadsheet->getActiveSheet()->getComment($columnID . $rowID)
521            ->setAuthor($author)
522            ->setText($this->parseRichText($annotation));
523    }
524
525    protected function parseRichText(string $annotation): RichText
526    {
527        $value = new RichText();
528
529        $value->createText($annotation);
530
531        return $value;
532    }
533
534    private static function getAttributes(?SimpleXMLElement $simple, string $node): SimpleXMLElement
535    {
536        return ($simple === null)
537            ? new SimpleXMLElement('<xml></xml>')
538            : ($simple->attributes($node) ?? new SimpleXMLElement('<xml></xml>'));
539    }
540}
541