1<?php 2 3namespace PhpOffice\PhpSpreadsheet\Reader; 4 5use DateTime; 6use DateTimeZone; 7use PhpOffice\PhpSpreadsheet\Cell\AddressHelper; 8use PhpOffice\PhpSpreadsheet\Cell\Coordinate; 9use PhpOffice\PhpSpreadsheet\Cell\DataType; 10use PhpOffice\PhpSpreadsheet\DefinedName; 11use PhpOffice\PhpSpreadsheet\Reader\Security\XmlScanner; 12use PhpOffice\PhpSpreadsheet\Reader\Xml\PageSettings; 13use PhpOffice\PhpSpreadsheet\Reader\Xml\Properties; 14use PhpOffice\PhpSpreadsheet\Reader\Xml\Style; 15use PhpOffice\PhpSpreadsheet\RichText\RichText; 16use PhpOffice\PhpSpreadsheet\Settings; 17use PhpOffice\PhpSpreadsheet\Shared\Date; 18use PhpOffice\PhpSpreadsheet\Shared\File; 19use PhpOffice\PhpSpreadsheet\Shared\StringHelper; 20use PhpOffice\PhpSpreadsheet\Spreadsheet; 21use SimpleXMLElement; 22 23/** 24 * Reader for SpreadsheetML, the XML schema for Microsoft Office Excel 2003. 25 */ 26class Xml extends BaseReader 27{ 28 /** 29 * Formats. 30 * 31 * @var array 32 */ 33 protected $styles = []; 34 35 /** 36 * Create a new Excel2003XML Reader instance. 37 */ 38 public function __construct() 39 { 40 parent::__construct(); 41 $this->securityScanner = XmlScanner::getInstance($this); 42 } 43 44 private $fileContents = ''; 45 46 public static function xmlMappings(): array 47 { 48 return array_merge( 49 Style\Fill::FILL_MAPPINGS, 50 Style\Border::BORDER_MAPPINGS 51 ); 52 } 53 54 /** 55 * Can the current IReader read the file? 56 * 57 * @param string $pFilename 58 * 59 * @return bool 60 */ 61 public function canRead($pFilename) 62 { 63 // Office xmlns:o="urn:schemas-microsoft-com:office:office" 64 // Excel xmlns:x="urn:schemas-microsoft-com:office:excel" 65 // XML Spreadsheet xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" 66 // Spreadsheet component xmlns:c="urn:schemas-microsoft-com:office:component:spreadsheet" 67 // XML schema xmlns:s="uuid:BDC6E3F0-6DA3-11d1-A2A3-00AA00C14882" 68 // XML data type xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" 69 // MS-persist recordset xmlns:rs="urn:schemas-microsoft-com:rowset" 70 // Rowset xmlns:z="#RowsetSchema" 71 // 72 73 $signature = [ 74 '<?xml version="1.0"', 75 'xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet', 76 ]; 77 78 // Open file 79 $data = file_get_contents($pFilename); 80 81 // Why? 82 //$data = str_replace("'", '"', $data); // fix headers with single quote 83 84 $valid = true; 85 foreach ($signature as $match) { 86 // every part of the signature must be present 87 if (strpos($data, $match) === false) { 88 $valid = false; 89 90 break; 91 } 92 } 93 94 // Retrieve charset encoding 95 if (preg_match('/<?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $data, $matches)) { 96 $charSet = strtoupper($matches[1]); 97 if (1 == preg_match('/^ISO-8859-\d[\dL]?$/i', $charSet)) { 98 $data = StringHelper::convertEncoding($data, 'UTF-8', $charSet); 99 $data = preg_replace('/(<?xml.*encoding=[\'"]).*?([\'"].*?>)/um', '$1' . 'UTF-8' . '$2', $data, 1); 100 } 101 } 102 $this->fileContents = $data; 103 104 return $valid; 105 } 106 107 /** 108 * Check if the file is a valid SimpleXML. 109 * 110 * @param string $pFilename 111 * 112 * @return false|SimpleXMLElement 113 */ 114 public function trySimpleXMLLoadString($pFilename) 115 { 116 try { 117 $xml = simplexml_load_string( 118 $this->securityScanner->scan($this->fileContents ?: file_get_contents($pFilename)), 119 'SimpleXMLElement', 120 Settings::getLibXmlLoaderOptions() 121 ); 122 } catch (\Exception $e) { 123 throw new Exception('Cannot load invalid XML file: ' . $pFilename, 0, $e); 124 } 125 $this->fileContents = ''; 126 127 return $xml; 128 } 129 130 /** 131 * Reads names of the worksheets from a file, without parsing the whole file to a Spreadsheet object. 132 * 133 * @param string $filename 134 * 135 * @return array 136 */ 137 public function listWorksheetNames($filename) 138 { 139 File::assertFile($filename); 140 if (!$this->canRead($filename)) { 141 throw new Exception($filename . ' is an Invalid Spreadsheet file.'); 142 } 143 144 $worksheetNames = []; 145 146 $xml = $this->trySimpleXMLLoadString($filename); 147 if ($xml === false) { 148 throw new Exception("Problem reading {$filename}"); 149 } 150 151 $namespaces = $xml->getNamespaces(true); 152 153 $xml_ss = $xml->children($namespaces['ss']); 154 foreach ($xml_ss->Worksheet as $worksheet) { 155 $worksheet_ss = self::getAttributes($worksheet, $namespaces['ss']); 156 $worksheetNames[] = (string) $worksheet_ss['Name']; 157 } 158 159 return $worksheetNames; 160 } 161 162 /** 163 * Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns). 164 * 165 * @param string $filename 166 * 167 * @return array 168 */ 169 public function listWorksheetInfo($filename) 170 { 171 File::assertFile($filename); 172 if (!$this->canRead($filename)) { 173 throw new Exception($filename . ' is an Invalid Spreadsheet file.'); 174 } 175 176 $worksheetInfo = []; 177 178 $xml = $this->trySimpleXMLLoadString($filename); 179 if ($xml === false) { 180 throw new Exception("Problem reading {$filename}"); 181 } 182 183 $namespaces = $xml->getNamespaces(true); 184 185 $worksheetID = 1; 186 $xml_ss = $xml->children($namespaces['ss']); 187 foreach ($xml_ss->Worksheet as $worksheet) { 188 $worksheet_ss = self::getAttributes($worksheet, $namespaces['ss']); 189 190 $tmpInfo = []; 191 $tmpInfo['worksheetName'] = ''; 192 $tmpInfo['lastColumnLetter'] = 'A'; 193 $tmpInfo['lastColumnIndex'] = 0; 194 $tmpInfo['totalRows'] = 0; 195 $tmpInfo['totalColumns'] = 0; 196 197 $tmpInfo['worksheetName'] = "Worksheet_{$worksheetID}"; 198 if (isset($worksheet_ss['Name'])) { 199 $tmpInfo['worksheetName'] = (string) $worksheet_ss['Name']; 200 } 201 202 if (isset($worksheet->Table->Row)) { 203 $rowIndex = 0; 204 205 foreach ($worksheet->Table->Row as $rowData) { 206 $columnIndex = 0; 207 $rowHasData = false; 208 209 foreach ($rowData->Cell as $cell) { 210 if (isset($cell->Data)) { 211 $tmpInfo['lastColumnIndex'] = max($tmpInfo['lastColumnIndex'], $columnIndex); 212 $rowHasData = true; 213 } 214 215 ++$columnIndex; 216 } 217 218 ++$rowIndex; 219 220 if ($rowHasData) { 221 $tmpInfo['totalRows'] = max($tmpInfo['totalRows'], $rowIndex); 222 } 223 } 224 } 225 226 $tmpInfo['lastColumnLetter'] = Coordinate::stringFromColumnIndex($tmpInfo['lastColumnIndex'] + 1); 227 $tmpInfo['totalColumns'] = $tmpInfo['lastColumnIndex'] + 1; 228 229 $worksheetInfo[] = $tmpInfo; 230 ++$worksheetID; 231 } 232 233 return $worksheetInfo; 234 } 235 236 /** 237 * Loads Spreadsheet from file. 238 * 239 * @param string $filename 240 * 241 * @return Spreadsheet 242 */ 243 public function load($filename) 244 { 245 // Create new Spreadsheet 246 $spreadsheet = new Spreadsheet(); 247 $spreadsheet->removeSheetByIndex(0); 248 249 // Load into this instance 250 return $this->loadIntoExisting($filename, $spreadsheet); 251 } 252 253 /** 254 * Loads from file into Spreadsheet instance. 255 * 256 * @param string $filename 257 * 258 * @return Spreadsheet 259 */ 260 public function loadIntoExisting($filename, Spreadsheet $spreadsheet) 261 { 262 File::assertFile($filename); 263 if (!$this->canRead($filename)) { 264 throw new Exception($filename . ' is an Invalid Spreadsheet file.'); 265 } 266 267 $xml = $this->trySimpleXMLLoadString($filename); 268 if ($xml === false) { 269 throw new Exception("Problem reading {$filename}"); 270 } 271 272 $namespaces = $xml->getNamespaces(true); 273 274 (new Properties($spreadsheet))->readProperties($xml, $namespaces); 275 276 $this->styles = (new Style())->parseStyles($xml, $namespaces); 277 278 $worksheetID = 0; 279 $xml_ss = $xml->children($namespaces['ss']); 280 281 /** @var null|SimpleXMLElement $worksheetx */ 282 foreach ($xml_ss->Worksheet as $worksheetx) { 283 $worksheet = $worksheetx ?? new SimpleXMLElement('<xml></xml>'); 284 $worksheet_ss = self::getAttributes($worksheet, $namespaces['ss']); 285 286 if ( 287 isset($this->loadSheetsOnly, $worksheet_ss['Name']) && 288 (!in_array($worksheet_ss['Name'], $this->loadSheetsOnly)) 289 ) { 290 continue; 291 } 292 293 // Create new Worksheet 294 $spreadsheet->createSheet(); 295 $spreadsheet->setActiveSheetIndex($worksheetID); 296 $worksheetName = ''; 297 if (isset($worksheet_ss['Name'])) { 298 $worksheetName = (string) $worksheet_ss['Name']; 299 // Use false for $updateFormulaCellReferences to prevent adjustment of worksheet references in 300 // formula cells... during the load, all formulae should be correct, and we're simply bringing 301 // the worksheet name in line with the formula, not the reverse 302 $spreadsheet->getActiveSheet()->setTitle($worksheetName, false, false); 303 } 304 305 // locally scoped defined names 306 if (isset($worksheet->Names[0])) { 307 foreach ($worksheet->Names[0] as $definedName) { 308 $definedName_ss = self::getAttributes($definedName, $namespaces['ss']); 309 $name = (string) $definedName_ss['Name']; 310 $definedValue = (string) $definedName_ss['RefersTo']; 311 $convertedValue = AddressHelper::convertFormulaToA1($definedValue); 312 if ($convertedValue[0] === '=') { 313 $convertedValue = substr($convertedValue, 1); 314 } 315 $spreadsheet->addDefinedName(DefinedName::createInstance($name, $spreadsheet->getActiveSheet(), $convertedValue, true)); 316 } 317 } 318 319 $columnID = 'A'; 320 if (isset($worksheet->Table->Column)) { 321 foreach ($worksheet->Table->Column as $columnData) { 322 $columnData_ss = self::getAttributes($columnData, $namespaces['ss']); 323 if (isset($columnData_ss['Index'])) { 324 $columnID = Coordinate::stringFromColumnIndex((int) $columnData_ss['Index']); 325 } 326 if (isset($columnData_ss['Width'])) { 327 $columnWidth = $columnData_ss['Width']; 328 $spreadsheet->getActiveSheet()->getColumnDimension($columnID)->setWidth($columnWidth / 5.4); 329 } 330 ++$columnID; 331 } 332 } 333 334 $rowID = 1; 335 if (isset($worksheet->Table->Row)) { 336 $additionalMergedCells = 0; 337 foreach ($worksheet->Table->Row as $rowData) { 338 $rowHasData = false; 339 $row_ss = self::getAttributes($rowData, $namespaces['ss']); 340 if (isset($row_ss['Index'])) { 341 $rowID = (int) $row_ss['Index']; 342 } 343 344 $columnID = 'A'; 345 foreach ($rowData->Cell as $cell) { 346 $cell_ss = self::getAttributes($cell, $namespaces['ss']); 347 if (isset($cell_ss['Index'])) { 348 $columnID = Coordinate::stringFromColumnIndex((int) $cell_ss['Index']); 349 } 350 $cellRange = $columnID . $rowID; 351 352 if ($this->getReadFilter() !== null) { 353 if (!$this->getReadFilter()->readCell($columnID, $rowID, $worksheetName)) { 354 ++$columnID; 355 356 continue; 357 } 358 } 359 360 if (isset($cell_ss['HRef'])) { 361 $spreadsheet->getActiveSheet()->getCell($cellRange)->getHyperlink()->setUrl((string) $cell_ss['HRef']); 362 } 363 364 if ((isset($cell_ss['MergeAcross'])) || (isset($cell_ss['MergeDown']))) { 365 $columnTo = $columnID; 366 if (isset($cell_ss['MergeAcross'])) { 367 $additionalMergedCells += (int) $cell_ss['MergeAcross']; 368 $columnTo = Coordinate::stringFromColumnIndex(Coordinate::columnIndexFromString($columnID) + $cell_ss['MergeAcross']); 369 } 370 $rowTo = $rowID; 371 if (isset($cell_ss['MergeDown'])) { 372 $rowTo = $rowTo + $cell_ss['MergeDown']; 373 } 374 $cellRange .= ':' . $columnTo . $rowTo; 375 $spreadsheet->getActiveSheet()->mergeCells($cellRange); 376 } 377 378 $hasCalculatedValue = false; 379 $cellDataFormula = ''; 380 if (isset($cell_ss['Formula'])) { 381 $cellDataFormula = $cell_ss['Formula']; 382 $hasCalculatedValue = true; 383 } 384 if (isset($cell->Data)) { 385 $cellData = $cell->Data; 386 $cellValue = (string) $cellData; 387 $type = DataType::TYPE_NULL; 388 $cellData_ss = self::getAttributes($cellData, $namespaces['ss']); 389 if (isset($cellData_ss['Type'])) { 390 $cellDataType = $cellData_ss['Type']; 391 switch ($cellDataType) { 392 /* 393 const TYPE_STRING = 's'; 394 const TYPE_FORMULA = 'f'; 395 const TYPE_NUMERIC = 'n'; 396 const TYPE_BOOL = 'b'; 397 const TYPE_NULL = 'null'; 398 const TYPE_INLINE = 'inlineStr'; 399 const TYPE_ERROR = 'e'; 400 */ 401 case 'String': 402 $type = DataType::TYPE_STRING; 403 404 break; 405 case 'Number': 406 $type = DataType::TYPE_NUMERIC; 407 $cellValue = (float) $cellValue; 408 if (floor($cellValue) == $cellValue) { 409 $cellValue = (int) $cellValue; 410 } 411 412 break; 413 case 'Boolean': 414 $type = DataType::TYPE_BOOL; 415 $cellValue = ($cellValue != 0); 416 417 break; 418 case 'DateTime': 419 $type = DataType::TYPE_NUMERIC; 420 $dateTime = new DateTime($cellValue, new DateTimeZone('UTC')); 421 $cellValue = Date::PHPToExcel($dateTime); 422 423 break; 424 case 'Error': 425 $type = DataType::TYPE_ERROR; 426 $hasCalculatedValue = false; 427 428 break; 429 } 430 } 431 432 if ($hasCalculatedValue) { 433 $type = DataType::TYPE_FORMULA; 434 $columnNumber = Coordinate::columnIndexFromString($columnID); 435 $cellDataFormula = AddressHelper::convertFormulaToA1($cellDataFormula, $rowID, $columnNumber); 436 } 437 438 $spreadsheet->getActiveSheet()->getCell($columnID . $rowID)->setValueExplicit((($hasCalculatedValue) ? $cellDataFormula : $cellValue), $type); 439 if ($hasCalculatedValue) { 440 $spreadsheet->getActiveSheet()->getCell($columnID . $rowID)->setCalculatedValue($cellValue); 441 } 442 $rowHasData = true; 443 } 444 445 if (isset($cell->Comment)) { 446 $this->parseCellComment($cell->Comment, $namespaces, $spreadsheet, $columnID, $rowID); 447 } 448 449 if (isset($cell_ss['StyleID'])) { 450 $style = (string) $cell_ss['StyleID']; 451 if ((isset($this->styles[$style])) && (!empty($this->styles[$style]))) { 452 //if (!$spreadsheet->getActiveSheet()->cellExists($columnID . $rowID)) { 453 // $spreadsheet->getActiveSheet()->getCell($columnID . $rowID)->setValue(null); 454 //} 455 $spreadsheet->getActiveSheet()->getStyle($cellRange) 456 ->applyFromArray($this->styles[$style]); 457 } 458 } 459 ++$columnID; 460 while ($additionalMergedCells > 0) { 461 ++$columnID; 462 --$additionalMergedCells; 463 } 464 } 465 466 if ($rowHasData) { 467 if (isset($row_ss['Height'])) { 468 $rowHeight = $row_ss['Height']; 469 $spreadsheet->getActiveSheet()->getRowDimension($rowID)->setRowHeight((float) $rowHeight); 470 } 471 } 472 473 ++$rowID; 474 } 475 476 if (isset($namespaces['x'])) { 477 $xmlX = $worksheet->children($namespaces['x']); 478 if (isset($xmlX->WorksheetOptions)) { 479 (new PageSettings($xmlX, $namespaces))->loadPageSettings($spreadsheet); 480 } 481 } 482 } 483 ++$worksheetID; 484 } 485 486 // Globally scoped defined names 487 $activeWorksheet = $spreadsheet->setActiveSheetIndex(0); 488 if (isset($xml->Names[0])) { 489 foreach ($xml->Names[0] as $definedName) { 490 $definedName_ss = self::getAttributes($definedName, $namespaces['ss']); 491 $name = (string) $definedName_ss['Name']; 492 $definedValue = (string) $definedName_ss['RefersTo']; 493 $convertedValue = AddressHelper::convertFormulaToA1($definedValue); 494 if ($convertedValue[0] === '=') { 495 $convertedValue = substr($convertedValue, 1); 496 } 497 $spreadsheet->addDefinedName(DefinedName::createInstance($name, $activeWorksheet, $convertedValue)); 498 } 499 } 500 501 // Return 502 return $spreadsheet; 503 } 504 505 protected function parseCellComment( 506 SimpleXMLElement $comment, 507 array $namespaces, 508 Spreadsheet $spreadsheet, 509 string $columnID, 510 int $rowID 511 ): void { 512 $commentAttributes = $comment->attributes($namespaces['ss']); 513 $author = 'unknown'; 514 if (isset($commentAttributes->Author)) { 515 $author = (string) $commentAttributes->Author; 516 } 517 518 $node = $comment->Data->asXML(); 519 $annotation = strip_tags((string) $node); 520 $spreadsheet->getActiveSheet()->getComment($columnID . $rowID) 521 ->setAuthor($author) 522 ->setText($this->parseRichText($annotation)); 523 } 524 525 protected function parseRichText(string $annotation): RichText 526 { 527 $value = new RichText(); 528 529 $value->createText($annotation); 530 531 return $value; 532 } 533 534 private static function getAttributes(?SimpleXMLElement $simple, string $node): SimpleXMLElement 535 { 536 return ($simple === null) 537 ? new SimpleXMLElement('<xml></xml>') 538 : ($simple->attributes($node) ?? new SimpleXMLElement('<xml></xml>')); 539 } 540} 541