1<?php
2
if (defined('PHPEXCEL_ROOT') === false) {
    /**
     * Library root: two directory levels above the directory of this file.
     * Only defined (and the autoloader only required) when nothing earlier
     * has defined the constant.
     *
     * @ignore
     */
    define('PHPEXCEL_ROOT', __DIR__ . '/../../');
    require PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php';
}
10
11/**
12 * PHPExcel_Reader_HTML
13 *
14 * Copyright (c) 2006 - 2015 PHPExcel
15 *
16 * This library is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU Lesser General Public
18 * License as published by the Free Software Foundation; either
19 * version 2.1 of the License, or (at your option) any later version.
20 *
21 * This library is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24 * Lesser General Public License for more details.
25 *
26 * You should have received a copy of the GNU Lesser General Public
27 * License along with this library; if not, write to the Free Software
28 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
29 *
30 * @category   PHPExcel
31 * @package    PHPExcel_Reader
32 * @copyright  Copyright (c) 2006 - 2015 PHPExcel (http://www.codeplex.com/PHPExcel)
33 * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt    LGPL
34 * @version    ##VERSION##, ##DATE##
35 */
/** Reader that loads an HTML document into a worksheet of a PHPExcel workbook */
37class PHPExcel_Reader_HTML extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader
38{
39
    /**
     * Input encoding
     *
     * Value stored by setInputEncoding() and returned by getInputEncoding().
     * NOTE(review): loadIntoExisting() passes a literal 'UTF-8' source encoding
     * to mb_convert_encoding() and does not read this property -- confirm
     * whether that is intended.
     *
     * @var string
     */
    protected $inputEncoding = 'ANSI';

    /**
     * Sheet index to read
     *
     * Index of the worksheet that loadIntoExisting() makes active and writes
     * to; sheets are created on the workbook until this index exists.
     *
     * @var int
     */
    protected $sheetIndex = 0;
53
54    /**
55     * Formats
56     *
57     * @var array
58     */
59    protected $formats = array(
60        'h1' => array(
61            'font' => array(
62                'bold' => true,
63                'size' => 24,
64            ),
65        ), //    Bold, 24pt
66        'h2' => array(
67            'font' => array(
68                'bold' => true,
69                'size' => 18,
70            ),
71        ), //    Bold, 18pt
72        'h3' => array(
73            'font' => array(
74                'bold' => true,
75                'size' => 13.5,
76            ),
77        ), //    Bold, 13.5pt
78        'h4' => array(
79            'font' => array(
80                'bold' => true,
81                'size' => 12,
82            ),
83        ), //    Bold, 12pt
84        'h5' => array(
85            'font' => array(
86                'bold' => true,
87                'size' => 10,
88            ),
89        ), //    Bold, 10pt
90        'h6' => array(
91            'font' => array(
92                'bold' => true,
93                'size' => 7.5,
94            ),
95        ), //    Bold, 7.5pt
96        'a' => array(
97            'font' => array(
98                'underline' => true,
99                'color' => array(
100                    'argb' => PHPExcel_Style_Color::COLOR_BLUE,
101                ),
102            ),
103        ), //    Blue underlined
104        'hr' => array(
105            'borders' => array(
106                'bottom' => array(
107                    'style' => PHPExcel_Style_Border::BORDER_THIN,
108                    'color' => array(
109                        PHPExcel_Style_Color::COLOR_BLACK,
110                    ),
111                ),
112            ),
113        ), //    Bottom border
114    );
115
    /**
     * Cell coordinates (column letters followed by the row number) covered by
     * a merged range that processDomElement() created for a rowspan attribute.
     * Each covered coordinate is stored as a key with the value true; when a
     * td/th is placed, the column is advanced while its coordinate is set here.
     *
     * @var array
     */
    protected $rowspan = array();
117
118    /**
119     * Create a new PHPExcel_Reader_HTML
120     */
121    public function __construct()
122    {
123        $this->readFilter = new PHPExcel_Reader_DefaultReadFilter();
124    }
125
126    /**
127     * Validate that the current file is an HTML file
128     *
129     * @return boolean
130     */
131    protected function isValidFormat()
132    {
133        //    Reading 2048 bytes should be enough to validate that the format is HTML
134        $data = fread($this->fileHandle, 2048);
135        if ((strpos($data, '<') !== false) &&
136                (strlen($data) !== strlen(strip_tags($data)))) {
137            return true;
138        }
139
140        return false;
141    }
142
143    /**
144     * Loads PHPExcel from file
145     *
146     * @param  string                    $pFilename
147     * @return PHPExcel
148     * @throws PHPExcel_Reader_Exception
149     */
150    public function load($pFilename)
151    {
152        // Create new PHPExcel
153        $objPHPExcel = new PHPExcel();
154
155        // Load into this instance
156        return $this->loadIntoExisting($pFilename, $objPHPExcel);
157    }
158
159    /**
160     * Set input encoding
161     *
162     * @param string $pValue Input encoding
163     */
164    public function setInputEncoding($pValue = 'ANSI')
165    {
166        $this->inputEncoding = $pValue;
167
168        return $this;
169    }
170
171    /**
172     * Get input encoding
173     *
174     * @return string
175     */
176    public function getInputEncoding()
177    {
178        return $this->inputEncoding;
179    }
180
    //    Data Array used for testing only, should write to PHPExcel object on completion of tests
    //    flushCell() records every value it handles here as $dataArray[$row][$column]
    protected $dataArray = array();
    //    Current table nesting depth: incremented by setTableStartColumn(),
    //    decremented by releaseTableStartColumn(), reset to 0 when <body> is entered
    protected $tableLevel = 0;
    //    Start column of each open table, indexed by $tableLevel; index 0 holds 'A'
    protected $nestedColumn = array('A');
185
186    protected function setTableStartColumn($column)
187    {
188        if ($this->tableLevel == 0) {
189            $column = 'A';
190        }
191        ++$this->tableLevel;
192        $this->nestedColumn[$this->tableLevel] = $column;
193
194        return $this->nestedColumn[$this->tableLevel];
195    }
196
197    protected function getTableStartColumn()
198    {
199        return $this->nestedColumn[$this->tableLevel];
200    }
201
202    protected function releaseTableStartColumn()
203    {
204        --$this->tableLevel;
205
206        return array_pop($this->nestedColumn);
207    }
208
209    protected function flushCell($sheet, $column, $row, &$cellContent)
210    {
211        if (is_string($cellContent)) {
212            //    Simple String content
213            if (trim($cellContent) > '') {
214                //    Only actually write it if there's content in the string
215//                echo 'FLUSH CELL: ' , $column , $row , ' => ' , $cellContent , '<br />';
216                //    Write to worksheet to be done here...
217                //    ... we return the cell so we can mess about with styles more easily
218                $sheet->setCellValue($column . $row, $cellContent, true);
219                $this->dataArray[$row][$column] = $cellContent;
220            }
221        } else {
222            //    We have a Rich Text run
223            //    TODO
224            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
225        }
226        $cellContent = (string) '';
227    }
228
229    protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format = null)
230    {
231        foreach ($element->childNodes as $child) {
232            if ($child instanceof DOMText) {
233                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
234                if (is_string($cellContent)) {
235                    //    simply append the text if the cell content is a plain text string
236                    $cellContent .= $domText;
237                } else {
238                    //    but if we have a rich text run instead, we need to append it correctly
239                    //    TODO
240                }
241            } elseif ($child instanceof DOMElement) {
242//                echo '<b>DOM ELEMENT: </b>' , strtoupper($child->nodeName) , '<br />';
243
244                $attributeArray = array();
245                foreach ($child->attributes as $attribute) {
246//                    echo '<b>ATTRIBUTE: </b>' , $attribute->name , ' => ' , $attribute->value , '<br />';
247                    $attributeArray[$attribute->name] = $attribute->value;
248                }
249
250                switch ($child->nodeName) {
251                    case 'meta':
252                        foreach ($attributeArray as $attributeName => $attributeValue) {
253                            switch ($attributeName) {
254                                case 'content':
255                                    //    TODO
256                                    //    Extract character set, so we can convert to UTF-8 if required
257                                    break;
258                            }
259                        }
260                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
261                        break;
262                    case 'title':
263                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
264                        $sheet->setTitle($cellContent);
265                        $cellContent = '';
266                        break;
267                    case 'span':
268                    case 'div':
269                    case 'font':
270                    case 'i':
271                    case 'em':
272                    case 'strong':
273                    case 'b':
274//                        echo 'STYLING, SPAN OR DIV<br />';
275                        if ($cellContent > '') {
276                            $cellContent .= ' ';
277                        }
278                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
279                        if ($cellContent > '') {
280                            $cellContent .= ' ';
281                        }
282//                        echo 'END OF STYLING, SPAN OR DIV<br />';
283                        break;
284                    case 'hr':
285                        $this->flushCell($sheet, $column, $row, $cellContent);
286                        ++$row;
287                        if (isset($this->formats[$child->nodeName])) {
288                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
289                        } else {
290                            $cellContent = '----------';
291                            $this->flushCell($sheet, $column, $row, $cellContent);
292                        }
293                        ++$row;
294                        // Add a break after a horizontal rule, simply by allowing the code to dropthru
295                    case 'br':
296                        if ($this->tableLevel > 0) {
297                            //    If we're inside a table, replace with a \n
298                            $cellContent .= "\n";
299                        } else {
300                            //    Otherwise flush our existing content and move the row cursor on
301                            $this->flushCell($sheet, $column, $row, $cellContent);
302                            ++$row;
303                        }
304//                        echo 'HARD LINE BREAK: ' , '<br />';
305                        break;
306                    case 'a':
307//                        echo 'START OF HYPERLINK: ' , '<br />';
308                        foreach ($attributeArray as $attributeName => $attributeValue) {
309                            switch ($attributeName) {
310                                case 'href':
311//                                    echo 'Link to ' , $attributeValue , '<br />';
312                                    $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
313                                    if (isset($this->formats[$child->nodeName])) {
314                                        $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
315                                    }
316                                    break;
317                            }
318                        }
319                        $cellContent .= ' ';
320                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
321//                        echo 'END OF HYPERLINK:' , '<br />';
322                        break;
323                    case 'h1':
324                    case 'h2':
325                    case 'h3':
326                    case 'h4':
327                    case 'h5':
328                    case 'h6':
329                    case 'ol':
330                    case 'ul':
331                    case 'p':
332                        if ($this->tableLevel > 0) {
333                            //    If we're inside a table, replace with a \n
334                            $cellContent .= "\n";
335//                            echo 'LIST ENTRY: ' , '<br />';
336                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
337//                            echo 'END OF LIST ENTRY:' , '<br />';
338                        } else {
339                            if ($cellContent > '') {
340                                $this->flushCell($sheet, $column, $row, $cellContent);
341                                $row++;
342                            }
343//                            echo 'START OF PARAGRAPH: ' , '<br />';
344                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
345//                            echo 'END OF PARAGRAPH:' , '<br />';
346                            $this->flushCell($sheet, $column, $row, $cellContent);
347
348                            if (isset($this->formats[$child->nodeName])) {
349                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
350                            }
351
352                            $row++;
353                            $column = 'A';
354                        }
355                        break;
356                    case 'li':
357                        if ($this->tableLevel > 0) {
358                            //    If we're inside a table, replace with a \n
359                            $cellContent .= "\n";
360//                            echo 'LIST ENTRY: ' , '<br />';
361                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
362//                            echo 'END OF LIST ENTRY:' , '<br />';
363                        } else {
364                            if ($cellContent > '') {
365                                $this->flushCell($sheet, $column, $row, $cellContent);
366                            }
367                            ++$row;
368//                            echo 'LIST ENTRY: ' , '<br />';
369                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
370//                            echo 'END OF LIST ENTRY:' , '<br />';
371                            $this->flushCell($sheet, $column, $row, $cellContent);
372                            $column = 'A';
373                        }
374                        break;
375                    case 'table':
376                        $this->flushCell($sheet, $column, $row, $cellContent);
377                        $column = $this->setTableStartColumn($column);
378//                        echo 'START OF TABLE LEVEL ' , $this->tableLevel , '<br />';
379                        if ($this->tableLevel > 1) {
380                            --$row;
381                        }
382                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
383//                        echo 'END OF TABLE LEVEL ' , $this->tableLevel , '<br />';
384                        $column = $this->releaseTableStartColumn();
385                        if ($this->tableLevel > 1) {
386                            ++$column;
387                        } else {
388                            ++$row;
389                        }
390                        break;
391                    case 'thead':
392                    case 'tbody':
393                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
394                        break;
395                    case 'tr':
396                        $column = $this->getTableStartColumn();
397                        $cellContent = '';
398//                        echo 'START OF TABLE ' , $this->tableLevel , ' ROW<br />';
399                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
400                        ++$row;
401//                        echo 'END OF TABLE ' , $this->tableLevel , ' ROW<br />';
402                        break;
403                    case 'th':
404                    case 'td':
405//                        echo 'START OF TABLE ' , $this->tableLevel , ' CELL<br />';
406                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
407//                        echo 'END OF TABLE ' , $this->tableLevel , ' CELL<br />';
408
409                        while (isset($this->rowspan[$column . $row])) {
410                            ++$column;
411                        }
412
413                        $this->flushCell($sheet, $column, $row, $cellContent);
414
415//                        if (isset($attributeArray['style']) && !empty($attributeArray['style'])) {
416//                            $styleAry = $this->getPhpExcelStyleArray($attributeArray['style']);
417//
418//                            if (!empty($styleAry)) {
419//                                $sheet->getStyle($column . $row)->applyFromArray($styleAry);
420//                            }
421//                        }
422
423                        if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) {
424                            //create merging rowspan and colspan
425                            $columnTo = $column;
426                            for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
427                                ++$columnTo;
428                            }
429                            $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
430                            foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
431                                $this->rowspan[$value] = true;
432                            }
433                            $sheet->mergeCells($range);
434                            $column = $columnTo;
435                        } elseif (isset($attributeArray['rowspan'])) {
436                            //create merging rowspan
437                            $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
438                            foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
439                                $this->rowspan[$value] = true;
440                            }
441                            $sheet->mergeCells($range);
442                        } elseif (isset($attributeArray['colspan'])) {
443                            //create merging colspan
444                            $columnTo = $column;
445                            for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
446                                ++$columnTo;
447                            }
448                            $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
449                            $column = $columnTo;
450                        }
451                        ++$column;
452                        break;
453                    case 'body':
454                        $row = 1;
455                        $column = 'A';
456                        $content = '';
457                        $this->tableLevel = 0;
458                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
459                        break;
460                    default:
461                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
462                }
463            }
464        }
465    }
466
467    /**
468     * Loads PHPExcel from file into PHPExcel instance
469     *
470     * @param  string                    $pFilename
471     * @param  PHPExcel                  $objPHPExcel
472     * @return PHPExcel
473     * @throws PHPExcel_Reader_Exception
474     */
475    public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
476    {
477        // Open file to validate
478        $this->openFile($pFilename);
479        if (!$this->isValidFormat()) {
480            fclose($this->fileHandle);
481            throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file.");
482        }
483        //    Close after validating
484        fclose($this->fileHandle);
485
486        // Create new PHPExcel
487        while ($objPHPExcel->getSheetCount() <= $this->sheetIndex) {
488            $objPHPExcel->createSheet();
489        }
490        $objPHPExcel->setActiveSheetIndex($this->sheetIndex);
491
492        //    Create a new DOM object
493        $dom = new domDocument;
494        //    Reload the HTML file into the DOM object
495        $loaded = $dom->loadHTML(mb_convert_encoding($this->securityScanFile($pFilename), 'HTML-ENTITIES', 'UTF-8'));
496        if ($loaded === false) {
497            throw new PHPExcel_Reader_Exception('Failed to load ' . $pFilename . ' as a DOM Document');
498        }
499
500        //    Discard white space
501        $dom->preserveWhiteSpace = false;
502
503        $row = 0;
504        $column = 'A';
505        $content = '';
506        $this->processDomElement($dom, $objPHPExcel->getActiveSheet(), $row, $column, $content);
507
508        // Return
509        return $objPHPExcel;
510    }
511
512    /**
513     * Get sheet index
514     *
515     * @return int
516     */
517    public function getSheetIndex()
518    {
519        return $this->sheetIndex;
520    }
521
522    /**
523     * Set sheet index
524     *
525     * @param  int                  $pValue Sheet index
526     * @return PHPExcel_Reader_HTML
527     */
528    public function setSheetIndex($pValue = 0)
529    {
530        $this->sheetIndex = $pValue;
531
532        return $this;
533    }
534
535    /**
536     * Scan theXML for use of <!ENTITY to prevent XXE/XEE attacks
537     *
538     * @param     string         $xml
539     * @throws PHPExcel_Reader_Exception
540     */
541    public function securityScan($xml)
542    {
543        $pattern = '/\\0?' . implode('\\0?', str_split('<!ENTITY')) . '\\0?/';
544        if (preg_match($pattern, $xml)) {
545            throw new PHPExcel_Reader_Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
546        }
547        return $xml;
548    }
549}
550