1<?php
2/**
3 * Copyright 2016-2017 Horde LLC (http://www.horde.org/)
4 *
5 * See the enclosed file COPYING for license information (LGPL). If you
6 * did not receive this file, see http://www.horde.org/licenses/lgpl21.
7 *
8 * @author   Jan Schneider <jan@horde.org>
9 * @category Horde
10 * @license  http://www.horde.org/licenses/lgpl21 LGPL 2.1
11 * @package  Text_Filter
12 */
13
14/**
15 * Takes HTML and removes any MS Office formatting quirks.
16 *
17 * @author   Jan Schneider <jan@horde.org>
18 * @category Horde
19 * @license  http://www.horde.org/licenses/lgpl21 LGPL 2.1
20 * @package  Text_Filter
21 */
class Horde_Text_Filter_Msoffice extends Horde_Text_Filter_Base
{
    /**
     * Filter parameters.
     *
     * - charset: (string) Character set passed to Horde_Domhtml when parsing
     *            the text and when serializing the result.
     *            DEFAULT: 'UTF-8'
     *
     * @var array
     */
    protected $_params = array(
        'charset' => 'UTF-8',
    );

    /**
     * Executes any code necessary after applying the filter patterns.
     *
     * Strips empty "<o:p>&nbsp;</o:p>" markers and rewrites every <p>
     * element whose class attribute contains "MsoNormal":
     * - If other class names remain after removing "MsoNormal", the paragraph
     *   is replaced by a <div> carrying the remaining class names and all of
     *   the paragraph's child nodes.
     * - Otherwise, if the paragraph has non-whitespace text content, it is
     *   replaced by its child nodes followed by a <br> element.
     * - Otherwise the paragraph is removed without replacement.
     *
     * @param string $text  The text after the filtering.
     *
     * @return string  The modified text. If the text cannot be loaded into a
     *                 DOM document, the text is returned with only the
     *                 "<o:p>" markers stripped.
     */
    public function postProcess($text)
    {
        // We cannot find those elements via DOM because HTML doesn't know
        // about namespaces.
        $text = str_replace('<o:p>&nbsp;</o:p>', '', $text);

        try {
            $dom = new Horde_Domhtml($text, $this->_params['charset']);
        } catch (Exception $e) {
            // Best effort: without a DOM we cannot rewrite the paragraphs.
            return $text;
        }

        // Replace all <p> elements of class "MsoNormal": with <div> elements
        // if they carry additional classes, otherwise with their contents
        // followed by a <br> element. Paragraphs without any text are dropped.
        foreach ($dom as $child) {
            if (!($child instanceof DOMElement) ||
                Horde_String::lower($child->tagName) != 'p' ||
                !($css = $child->getAttribute('class')) ||
                strpos($css, 'MsoNormal') === false) {
                continue;
            }

            $css = trim(str_replace('MsoNormal', '', $css));
            if (strlen($css)) {
                $div = $dom->dom->createElement('div');
                $div->setAttribute('class', $css);
                // Drain the paragraph via firstChild. Iterating over the live
                // childNodes list while moving nodes out of it would leave
                // children behind, which would then be deleted together with
                // the paragraph below.
                while ($child->firstChild) {
                    $div->appendChild($child->removeChild($child->firstChild));
                }
                $child->parentNode->insertBefore($div, $child);
            } elseif (strlen(preg_replace('/^\s*(.*)\s*$/u', '$1', $child->textContent))) {
                // Hoist the contents in front of the paragraph, preserving
                // their order, and terminate them with a line break.
                while ($child->firstChild) {
                    $tomove = $child->removeChild($child->firstChild);
                    $child->parentNode->insertBefore($tomove, $child);
                }
                $child->parentNode->insertBefore(
                    $dom->dom->createElement('br'), $child
                );
            }

            // The paragraph is empty now (or never had any text); drop it.
            $child->parentNode->removeChild($child);
        }

        return $dom->returnHtml(array('charset' => $this->_params['charset']));
    }
}