1<?php
2/**
3 * Copyright 2002-2017 Horde LLC (http://www.horde.org/)
4 *
5 * See the enclosed file COPYING for license information (LGPL). If you
6 * did not receive this file, see http://www.horde.org/licenses/lgpl21.
7 *
8 * @author   Michael J Rubinsky <mrubinsk@horde.org>
9 * @category Horde
10 * @license  http://www.horde.org/licenses/lgpl21 LGPL 2.1
11 * @package  Compress
12 */
13
14/**
15 * Object to parse RTF data encapsulated in a TNEF file.
16 *
17 * @author    Michael J Rubinsky <mrubinsk@horde.org>
18 * @category  Horde
19 * @copyright 2002-2017 Horde LLC
20 * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
21 * @package   Compress
22 */
class Horde_Compress_Tnef_Rtf extends Horde_Compress_Tnef_Object
{
    /**
     * Magic value identifying RTF data stored without compression.
     */
    const UNCOMPRESSED = 0x414c454d;

    /**
     * Magic value identifying LZ-compressed RTF data.
     */
    const COMPRESSED   = 0x75465a4c;

    /**
     * RTF content.
     *
     * @var string
     */
    protected $_content = '';

    /**
     * Size of RTF content, as declared in the stream header.
     *
     * @var integer
     */
    protected $_size = 0;

    /**
     * MIME type.
     *
     * @var string
     */
    public $type = 'application/rtf';

    /**
     * Constructor. Hands both arguments to the parent constructor and then
     * immediately decodes the RTF payload.
     *
     * @param mixed $logger  Logger; this class calls debug() and notice()
     *                       on it.
     * @param string $data   The encapsulated RTF data, including its header.
     */
    public function __construct($logger, $data)
    {
        parent::__construct($logger, $data);
        $this->_decode();
    }

    /**
     * Read-only accessor for the decoded RTF.
     *
     * @param string $property  Only 'content' is supported.
     *
     * @return string  The decoded RTF document.
     * @throws InvalidArgumentException for any other property name.
     */
    public function __get($property)
    {
        if ($property === 'content') {
            return $this->_content;
        }

        throw new InvalidArgumentException('Invalid property access.');
    }

    /**
     * Output the data for this object in an array.
     *
     * @return array
     *   - type: (string)    The MIME type of the content.
     *   - subtype: (string) The MIME subtype.
     *   - name: (string)    The filename.
     *   - stream: (string)  The file data.
     */
    public function toArray()
    {
        return array(
            'type'    => 'application',
            'subtype' => 'rtf',
            'name'    => 'Untitled.rtf',
            'stream'  => $this->_content
        );
    }

    /**
     * Obtain a good-enough-for-our-needs plain text representation of
     * the RTF document.
     *
     * @return string The plaintext.
     */
    public function toPlain()
    {
        return $this->_rtf2text($this->_content);
    }

    /**
     * Read the four 32-bit header fields (compressed size, raw size, magic,
     * CRC) and populate $_content according to the magic value.
     *
     * NOTE(review): _geti() is defined in the parent class and is not visible
     * here; the code below relies on it consuming the bytes it reads from
     * $this->_data - confirm against Horde_Compress_Tnef_Object.
     */
    protected function _decode()
    {
        $c_size = $this->_geti($this->_data, 32);
        $this->_size = $this->_geti($this->_data, 32);
        $magic = $this->_geti($this->_data, 32);
        $crc = $this->_geti($this->_data, 32);

        $this->_logger->debug(sprintf(
            'TNEF: compressed size: %s, size: %s, magic: %s, CRC: %s',
            $c_size, $this->_size, $magic, $crc)
        );

        switch ($magic) {
        case self::COMPRESSED:
            $this->_decompress();
            break;
        case self::UNCOMPRESSED:
            $this->_content = $this->_data;
            break;
        default:
            $this->_logger->notice('TNEF: Unknown RTF compression.');
        }
    }

    /**
     * Decompress compressed RTF into $_content. Logic taken and adapted from
     * NasMail RTF plugin.
     *
     * The output buffer is seeded with a fixed dictionary ("preload") that
     * back references may point into; the preload is stripped again before
     * the result is stored. Decoding stops when the declared size has been
     * produced, when the input is exhausted, or when the stream contains the
     * end marker or an unusable reference, so malformed input can neither
     * read outside the buffers nor loop far beyond the available data.
     */
    protected function _decompress()
    {
        // The line break inside the dictionary is CR LF.
        $preload = "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript \\fdecor MS Sans SerifSymbolArialTimes New RomanCourier{\\colortbl\\red0\\green0\\blue0\r\n\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
        $length_preload = strlen($preload);
        $length_data = strlen($this->_data);
        $target = $this->_size + $length_preload;

        $uncomp = $preload;
        $out = $length_preload;
        $in = $flags = $flag_count = 0;

        while ($out < $target && $in < $length_data) {
            // One control byte precedes every run of eight tokens; its bits,
            // lowest first, say whether a token is a literal (0) or a
            // two-byte back reference (1).
            if (($flag_count++ % 8) == 0) {
                $flags = ord($this->_data[$in++]);
                if ($in >= $length_data) {
                    break;
                }
            } else {
                $flags = $flags >> 1;
            }

            if (($flags & 1) == 0) {
                $uncomp .= $this->_data[$in++];
                ++$out;
                continue;
            }

            if ($in + 1 >= $length_data) {
                $this->_logger->notice('TNEF: Truncated compressed RTF data.');
                break;
            }

            $high = ord($this->_data[$in++]);
            $low = ord($this->_data[$in++]);
            // 12 bits of dictionary offset, 4 bits of (length - 2).
            $offset = ($high << 4) | ($low >> 4);
            $length = ($low & 0xF) + 2;

            // A reference to the current write position marks the end of
            // the stream.
            if ($offset == ($out % 4096)) {
                break;
            }

            // Map the offset inside the 4096 byte window to an absolute
            // position in the linear output buffer.
            $offset = ((int)($out / 4096)) * 4096 + $offset;
            if ($offset >= $out) {
                $offset -= 4096;
            }
            if ($offset < 0) {
                // Points at dictionary bytes that were never written. A
                // negative string offset would silently read from the end
                // of the buffer, so give up instead.
                $this->_logger->notice('TNEF: Invalid reference in compressed RTF data.');
                break;
            }

            // Byte-wise copy on purpose: source and destination may overlap.
            $end = $offset + $length;
            while ($offset < $end) {
                $uncomp .= $uncomp[$offset++];
                ++$out;
            }
        }

        $this->_content = substr_replace($uncomp, "", 0, $length_preload);
    }

    /**
     * Parse RTF data and return the best plaintext representation we can.
     * Adapted from:
     * http://webcheatsheet.com/php/reading_the_clean_text_from_rtf.php
     *
     * Group state is kept in $stack, indexed by nesting depth $j; each entry
     * maps the control words seen in that group (and inherited from its
     * parents) to their parameter, or true when there is none. Text is only
     * emitted while the current group has seen at least one control word and
     * none of the destinations listed in _rtfIsPlain().
     *
     * @param string $text  The RTF text.
     *
     * @return string   The plaintext.
     */
    protected function _rtf2text($text)
    {
        $document = '';
        $stack = array();
        $j = -1;
        $len = strlen($text);

        // Read the data byte by byte.
        for ($i = 0; $i < $len; $i++) {
            $c = $text[$i];
            switch ($c) {
            case '\\':
                if ($i + 1 >= $len) {
                    // Dangling backslash at the very end; nothing follows.
                    break;
                }
                $nextCharacter = $text[$i + 1];
                $state = isset($stack[$j]) ? $stack[$j] : array();
                $plain = $this->_rtfIsPlain($state);

                if ($nextCharacter === '\\'
                    || $nextCharacter === '{'
                    || $nextCharacter === '}') {
                    // Escaped literal character.
                    if ($plain) {
                        $document .= $nextCharacter;
                    }
                    $i++;
                } elseif ($nextCharacter === '~') {
                    // Nonbreaking space.
                    if ($plain) {
                        $document .= ' ';
                    }
                    $i++;
                } elseif ($nextCharacter === '_') {
                    // Nonbreaking hyphen.
                    if ($plain) {
                        $document .= '-';
                    }
                    $i++;
                } elseif ($nextCharacter === '*') {
                    // Mark the group so its content is not emitted.
                    $stack[$j]['*'] = true;
                    $i++;
                } elseif ($nextCharacter === "'") {
                    // \'hh: two hexadecimal digits naming one character.
                    // NOTE(review): the value is used as a code point
                    // directly; the document code page is not honored.
                    $hex = (string)substr($text, $i + 2, 2);
                    if ($plain && preg_match('/^[0-9a-fA-F]{2}$/', $hex)) {
                        $document .= $this->_rtfChar(hexdec($hex));
                    }
                    // Skip the quote and both digits.
                    $i += 3;
                } elseif ($this->_rtfIsLetter($nextCharacter)) {
                    // Control word: letters, optionally followed by a
                    // (possibly negative) decimal parameter.
                    $word = '';
                    $param = null;
                    for ($k = $i + 1; $k < $len; $k++) {
                        $ch = $text[$k];
                        if ($this->_rtfIsLetter($ch)) {
                            // A letter after the parameter starts new text.
                            if ($param !== null) {
                                break;
                            }
                            $word .= $ch;
                        } elseif ($ch >= '0' && $ch <= '9') {
                            $param .= $ch;
                        } elseif ($ch === '-') {
                            // A minus sign may only open the parameter.
                            if ($param !== null) {
                                break;
                            }
                            $param = '-';
                        } else {
                            break;
                        }
                    }

                    // $pos is the first byte not belonging to the control
                    // word. A single delimiting space is part of the control
                    // word and must not reach the output.
                    $pos = $k;
                    if ($pos < $len && $text[$pos] === ' ') {
                        $pos++;
                    }

                    $word = Horde_String::lower($word);
                    $toText = '';
                    switch ($word) {
                    case 'u':
                        // Unicode character, written as a signed 16-bit
                        // decimal, followed by \ucN fallback characters
                        // (one unless stated otherwise) that are skipped.
                        if (is_numeric($param)) {
                            $code = (int)$param;
                            if ($code < 0) {
                                $code += 65536;
                            }
                            $toText .= $this->_rtfChar($code);
                        }
                        $ucDelta = isset($state['uc']) ? (int)$state['uc'] : 1;
                        $pos = $this->_rtfSkipFallback($text, $pos, $ucDelta);
                        break;
                    case 'par':
                    case 'page':
                    case 'column':
                    case 'line':
                    case 'lbr':
                        $toText .= "\n";
                        break;
                    case 'emspace':
                    case 'enspace':
                    case 'qmspace':
                        $toText .= ' ';
                        break;
                    case 'tab':
                        $toText .= "\t";
                        break;
                    case 'chdate':
                        $toText .= date('m.d.Y');
                        break;
                    case 'chdpl':
                        $toText .= date('l, j F Y');
                        break;
                    case 'chdpa':
                        $toText .= date('D, j M Y');
                        break;
                    case 'chtime':
                        $toText .= date('H:i:s');
                        break;
                    case 'emdash':
                        $toText .= $this->_rtfChar(0x2014);
                        break;
                    case 'endash':
                        $toText .= $this->_rtfChar(0x2013);
                        break;
                    case 'bullet':
                        $toText .= $this->_rtfChar(0x2022);
                        break;
                    case 'lquote':
                        $toText .= $this->_rtfChar(0x2018);
                        break;
                    case 'rquote':
                        $toText .= $this->_rtfChar(0x2019);
                        break;
                    case 'ldblquote':
                        $toText .= $this->_rtfChar(0x201C);
                        break;
                    case 'rdblquote':
                        $toText .= $this->_rtfChar(0x201D);
                        break;
                    default:
                        // Remember the control word for this group. A
                        // parameter of "0" is a real value, not "missing".
                        $stack[$j][$word] = ($param === null) ? true : $param;
                        break;
                    }

                    // Add data to the output stream if required.
                    if ($plain) {
                        $document .= $toText;
                    }

                    // The loop increment moves on to $pos.
                    $i = $pos - 1;
                } else {
                    // Any other control symbol: drop it.
                    $i++;
                }
                break;
            case '{':
                // New subgroup: it starts with a copy of its parent's state.
                $inherited = isset($stack[$j]) ? $stack[$j] : array();
                $stack[++$j] = $inherited;
                break;
            case '}':
                // Ignore closing braces that have no matching opener.
                if ($j >= 0) {
                    unset($stack[$j]);
                    $j--;
                }
                break;
            case "\0":
            case "\r":
            case "\f":
            case "\n":
                // Junk
                break;
            default:
                // Add other data to the output stream if required.
                $state = isset($stack[$j]) ? $stack[$j] : array();
                if (!empty($state) && $this->_rtfIsPlain($state)) {
                    $document .= $c;
                }
                break;
            }
        }

        return $document;
    }

    /**
     * Tell whether text inside a group with the given state should be
     * emitted, i.e. whether none of the non-text destinations is active.
     *
     * @param array $s  Control word state of the current group; may be null.
     *
     * @return boolean  False if the group belongs to a skipped destination.
     */
    protected function _rtfIsPlain($s)
    {
        $notPlain = array('*', 'fonttbl', 'colortbl', 'datastore', 'themedata', 'stylesheet');
        foreach ($notPlain as $destination) {
            if (!empty($s[$destination])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tell whether a single byte is an ASCII letter.
     *
     * @param string $ch  One byte.
     *
     * @return boolean
     */
    protected function _rtfIsLetter($ch)
    {
        $o = ord($ch);
        return ($o >= 65 && $o <= 90) || ($o >= 97 && $o <= 122);
    }

    /**
     * Return the UTF-8 encoding of a code point.
     *
     * @param integer $code  The code point.
     *
     * @return string  The character, or an empty string if $code is zero,
     *                 negative, a surrogate or beyond U+10FFFF.
     */
    protected function _rtfChar($code)
    {
        $code = (int)$code;
        if ($code <= 0
            || $code > 0x10FFFF
            || ($code >= 0xD800 && $code <= 0xDFFF)) {
            return '';
        }
        // The charset is given explicitly so the result does not depend on
        // the PHP version or on ini settings.
        return html_entity_decode('&#' . $code . ';', ENT_QUOTES, 'UTF-8');
    }

    /**
     * Skip the fallback characters that follow a \u control word.
     *
     * A \'hh escape or any other backslash pair counts as one character; a
     * brace ends the fallback because it delimits the group.
     *
     * @param string $text    The RTF text.
     * @param integer $pos    Index of the first fallback byte.
     * @param integer $count  Number of fallback characters to skip.
     *
     * @return integer  Index of the first byte after the fallback, at most
     *                  the length of $text.
     */
    protected function _rtfSkipFallback($text, $pos, $count)
    {
        $len = strlen($text);
        while ($count-- > 0 && $pos < $len) {
            $ch = $text[$pos];
            if ($ch === '{' || $ch === '}') {
                break;
            }
            if ($ch === '\\' && $pos + 1 < $len) {
                $pos += ($text[$pos + 1] === "'") ? 4 : 2;
            } else {
                $pos++;
            }
        }
        return min($pos, $len);
    }

}
367