<?php
/**
 * Copyright 2002-2017 Horde LLC (http://www.horde.org/)
 *
 * See the enclosed file COPYING for license information (LGPL). If you
 * did not receive this file, see http://www.horde.org/licenses/lgpl21.
 *
 * @author   Michael J Rubinsky <mrubinsk@horde.org>
 * @category Horde
 * @license  http://www.horde.org/licenses/lgpl21 LGPL 2.1
 * @package  Compress
 */

/**
 * Object to parse RTF data encapsulated in a TNEF file.
 *
 * The decoded RTF is available through the read-only magic property
 * "content"; toPlain() returns a rough plain text rendering of it.
 *
 * @author    Michael J Rubinsky <mrubinsk@horde.org>
 * @category  Horde
 * @copyright 2002-2017 Horde LLC
 * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
 * @package   Compress
 *
 * @property-read string $content  The decoded RTF content.
 */
class Horde_Compress_Tnef_Rtf extends Horde_Compress_Tnef_Object
{
    /**
     * Header magic numbers. Read as little-endian bytes they spell "MELA"
     * (stored, uncompressed RTF) and "LZFu" (compressed RTF).
     */
    const UNCOMPRESSED = 0x414c454d;
    const COMPRESSED   = 0x75465a4c;

    /**
     * RTF content.
     *
     * @var string
     */
    protected $_content = '';

    /**
     * Size of RTF content, as announced in the header.
     *
     * @var integer
     */
    protected $_size = 0;

    /**
     * MIME type.
     *
     * @var string
     */
    public $type = 'application/rtf';

    /**
     * Constructor. Hands both arguments to the parent constructor and then
     * immediately decodes the RTF payload.
     *
     * @param mixed $logger  The logger; debug() and notice() are called on
     *                       it by this class.
     * @param mixed $data    The raw RTF attribute data.
     */
    public function __construct($logger, $data)
    {
        parent::__construct($logger, $data);
        $this->_decode();
    }

    /**
     * Read-only accessor for virtual properties.
     *
     * @param string $property  The property name; only 'content' is
     *                          supported.
     *
     * @return string  The decoded RTF content.
     * @throws InvalidArgumentException for any other property name.
     */
    public function __get($property)
    {
        if ($property === 'content') {
            return $this->_content;
        }

        throw new InvalidArgumentException('Invalid property access.');
    }

    /**
     * Output the data for this object in an array.
     *
     * @return array
     *   - type: (string)    The MIME type of the content.
     *   - subtype: (string) The MIME subtype.
     *   - name: (string)    The filename.
     *   - stream: (string)  The file data.
     */
    public function toArray()
    {
        return array(
            'type'    => 'application',
            'subtype' => 'rtf',
            'name'    => 'Untitled.rtf',
            'stream'  => $this->_content
        );
    }

    /**
     * Obtain a good-enough-for-our-needs plain text representation of
     * the RTF document.
     *
     * @return string  The plaintext.
     */
    public function toPlain()
    {
        return $this->_rtf2text($this->_content);
    }

    /**
     * Read the RTF header (compressed size, raw size, magic, CRC) and
     * populate $_content according to the magic number.
     *
     * NOTE(review): _geti() is inherited; it presumably consumes each 32 bit
     * value from the front of $_data so that only the payload remains.
     * Confirm against the parent class. The CRC is logged but not verified.
     */
    protected function _decode()
    {
        $c_size = $this->_geti($this->_data, 32);
        $this->_size = $this->_geti($this->_data, 32);
        $magic = $this->_geti($this->_data, 32);
        $crc = $this->_geti($this->_data, 32);

        $this->_logger->debug(sprintf(
            'TNEF: compressed size: %s, size: %s, magic: %s, CRC: %s',
            $c_size, $this->_size, $magic, $crc)
        );

        switch ($magic) {
        case self::COMPRESSED:
            $this->_decompress();
            break;

        case self::UNCOMPRESSED:
            $this->_content = $this->_data;
            break;

        default:
            $this->_logger->notice('TNEF: Unknown RTF compression.');
        }
    }

    /**
     * Decompress compressed RTF into $_content. Logic taken and adapted from
     * the NasMail RTF plugin.
     *
     * The stream is a sequence of runs, each introduced by a flag byte whose
     * bits (least significant first) say whether the next token is a literal
     * byte (0) or a two byte dictionary reference (1). The dictionary is a
     * 4096 byte window over the output, pre-seeded with a fixed RTF
     * preamble.
     *
     * Decoding stops early, with a notice, if the input is truncated or a
     * reference points in front of the dictionary; whatever was decoded up
     * to that point is kept.
     *
     * @return void
     */
    protected function _decompress()
    {
        // The pre-seeded dictionary. The line break inside it is CR LF.
        $preload = "{\\rtf1\\ansi\\mac\\deff0\\deftab720{\\fonttbl;}{\\f0\\fnil \\froman \\fswiss \\fmodern \\fscript \\fdecor MS Sans SerifSymbolArialTimes New RomanCourier{\\colortbl\\red0\\green0\\blue0\r\n\\par \\pard\\plain\\f0\\fs20\\b\\i\\u\\tab\\tx";
        $length_preload = strlen($preload);

        // The output starts out as the dictionary so that references can be
        // resolved by plain string offsets; $out always equals
        // strlen($uncomp).
        $uncomp = $preload;
        $out = $length_preload;
        $in = $flags = $flag_count = 0;

        $data_length = strlen($this->_data);
        $target = $this->_size + $length_preload;

        while ($out < $target) {
            // A fresh flag byte is needed for every eighth token.
            if (($flag_count++ % 8) == 0) {
                if ($in >= $data_length) {
                    break;
                }
                $flags = ord($this->_data[$in++]);
            } else {
                $flags = $flags >> 1;
            }

            if (($flags & 1) != 0) {
                // Dictionary reference: 12 bit offset, 4 bit length.
                if ($in + 1 >= $data_length) {
                    break;
                }
                $offset = ord($this->_data[$in++]);
                $length = ord($this->_data[$in++]);
                $offset = ($offset << 4) | ($length >> 4);
                $length = ($length & 0xF) + 2;

                // Translate the window-relative offset into an absolute
                // position in $uncomp.
                $offset = ((int)($out / 4096)) * 4096 + $offset;
                if ($offset >= $out) {
                    $offset -= 4096;
                }
                if ($offset < 0) {
                    // Points at a part of the window never written.
                    break;
                }

                // Copy byte by byte: source and destination may overlap.
                $end = $offset + $length;
                while ($offset < $end) {
                    $uncomp .= $uncomp[$offset++];
                    ++$out;
                }
            } else {
                // Literal byte.
                if ($in >= $data_length) {
                    break;
                }
                $uncomp .= $this->_data[$in++];
                ++$out;
            }
        }

        if ($out < $target) {
            $this->_logger->notice('TNEF: Truncated or corrupt compressed RTF data.');
        }

        // Strip the dictionary again.
        $this->_content = substr_replace($uncomp, "", 0, $length_preload);
    }

    /**
     * Parse RTF data and return the best plaintext representation we can.
     * Adapted from:
     * http://webcheatsheet.com/php/reading_the_clean_text_from_rtf.php
     *
     * The text is scanned one byte at a time. $stack holds one array of
     * control-word flags per open group and $j is the index of the current
     * group; a new group inherits a copy of its parent's flags. Text is only
     * emitted while the current group is "plain" (see _rtfIsPlain()).
     *
     * @param string $text  The RTF text.
     *
     * @return string  The plaintext.
     */
    protected function _rtf2text($text)
    {
        $document = '';
        $stack = array();
        $j = -1;
        $len = strlen($text);

        // Control symbols that stand for a literal character.
        $symbols = array('\\' => '\\', '~' => ' ', '_' => '-');

        // Read the data character by character.
        for ($i = 0; $i < $len; $i++) {
            $c = $text[$i];
            switch ($c) {
            case '\\':
                // Start of a control word or control symbol. A backslash at
                // the very end of the input has nothing following it.
                $nextCharacter = isset($text[$i + 1]) ? $text[$i + 1] : '';

                if (isset($symbols[$nextCharacter])) {
                    // Escaped backslash, non-breaking space or hyphen.
                    if ($this->_rtfIsPlain($this->_rtfGroupState($stack, $j))) {
                        $document .= $symbols[$nextCharacter];
                    }
                } elseif ($nextCharacter === '*') {
                    // Mark the current group as an ignorable destination.
                    $stack[$j]['*'] = true;
                } elseif ($nextCharacter === "'") {
                    // The next two characters are the hexadecimal notation
                    // of a character to add to the output.
                    $hex = substr($text, $i + 2, 2);
                    if ($this->_rtfIsPlain($this->_rtfGroupState($stack, $j))) {
                        $document .= html_entity_decode('&#' . hexdec($hex) .';');
                    }
                    // Skip the two hex digits.
                    $i += 2;
                } elseif ($nextCharacter >= 'a' && $nextCharacter <= 'z'
                          || $nextCharacter >= 'A' && $nextCharacter <= 'Z') {
                    // A control word: letters, optionally followed by a
                    // (possibly negative) numeric parameter.
                    $word = '';
                    $param = null;
                    for ($k = $i + 1, $m = 0; $k < $len; $k++, $m++) {
                        $nextCharacter = $text[$k];
                        if ($nextCharacter >= 'a' && $nextCharacter <= 'z'
                            || $nextCharacter >= 'A' && $nextCharacter <= 'Z') {
                            // A letter after the parameter ends the word.
                            // The null check matters: "0" is a parameter.
                            if ($param !== null) {
                                break;
                            }
                            $word .= $nextCharacter;
                        } elseif ($nextCharacter >= '0' && $nextCharacter <= '9') {
                            $param .= $nextCharacter;
                        } elseif ($nextCharacter === '-') {
                            // A minus sign may only start the parameter.
                            if ($param !== null) {
                                break;
                            }
                            $param .= $nextCharacter;
                        } else {
                            break;
                        }
                    }

                    // Advance past the characters just read (the trailing
                    // $i++ below and the loop increment do the rest).
                    $i += $m - 1;

                    $toText = '';
                    $key = Horde_String::lower($word);
                    switch ($key) {
                    case 'u':
                        // \uN: N is the decimal code point. Values above
                        // 32767 are written as negative 16 bit numbers.
                        $codepoint = (int)$param;
                        if ($codepoint < 0) {
                            $codepoint += 65536;
                        }
                        $toText .= html_entity_decode('&#x' . dechex($codepoint) .';');

                        // If \ucN was seen, skip the N fallback characters
                        // that follow.
                        $state = $this->_rtfGroupState($stack, $j);
                        $ucDelta = isset($state['uc']) ? $state['uc'] : 0;
                        if ($ucDelta > 0) {
                            $i += $ucDelta;
                        }
                        break;

                    case 'uc':
                        // Stored as an integer so that \uc0 really means
                        // "no fallback characters"; a bare \uc counts as 1.
                        $stack[$j]['uc'] = ($param === null) ? 1 : (int)$param;
                        break;

                    case 'par':
                    case 'page':
                    case 'column':
                    case 'line':
                    case 'lbr':
                        $toText .= "\n";
                        break;

                    case 'emspace':
                    case 'enspace':
                    case 'qmspace':
                        $toText .= ' ';
                        break;

                    case 'tab':
                        $toText .= "\t";
                        break;

                    case 'chdate':
                        $toText .= date('m.d.Y');
                        break;

                    case 'chdpl':
                        $toText .= date('l, j F Y');
                        break;

                    case 'chdpa':
                        $toText .= date('D, j M Y');
                        break;

                    case 'chtime':
                        $toText .= date('H:i:s');
                        break;

                    case 'emdash':
                        $toText .= html_entity_decode('—');
                        break;

                    case 'endash':
                        $toText .= html_entity_decode('–');
                        break;

                    case 'bullet':
                        $toText .= html_entity_decode('•');
                        break;

                    case 'lquote':
                        $toText .= html_entity_decode('‘');
                        break;

                    case 'rquote':
                        $toText .= html_entity_decode('’');
                        break;

                    case 'ldblquote':
                        $toText .= html_entity_decode('«');
                        break;

                    case 'rdblquote':
                        $toText .= html_entity_decode('»');
                        break;

                    default:
                        // Remember any other control word as a flag on the
                        // current group.
                        $stack[$j][$key] = empty($param) ? true : $param;
                        break;
                    }

                    // Add data to the output stream if required.
                    if ($this->_rtfIsPlain($this->_rtfGroupState($stack, $j))) {
                        $document .= $toText;
                    }
                }
                $i++;
                break;

            case '{':
                // A new group starts: it inherits a copy of the flags of
                // the enclosing group, if that has any.
                if (!empty($stack[$j])) {
                    $stack[] = $stack[$j];
                }
                $j++;
                break;

            case '}':
                array_pop($stack);
                $j--;
                break;

            // These must be double quoted: in single quotes they would be
            // two character strings that never match a single byte.
            case "\0":
            case "\r":
            case "\f":
            case "\n":
                // Junk
                break;

            default:
                // Add other data to the output stream if required.
                if (!empty($stack[$j]) &&
                    $this->_rtfIsPlain($stack[$j])) {
                    $document .= $c;
                }
                break;
            }
        }

        return $document;
    }

    /**
     * Return the flags of the group at the given stack index, or an empty
     * array if no such group exists (text outside any group, unbalanced
     * braces, or a group that has not had any flag set yet).
     *
     * @param array $stack    The group stack.
     * @param integer $index  The index of the wanted group.
     *
     * @return array  The group's flags.
     */
    protected function _rtfGroupState(array $stack, $index)
    {
        return isset($stack[$index]) ? $stack[$index] : array();
    }

    /**
     * Tell whether a group holds document text, as opposed to being one of
     * the destinations whose content must not show up in the plain text.
     *
     * @param array $s  The flags of the group.
     *
     * @return boolean  False if any of the non-text destination flags is
     *                  set on the group, true otherwise.
     */
    protected function _rtfIsPlain($s)
    {
        $notPlain = array('*', 'fonttbl', 'colortbl', 'datastore', 'themedata', 'stylesheet');
        foreach ($notPlain as $destination) {
            if (!empty($s[$destination])) {
                return false;
            }
        }

        return true;
    }

}