1# -- 2# Copyright (C) 2001-2020 OTRS AG, https://otrs.com/ 3# -- 4# This software comes with ABSOLUTELY NO WARRANTY. For details, see 5# the enclosed file COPYING for license information (GPL). If you 6# did not receive this file, see https://www.gnu.org/licenses/gpl-3.0.txt. 7# -- 8 9package Kernel::System::HTMLUtils; 10 11use strict; 12use warnings; 13 14use utf8; 15 16use MIME::Base64; 17 18our @ObjectDependencies = ( 19 'Kernel::Config', 20 'Kernel::System::Encode', 21 'Kernel::System::Log', 22); 23 24=head1 NAME 25 26Kernel::System::HTMLUtils - creating and modifying html strings 27 28=head1 DESCRIPTION 29 30A module for creating and modifying html strings. 31 32=head1 PUBLIC INTERFACE 33 34=head2 new() 35 36Don't use the constructor directly, use the ObjectManager instead: 37 38 my $HTMLUtilsObject = $Kernel::OM->Get('Kernel::System::HTMLUtils'); 39 40=cut 41 42sub new { 43 my ( $Type, %Param ) = @_; 44 45 # allocate new hash for object 46 my $Self = {}; 47 bless( $Self, $Type ); 48 49 # get debug level from parent 50 $Self->{Debug} = $Param{Debug} || 0; 51 52 return $Self; 53} 54 55=head2 ToAscii() 56 57convert an HTML string to an ASCII string 58 59 my $Ascii = $HTMLUtilsObject->ToAscii( String => $String ); 60 61=cut 62 63sub ToAscii { 64 my ( $Self, %Param ) = @_; 65 66 # check needed stuff 67 for (qw(String)) { 68 if ( !defined $Param{$_} ) { 69 $Kernel::OM->Get('Kernel::System::Log')->Log( 70 Priority => 'error', 71 Message => "Need $_!" 72 ); 73 return; 74 } 75 } 76 77 # make sure to flag the input string as unicode (utf8) because replacements below can 78 # introduce unicode encoded characters (see bug#10970, bug#11596 and bug#12097 for more info) 79 $Kernel::OM->Get('Kernel::System::Encode')->EncodeInput( \$Param{String} ); 80 81 # get length of line for forcing line breakes 82 my $LineLength = $Kernel::OM->Get('Kernel::Config')->Get('Ticket::Frontend::TextAreaNote') || 78; 83 84 # find <a href=....> and replace it with [x] 85 my $LinkList = ''; 86 my $Counter = 0; 87 $Param{String} =~ s{ 88 <a\s.*?href=("|')(.+?)("|').*?> 89 } 90 { 91 my $Link = $2; 92 $Counter++; 93 $LinkList .= "[$Counter] $Link\n"; 94 "[$Counter]"; 95 }egxi; 96 97 # pre-process <blockquote> and <div style=\"cite\" 98 my %Cite; 99 $Counter = 0; 100 $Param{String} =~ s{ 101 <blockquote(.*?)>(.+?)</blockquote> 102 } 103 { 104 my $Ascii = $Self->ToAscii( 105 String => $2, 106 ); 107 # force line breaking 108 if ( length $Ascii > $LineLength ) { 109 $Ascii =~ s/(.{4,$LineLength})(?:\s|\z)/$1\n/gm; 110 } 111 $Ascii =~ s/^(.*?)$/> $1/gm; 112 $Counter++; 113 my $Key = "######Cite::$Counter######"; 114 $Cite{$Key} = $Ascii; 115 $Key; 116 }segxmi; 117 $Param{String} =~ s{ 118 <div.+?type="cite"[^>]*>(.+?)</div> 119 } 120 { 121 my $Ascii = $Self->ToAscii( 122 String => $1, 123 ); 124 # force line breaking 125 if ( length $Ascii > $LineLength ) { 126 $Ascii =~ s/(.{4,$LineLength})(?:\s|\z)/$1\n/gm; 127 } 128 $Ascii =~ s/^(.*?)$/> $1/gm; 129 $Counter++; 130 my $Key = "######Cite::$Counter######"; 131 $Cite{$Key} = $Ascii; 132 $Key; 133 }segxmi; 134 135 # remember <pre> and <code> tags 136 my %One2One; 137 $Counter = 0; 138 $Param{String} =~ s{ 139 <(pre|code)(.*?)>(.+?)</(pre|code)(.*?)> 140 } 141 { 142 my $Content = $3; 143 $Counter++; 144 my $Key = "######One2One::$Counter######"; 145 $One2One{$Key} = $Content; 146 $Key; 147 }segxmi; 148 149 # remove comments at the first place to avoid to much work 150 # for the regex engine 151 $Param{String} =~ s{<!-- .*? -->}{}xmgsi; 152 153 # remove empty lines 154 $Param{String} =~ s/^\s*//mg; 155 156 # fix some bad stuff from opera and others 157 $Param{String} =~ s/(\n\r|\r\r\n|\r\n)/\n/gs; 158 159 # remove new line after <br> 160 $Param{String} =~ s/(\<br(\s{0,3}|\s{1,3}.+?)(\/|)\>)(\n|\r)/$1/gsi; 161 162 # replace new lines with one space 163 $Param{String} =~ s/\n/ /gs; 164 $Param{String} =~ s/\r/ /gs; 165 166 # remove style tags 167 $Param{String} =~ s{<style [^>]*? />}{}xgsi; 168 $Param{String} =~ s{<style [^>]*? > .*? </style[^>]*>}{}xgsi; 169 170 # remove <br>,<br/>,<br />, <br class="name"/>, tags and replace it with \n 171 $Param{String} =~ s/\<br(\s{0,3}|\s{1,3}.+?)(\/|)\>/\n/gsi; 172 173 # remove </div> tags and replace it with \n 174 $Param{String} =~ s/<\/(\s{0,3})div>/\n/gsi; 175 176 # remove hr tags and replace it with \n 177 $Param{String} =~ s/\<(hr|hr.+?)\>/\n\n/gsi; 178 179 # remove p, table tags and replace it with \n 180 $Param{String} =~ s/\<(\/|)(p|p.+?|table|table.+?)\>/\n\n/gsi; 181 182 # remove opening tr, th tags and replace them with \n 183 $Param{String} =~ s/\<(tr|tr.+?|th|th.+?)\>/\n\n/gsi; 184 185 # convert li tags to \n - 186 $Param{String} =~ s/\<li\>/\n - /gsi; 187 188 # convert </ul> and </ol> tags to \n\n 189 $Param{String} =~ s/\<\/(ul|ol)\>/\n\n/gsi; 190 191 # remove </td> tags and replace them with " " 192 $Param{String} =~ s/<\/td[^>]*>/ /gsi; 193 194 # replace multiple spaces with just one space 195 $Param{String} =~ s/[ ]{2,}/ /mg; 196 197 # remember <pre> and <code> tags and replace it 198 for my $Key ( sort keys %One2One ) { 199 $Param{String} =~ s/$Key/\n\n\n$One2One{$Key}\n\n/g; 200 } 201 202 # strip all other tags 203 $Param{String} =~ s/\<.+?\>//gs; 204 205 # html encode based on cpan's HTML::Entities v1.35 206 my %Entity = ( 207 208 # Some normal chars that have special meaning in SGML context 209 amp => '&', # ampersand 210 'gt' => '>', # greater than 211 'lt' => '<', # less than 212 quot => '"', # double quote 213 apos => "'", # single quote 214 215 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML 216 AElig => chr(198), # capital AE diphthong (ligature) 217 Aacute => chr(193), # capital A, acute accent 218 Acirc => chr(194), # capital A, circumflex accent 219 Agrave => chr(192), # capital A, grave accent 220 Aring => chr(197), # capital A, ring 221 Atilde => chr(195), # capital A, tilde 222 Auml => chr(196), # capital A, dieresis or umlaut mark 223 Ccedil => chr(199), # capital C, cedilla 224 ETH => chr(208), # capital Eth, Icelandic 225 Eacute => chr(201), # capital E, acute accent 226 Ecirc => chr(202), # capital E, circumflex accent 227 Egrave => chr(200), # capital E, grave accent 228 Euml => chr(203), # capital E, dieresis or umlaut mark 229 Iacute => chr(205), # capital I, acute accent 230 Icirc => chr(206), # capital I, circumflex accent 231 Igrave => chr(204), # capital I, grave accent 232 Iuml => chr(207), # capital I, dieresis or umlaut mark 233 Ntilde => chr(209), # capital N, tilde 234 Oacute => chr(211), # capital O, acute accent 235 Ocirc => chr(212), # capital O, circumflex accent 236 Ograve => chr(210), # capital O, grave accent 237 Oslash => chr(216), # capital O, slash 238 Otilde => chr(213), # capital O, tilde 239 Ouml => chr(214), # capital O, dieresis or umlaut mark 240 THORN => chr(222), # capital THORN, Icelandic 241 Uacute => chr(218), # capital U, acute accent 242 Ucirc => chr(219), # capital U, circumflex accent 243 Ugrave => chr(217), # capital U, grave accent 244 Uuml => chr(220), # capital U, dieresis or umlaut mark 245 Yacute => chr(221), # capital Y, acute accent 246 aacute => chr(225), # small a, acute accent 247 acirc => chr(226), # small a, circumflex accent 248 aelig => chr(230), # small ae diphthong (ligature) 249 agrave => chr(224), # small a, grave accent 250 aring => chr(229), # small a, ring 251 atilde => chr(227), # small a, tilde 252 auml => chr(228), # small a, dieresis or umlaut mark 253 ccedil => chr(231), # small c, cedilla 254 eacute => chr(233), # small e, acute accent 255 ecirc => chr(234), # small e, circumflex accent 256 egrave => chr(232), # small e, grave accent 257 eth => chr(240), # small eth, Icelandic 258 euml => chr(235), # small e, dieresis or umlaut mark 259 iacute => chr(237), # small i, acute accent 260 icirc => chr(238), # small i, circumflex accent 261 igrave => chr(236), # small i, grave accent 262 iuml => chr(239), # small i, dieresis or umlaut mark 263 ntilde => chr(241), # small n, tilde 264 oacute => chr(243), # small o, acute accent 265 ocirc => chr(244), # small o, circumflex accent 266 ograve => chr(242), # small o, grave accent 267 oslash => chr(248), # small o, slash 268 otilde => chr(245), # small o, tilde 269 ouml => chr(246), # small o, dieresis or umlaut mark 270 szlig => chr(223), # small sharp s, German (sz ligature) 271 thorn => chr(254), # small thorn, Icelandic 272 uacute => chr(250), # small u, acute accent 273 ucirc => chr(251), # small u, circumflex accent 274 ugrave => chr(249), # small u, grave accent 275 uuml => chr(252), # small u, dieresis or umlaut mark 276 yacute => chr(253), # small y, acute accent 277 yuml => chr(255), # small y, dieresis or umlaut mark 278 279 # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) 280 copy => chr(169), # copyright sign 281 reg => chr(174), # registered sign 282 nbsp => chr(160), # non breaking space 283 284 # Additional ISO-8859/1 entities listed in rfc1866 (section 14) 285 iexcl => chr(161), 286 cent => chr(162), 287 pound => chr(163), 288 curren => chr(164), 289 yen => chr(165), 290 brvbar => chr(166), 291 sect => chr(167), 292 uml => chr(168), 293 ordf => chr(170), 294 laquo => chr(171), 295 'not' => chr(172), # not is a keyword in perl 296 shy => chr(173), 297 macr => chr(175), 298 deg => chr(176), 299 plusmn => chr(177), 300 sup1 => chr(185), 301 sup2 => chr(178), 302 sup3 => chr(179), 303 acute => chr(180), 304 micro => chr(181), 305 para => chr(182), 306 middot => chr(183), 307 cedil => chr(184), 308 ordm => chr(186), 309 raquo => chr(187), 310 frac14 => chr(188), 311 frac12 => chr(189), 312 frac34 => chr(190), 313 iquest => chr(191), 314 'times' => chr(215), # times is a keyword in perl 315 divide => chr(247), 316 317 ( 318 $] > 5.007 319 ? ( 320 OElig => chr(338), 321 oelig => chr(339), 322 Scaron => chr(352), 323 scaron => chr(353), 324 Yuml => chr(376), 325 fnof => chr(402), 326 circ => chr(710), 327 tilde => chr(732), 328 Alpha => chr(913), 329 Beta => chr(914), 330 Gamma => chr(915), 331 Delta => chr(916), 332 Epsilon => chr(917), 333 Zeta => chr(918), 334 Eta => chr(919), 335 Theta => chr(920), 336 Iota => chr(921), 337 Kappa => chr(922), 338 Lambda => chr(923), 339 Mu => chr(924), 340 Nu => chr(925), 341 Xi => chr(926), 342 Omicron => chr(927), 343 Pi => chr(928), 344 Rho => chr(929), 345 Sigma => chr(931), 346 Tau => chr(932), 347 Upsilon => chr(933), 348 Phi => chr(934), 349 Chi => chr(935), 350 Psi => chr(936), 351 Omega => chr(937), 352 alpha => chr(945), 353 beta => chr(946), 354 gamma => chr(947), 355 delta => chr(948), 356 epsilon => chr(949), 357 zeta => chr(950), 358 eta => chr(951), 359 theta => chr(952), 360 iota => chr(953), 361 kappa => chr(954), 362 lambda => chr(955), 363 mu => chr(956), 364 nu => chr(957), 365 xi => chr(958), 366 omicron => chr(959), 367 pi => chr(960), 368 rho => chr(961), 369 sigmaf => chr(962), 370 sigma => chr(963), 371 tau => chr(964), 372 upsilon => chr(965), 373 phi => chr(966), 374 chi => chr(967), 375 psi => chr(968), 376 omega => chr(969), 377 thetasym => chr(977), 378 upsih => chr(978), 379 piv => chr(982), 380 ensp => chr(8194), 381 emsp => chr(8195), 382 thinsp => chr(8201), 383 zwnj => chr(8204), 384 zwj => chr(8205), 385 lrm => chr(8206), 386 rlm => chr(8207), 387 ndash => chr(8211), 388 mdash => chr(8212), 389 lsquo => chr(8216), 390 rsquo => chr(8217), 391 sbquo => chr(8218), 392 ldquo => chr(8220), 393 rdquo => chr(8221), 394 bdquo => chr(8222), 395 dagger => chr(8224), 396 Dagger => chr(8225), 397 bull => chr(8226), 398 hellip => chr(8230), 399 permil => chr(8240), 400 prime => chr(8242), 401 Prime => chr(8243), 402 lsaquo => chr(8249), 403 rsaquo => chr(8250), 404 oline => chr(8254), 405 frasl => chr(8260), 406 euro => chr(8364), 407 image => chr(8465), 408 weierp => chr(8472), 409 real => chr(8476), 410 trade => chr(8482), 411 alefsym => chr(8501), 412 larr => chr(8592), 413 uarr => chr(8593), 414 rarr => chr(8594), 415 darr => chr(8595), 416 harr => chr(8596), 417 crarr => chr(8629), 418 lArr => chr(8656), 419 uArr => chr(8657), 420 rArr => chr(8658), 421 dArr => chr(8659), 422 hArr => chr(8660), 423 forall => chr(8704), 424 part => chr(8706), 425 exist => chr(8707), 426 empty => chr(8709), 427 nabla => chr(8711), 428 isin => chr(8712), 429 notin => chr(8713), 430 ni => chr(8715), 431 prod => chr(8719), 432 sum => chr(8721), 433 minus => chr(8722), 434 lowast => chr(8727), 435 radic => chr(8730), 436 prop => chr(8733), 437 infin => chr(8734), 438 ang => chr(8736), 439 'and' => chr(8743), 440 'or' => chr(8744), 441 cap => chr(8745), 442 cup => chr(8746), 443 'int' => chr(8747), 444 there4 => chr(8756), 445 sim => chr(8764), 446 cong => chr(8773), 447 asymp => chr(8776), 448 'ne' => chr(8800), 449 equiv => chr(8801), 450 'le' => chr(8804), 451 'ge' => chr(8805), 452 'sub' => chr(8834), 453 sup => chr(8835), 454 nsub => chr(8836), 455 sube => chr(8838), 456 supe => chr(8839), 457 oplus => chr(8853), 458 otimes => chr(8855), 459 perp => chr(8869), 460 sdot => chr(8901), 461 lceil => chr(8968), 462 rceil => chr(8969), 463 lfloor => chr(8970), 464 rfloor => chr(8971), 465 lang => chr(9001), 466 rang => chr(9002), 467 loz => chr(9674), 468 spades => chr(9824), 469 clubs => chr(9827), 470 hearts => chr(9829), 471 diams => chr(9830), 472 ) 473 : () 474 ) 475 ); 476 477 # encode html entities like "–" 478 $Param{String} =~ s{ 479 (&\#(\d+);?) 480 } 481 { 482 my $ChrOrig = $1; 483 my $Dec = $2; 484 485 # Don't process UTF-16 surrogate pairs. Used on their own, these are not valid UTF-8 code 486 # points and can result in errors in old Perl versions. See bug#12588 for more information. 487 # - High Surrogate codes (U+D800-U+DBFF) 488 # - Low Surrogate codes (U+DC00-U+DFFF) 489 if ( $Dec >= 55296 && $Dec <= 57343 ) { 490 $ChrOrig; 491 } 492 else { 493 my $Chr = chr($Dec); 494 495 # Make sure we get valid UTF8 code points, but skip characters from 128 to 255 496 # (inclusive), since they are by default internally not encoded as UTF-8 for 497 # backward compatibility reasons. See bug#12457 for more information. 498 if ( $Dec < 128 || $Dec> 255 ) { 499 Encode::_utf8_off($Chr); 500 $Chr = Encode::decode('utf-8', $Chr, 0); 501 } 502 503 if ( $Chr ) { 504 $Chr; 505 } 506 else { 507 $ChrOrig; 508 } 509 } 510 }egx; 511 512 # encode html entities like "=" 513 $Param{String} =~ s{ 514 (&\#[xX]([0-9a-fA-F]+);?) 515 } 516 { 517 my $ChrOrig = $1; 518 my $Dec = hex( $2 ); 519 520 # Don't process UTF-16 surrogate pairs. Used on their own, these are not valid UTF-8 code 521 # points and can result in errors in old Perl versions. See bug#12588 for more information. 522 # - High Surrogate codes (U+D800-U+DBFF) 523 # - Low Surrogate codes (U+DC00-U+DFFF) 524 if ( $Dec >= 55296 && $Dec <= 57343 ) { 525 $ChrOrig; 526 } 527 else { 528 if ( $Dec ) { 529 my $Chr = chr( $Dec ); 530 531 # Make sure we get valid UTF8 code points, but skip characters from 128 to 255 532 # (inclusive), since they are by default internally not encoded as UTF-8 for 533 # backward compatibility reasons. See bug#12457 for more information. 534 if ( $Dec < 128 || $Dec > 255 ) { 535 Encode::_utf8_off($Chr); 536 $Chr = Encode::decode('utf-8', $Chr, 0); 537 } 538 539 if ( $Chr ) { 540 $Chr; 541 } 542 else { 543 $ChrOrig; 544 } 545 } 546 else { 547 $ChrOrig; 548 } 549 } 550 }egx; 551 552 # encode html entities like "&" 553 $Param{String} =~ s{ 554 (&(\w+);?) 555 } 556 { 557 if ( $Entity{$2} ) { 558 $Entity{$2}; 559 } 560 else { 561 $1; 562 } 563 }egx; 564 565 # remove empty lines 566 $Param{String} =~ s/^\s*\n\s*\n/\n/mg; 567 568 # force line breaking 569 if ( length $Param{String} > $LineLength ) { 570 $Param{String} =~ s/(.{4,$LineLength})(?:\s|\z)/$1\n/gm; 571 } 572 573 # remember <blockquote> and <div style=\"cite\" 574 for my $Key ( sort keys %Cite ) { 575 $Param{String} =~ s/$Key/$Cite{$Key}\n/g; 576 } 577 578 # add extracted links 579 if ($LinkList) { 580 $Param{String} .= "\n\n" . $LinkList; 581 } 582 583 return $Param{String}; 584} 585 586=head2 ToHTML() 587 588convert an ASCII string to an HTML string 589 590 my $HTMLString = $HTMLUtilsObject->ToHTML( 591 String => $String, 592 ReplaceDoubleSpace => 0, # replace with " ", optional 1 or 0 (defaults to 1) 593 ); 594 595=cut 596 597sub ToHTML { 598 my ( $Self, %Param ) = @_; 599 600 # check needed stuff 601 for (qw(String)) { 602 if ( !defined $Param{$_} ) { 603 $Kernel::OM->Get('Kernel::System::Log')->Log( 604 Priority => 'error', 605 Message => "Need $_!" 606 ); 607 return; 608 } 609 } 610 611 # fix some bad stuff from opera and others 612 $Param{String} =~ s/(\n\r|\r\r\n|\r\n)/\n/gs; 613 614 $Param{String} =~ s/&/&/g; 615 $Param{String} =~ s/</</g; 616 $Param{String} =~ s/>/>/g; 617 $Param{String} =~ s/"/"/g; 618 $Param{String} =~ s/(\n|\r)/<br\/>\n/g; 619 $Param{String} =~ s/ / /g if $Param{ReplaceDoubleSpace}; 620 621 return $Param{String}; 622} 623 624=head2 DocumentComplete() 625 626check and e. g. add <html> and <body> tags to given html string 627 628 my $HTMLDocument = $HTMLUtilsObject->DocumentComplete( 629 String => $String, 630 Charset => $Charset, 631 ); 632 633=cut 634 635sub DocumentComplete { 636 my ( $Self, %Param ) = @_; 637 638 # check needed stuff 639 for (qw(String Charset)) { 640 if ( !defined $Param{$_} ) { 641 $Kernel::OM->Get('Kernel::System::Log')->Log( 642 Priority => 'error', 643 Message => "Need $_!" 644 ); 645 return; 646 } 647 } 648 649 return $Param{String} if $Param{String} =~ /<html>/i; 650 651 my $Css = $Kernel::OM->Get('Kernel::Config')->Get('Frontend::RichText::DefaultCSS') 652 || 'font-size: 12px; font-family:Courier,monospace,fixed;'; 653 654 # escape special characters like double-quotes, e.g. used in font names with spaces 655 $Css = $Self->ToHTML( String => $Css ); 656 657 # Use the HTML5 doctype because it is compatible with HTML4 and causes the browsers 658 # to render the content in standards mode, which is more safe than quirks mode. 659 my $Body = '<!DOCTYPE html><html><head>'; 660 $Body 661 .= '<meta http-equiv="Content-Type" content="text/html; charset=' . $Param{Charset} . '"/>'; 662 $Body .= '</head><body style="' . $Css . '">' . $Param{String} . '</body></html>'; 663 return $Body; 664} 665 666=head2 DocumentStrip() 667 668remove html document tags from string 669 670 my $HTMLString = $HTMLUtilsObject->DocumentStrip( 671 String => $String, 672 ); 673 674=cut 675 676sub DocumentStrip { 677 my ( $Self, %Param ) = @_; 678 679 # check needed stuff 680 for (qw(String)) { 681 if ( !defined $Param{$_} ) { 682 $Kernel::OM->Get('Kernel::System::Log')->Log( 683 Priority => 'error', 684 Message => "Need $_!" 685 ); 686 return; 687 } 688 } 689 690 $Param{String} =~ s/^<\!DOCTYPE\s+HTML.+?>//gsi; 691 $Param{String} =~ s/<head>.+?<\/head>//gsi; 692 $Param{String} =~ s/<(html|body)(.*?)>//gsi; 693 $Param{String} =~ s/<\/(html|body)>//gsi; 694 695 return $Param{String}; 696} 697 698=head2 DocumentCleanup() 699 700perform some sanity checks on HTML content. 701 702 - Replace MS Word 12 <p|div> with class "MsoNormal" by using <br/> because 703 it's not used as <p><div> (margin:0cm; margin-bottom:.0001pt;). 704 705 - Replace <blockquote> by using 706 "<div style="border:none;border-left:solid blue 1.5pt;padding:0cm 0cm 0cm 4.0pt" type="cite">" 707 because of cross mail client and browser compatibility. 708 709 - If there is no HTML doctype present, inject the HTML5 doctype, because it is compatible with HTML4 710 and causes the browsers to render the content in standards mode, which is safer. 711 712 $HTMLBody = $HTMLUtilsObject->DocumentCleanup( 713 String => $HTMLBody, 714 ); 715 716=cut 717 718sub DocumentCleanup { 719 my ( $Self, %Param ) = @_; 720 721 # check needed stuff 722 for (qw(String)) { 723 if ( !defined $Param{$_} ) { 724 $Kernel::OM->Get('Kernel::System::Log')->Log( 725 Priority => 'error', 726 Message => "Need $_!" 727 ); 728 return; 729 } 730 } 731 732 # If the string starts with <html> directly, inject the doctype 733 $Param{String} =~ s{ \A \s* <html }{<!DOCTYPE html><html}gsmix; 734 735 # remove <base> tags - see bug#8880 736 $Param{String} =~ s{<base .*?>}{}xmsi; 737 738 # replace MS Word 12 <p|div> with class "MsoNormal" by using <br/> because 739 # it's not used as <p><div> (margin:0cm; margin-bottom:.0001pt;) 740 $Param{String} =~ s{ 741 <p\s{1,3}class=(|"|')MsoNormal(|"|')(.*?)>(.+?)</p> 742 } 743 { 744 $4 . '<br/>'; 745 }segxmi; 746 747 $Param{String} =~ s{ 748 <div\s{1,3}class=(|"|')MsoNormal(|"|')(.*?)>(.+?)</div> 749 } 750 { 751 $4 . '<br/>'; 752 }segxmi; 753 754 # replace <blockquote> by using 755 # "<div style="border:none;border-left:solid blue 1.5pt;padding:0cm 0cm 0cm 4.0pt" type="cite">" 756 # because of cross mail client and browser compatability 757 my $Style = "border:none;border-left:solid blue 1.5pt;padding:0cm 0cm 0cm 4.0pt"; 758 for ( 1 .. 10 ) { 759 $Param{String} =~ s{ 760 <blockquote(.*?)>(.+?)</blockquote> 761 } 762 { 763 "<div $1 style=\"$Style\">$2</div>"; 764 }segxmi; 765 } 766 767 return $Param{String}; 768} 769 770=head2 LinkQuote() 771 772detect links in HTML code, add C<a href> if missing 773 774 my $HTMLWithLinks = $HTMLUtilsObject->LinkQuote( 775 String => $HTMLString, 776 Target => 'TargetName', # content of target="?", e. g. _blank 777 TargetAdd => 1, # add target="_blank" to all existing "<a href" 778 ); 779 780also string ref is possible 781 782 my $HTMLWithLinksRef = $HTMLUtilsObject->LinkQuote( 783 String => \$HTMLStringRef, 784 ); 785 786=cut 787 788sub LinkQuote { 789 my ( $Self, %Param ) = @_; 790 791 my $String = $Param{String} || ''; 792 793 # check ref 794 my $StringScalar; 795 if ( !ref $String ) { 796 $StringScalar = $String; 797 $String = \$StringScalar; 798 799 # return if string is not a ref and it is empty 800 return $StringScalar if !$StringScalar; 801 } 802 803 # add target to already existing url of html string 804 if ( $Param{TargetAdd} ) { 805 806 # find target 807 my $Target = $Param{Target}; 808 if ( !$Target ) { 809 $Target = '_blank'; 810 } 811 812 # add target to existing "<a href" 813 ${$String} =~ s{ 814 (<a\s{1,10})([^>]+)> 815 } 816 { 817 my $Start = $1; 818 my $Value = $2; 819 if ( $Value !~ /href=/i || $Value =~ /target=/i ) { 820 "$Start$Value>"; 821 } 822 else { 823 "$Start$Value target=\"$Target\">"; 824 } 825 }egxsi; 826 } 827 828 my $Marker = "§" x 10; 829 830 # Remove existing <a>...</a> tags and their content to be re-inserted later, this must not be quoted. 831 # Also remove other tags to avoid quoting in tag parameters. 832 my $Counter = 0; 833 my %TagHash; 834 ${$String} =~ s{ 835 (<a\s[^>]*?>[^>]*</a>|<[^>]+?>) 836 } 837 { 838 my $Content = $1; 839 my $Key = "${Marker}TagHash-$Counter${Marker}"; 840 $TagHash{$Counter++} = $Content; 841 $Key; 842 }egxism; 843 844 # Add <a> tags for URLs in the content. 845 my $Target = ''; 846 if ( $Param{Target} ) { 847 $Target = " target=\"$Param{Target}\""; 848 } 849 ${$String} =~ s{ 850 ( # $1 greater-than and less-than sign 851 > | < | \s+ | §{10} | 852 (?: &[a-zA-Z0-9]+; ) # get html entities 853 ) 854 ( # $2 855 (?: # http or only www 856 (?: (?: http s? | ftp ) :\/\/) | # http://,https:// and ftp:// 857 (?: (?: www | ftp ) \.) # www. and ftp. 858 ) 859 ) 860 ( # $3 861 (?: [a-z0-9\-]+ \. )* # get subdomains, optional 862 [a-z0-9\-]+ # get top level domain 863 (?: # optional port number 864 [:] 865 [0-9]+ 866 )? 867 (?: # file path element 868 [\/\.] 869 | [a-zA-Z0-9\-_=%] 870 )* 871 (?: # param string 872 [\?] # if param string is there, "?" must be present 873 [a-zA-Z0-9&;=%\-_:\.\/]* # param string content, this will also catch entities like & 874 )? 875 (?: # link hash string 876 [\#] # 877 [a-zA-Z0-9&;=%\-_:\.\/]* # hash string content, this will also catch entities like & 878 )? 879 ) 880 (?= # $4 881 (?: 882 [\?,;!\.\)] (?: \s | $ ) # \)\s this construct is because of bug# 2450 883 | \" 884 | \] 885 | \s+ 886 | ' 887 | > # greater-than and less-than sign 888 | < # " 889 | (?: &[a-zA-Z0-9]+; )+ # html entities 890 | $ # bug# 2715 891 ) 892 | §{10} # ending TagHash 893 ) 894 } 895 { 896 my $Start = $1; 897 my $Protocol = $2; 898 my $Link = $3; 899 my $End = $4 || ''; 900 901 # there may different links for href and link body 902 my $HrefLink; 903 my $DisplayLink; 904 905 if ( $Protocol =~ m{\A ( http | https | ftp ) : \/ \/ }xi ) { 906 $DisplayLink = $Protocol . $Link; 907 $HrefLink = $DisplayLink; 908 } 909 else { 910 if ($Protocol =~ m{\A ftp }smx ) { 911 $HrefLink = 'ftp://'; 912 } 913 else { 914 $HrefLink = 'http://'; 915 } 916 917 if ( $Protocol ) { 918 $HrefLink .= $Protocol; 919 $DisplayLink = $Protocol; 920 } 921 922 $DisplayLink .= $Link; 923 $HrefLink .= $Link; 924 } 925 $Start . "<a href=\"$HrefLink\"$Target title=\"$HrefLink\">$DisplayLink<\/a>" . $End; 926 }egxism; 927 928 # Re-add previously removed tags. 929 ${$String} =~ s{${Marker}TagHash-(\d+)${Marker}}{$TagHash{$1}}egsxim; 930 931 # check ref && return result like called 932 if ( defined $StringScalar ) { 933 return ${$String}; 934 } 935 return $String; 936} 937 938=head2 Safety() 939 940To remove/strip active html tags/addons (javascript, C<applet>s, C<embed>s and C<object>s) 941from html strings. 942 943 my %Safe = $HTMLUtilsObject->Safety( 944 String => $HTMLString, 945 NoApplet => 1, 946 NoObject => 1, 947 NoEmbed => 1, 948 NoSVG => 1, 949 NoImg => 1, 950 NoIntSrcLoad => 0, 951 NoExtSrcLoad => 1, 952 NoJavaScript => 1, 953 ReplacementStr => 'string', # optional, string to show instead of applet, object, embed, svg and img tags 954 ); 955 956also string ref is possible 957 958 my %Safe = $HTMLUtilsObject->Safety( 959 String => \$HTMLStringRef, 960 NoApplet => 1, 961 NoObject => 1, 962 NoEmbed => 1, 963 NoSVG => 1, 964 NoImg => 1, 965 NoIntSrcLoad => 0, 966 NoExtSrcLoad => 1, 967 NoJavaScript => 1, 968 ); 969 970returns 971 972 my %Safe = ( 973 String => $HTMLString, # modified html string (scalar or ref) 974 Replace => 1, # info if something got replaced 975 ); 976 977=cut 978 979sub Safety { 980 my ( $Self, %Param ) = @_; 981 982 # check needed stuff 983 for (qw(String)) { 984 if ( !defined $Param{$_} ) { 985 $Kernel::OM->Get('Kernel::System::Log')->Log( 986 Priority => 'error', 987 Message => "Need $_!" 988 ); 989 return; 990 } 991 } 992 993 my $String = $Param{String} || ''; 994 995 # check ref 996 my $StringScalar; 997 if ( !ref $String ) { 998 $StringScalar = $String; 999 $String = \$StringScalar; 1000 } 1001 1002 my %Safety; 1003 1004 my $Replaced; 1005 1006 # In UTF-7, < and > can be encoded to mask them from security filters like this one. 1007 my $TagStart = '(?:<|[+]ADw-)'; 1008 my $TagEnd = '(?:>|[+]AD4-)'; 1009 1010 # This can also be entity-encoded to hide it from the parser. 1011 # Browsers seem to tolerate an omitted ";". 1012 my $JavaScriptPrefixRegex = ' 1013 (?: j | &\#106[;]? | &\#x6a[;]? ) 1014 (?: a | &\#97[;]? | &\#x61[;]? ) 1015 (?: v | &\#118[;]? | &\#x76[;]? ) 1016 (?: a | &\#97[;]? | &\#x61[;]? ) 1017 (?: s | &\#115[;]? | &\#x73[;]? ) 1018 (?: c | &\#99[;]? | &\#x63[;]? ) 1019 (?: r | &\#114[;]? | &\#x72[;]? ) 1020 (?: i | &\#105[;]? | &\#x69[;]? ) 1021 (?: p | &\#112[;]? | &\#x70[;]? ) 1022 (?: t | &\#116[;]? | &\#x74[;]? ) 1023 '; 1024 1025 my $ExpressionPrefixRegex = ' 1026 (?: e | &\#101[;]? | &\#x65[;]? ) 1027 (?: x | &\#120[;]? | &\#x78[;]? ) 1028 (?: p | &\#112[;]? | &\#x70[;]? ) 1029 (?: r | &\#114[;]? | &\#x72[;]? ) 1030 (?: e | &\#101[;]? | &\#x65[;]? ) 1031 (?: s | &\#115[;]? | &\#x73[;]? ) 1032 (?: s | &\#115[;]? | &\#x73[;]? ) 1033 (?: i | &\#105[;]? | &\#x69[;]? ) 1034 (?: o | &\#111[;]? | &\#x6f[;]? ) 1035 (?: n | &\#110[;]? | &\#x6e[;]? ) 1036 '; 1037 1038 # Replace as many times as it is needed to avoid nesting tag attacks. 1039 do { 1040 $Replaced = undef; 1041 1042 # remove script tags 1043 if ( $Param{NoJavaScript} ) { 1044 $Replaced += ${$String} =~ s{ 1045 $TagStart script.*? $TagEnd .*? $TagStart /script \s* $TagEnd 1046 } 1047 {}sgxim; 1048 $Replaced += ${$String} =~ s{ 1049 $TagStart script.*? $TagEnd .+? ($TagStart|$TagEnd) 1050 } 1051 {}sgxim; 1052 1053 # remove style/javascript parts 1054 $Replaced += ${$String} =~ s{ 1055 $TagStart style[^>]+? $JavaScriptPrefixRegex (.+?|) $TagEnd (.*?) $TagStart /style \s* $TagEnd 1056 } 1057 {}sgxim; 1058 1059 # remove MS CSS expressions (JavaScript embedded in CSS) 1060 ${$String} =~ s{ 1061 ($TagStart style[^>]+? $TagEnd .*? $TagStart /style \s* $TagEnd) 1062 } 1063 { 1064 if ( index($1, 'expression(' ) > -1 ) { 1065 $Replaced = 1; 1066 ''; 1067 } 1068 else { 1069 $1; 1070 } 1071 }egsxim; 1072 } 1073 1074 # remove HTTP redirects 1075 $Replaced += ${$String} =~ s{ 1076 $TagStart meta [^>]+? http-equiv=('|"|)refresh [^>]+? $TagEnd 1077 } 1078 {}sgxim; 1079 1080 my $ReplacementStr = $Param{ReplacementStr} // ''; 1081 1082 # remove <applet> tags 1083 if ( $Param{NoApplet} ) { 1084 $Replaced += ${$String} =~ s{ 1085 $TagStart applet.*? $TagEnd (.*?) $TagStart /applet \s* $TagEnd 1086 } 1087 {$ReplacementStr}sgxim; 1088 } 1089 1090 # remove <Object> tags 1091 if ( $Param{NoObject} ) { 1092 $Replaced += ${$String} =~ s{ 1093 $TagStart object.*? $TagEnd (.*?) $TagStart /object \s* $TagEnd 1094 } 1095 {$ReplacementStr}sgxim; 1096 } 1097 1098 # remove <svg> tags 1099 if ( $Param{NoSVG} ) { 1100 $Replaced += ${$String} =~ s{ 1101 $TagStart svg.*? $TagEnd (.*?) $TagStart /svg \s* $TagEnd 1102 } 1103 {$ReplacementStr}sgxim; 1104 } 1105 1106 # remove <img> tags 1107 if ( $Param{NoImg} ) { 1108 $Replaced += ${$String} =~ s{ 1109 $TagStart img.*? (.*?) \s* $TagEnd 1110 } 1111 {$ReplacementStr}sgxim; 1112 } 1113 1114 # remove <embed> tags 1115 if ( $Param{NoEmbed} ) { 1116 $Replaced += ${$String} =~ s{ 1117 $TagStart embed.*? $TagEnd 1118 } 1119 {$ReplacementStr}sgxim; 1120 } 1121 1122 # check each html tag 1123 ${$String} =~ s{ 1124 ($TagStart.+?$TagEnd) 1125 } 1126 { 1127 my $Tag = $1; 1128 if ($Param{NoJavaScript}) { 1129 1130 # remove on action attributes 1131 $Replaced += $Tag =~ s{ 1132 (?:\s|/) on[a-z]+\s*=("[^"]+"|'[^']+'|.+?)($TagEnd|\s) 1133 } 1134 {$2}sgxim; 1135 1136 # remove javascript in a href links or src links 1137 $Replaced += $Tag =~ s{ 1138 ((?:\s|;|/)(?:background|url|src|href)=) 1139 ('|"|) # delimiter, can be empty 1140 (?:\s* $JavaScriptPrefixRegex .*?) # javascript, followed by anything but the delimiter 1141 \2 # delimiter again 1142 (\s|$TagEnd) 1143 } 1144 { 1145 "$1\"\"$3"; 1146 }sgxime; 1147 1148 # remove link javascript tags 1149 $Replaced += $Tag =~ s{ 1150 ($TagStart link .+? $JavaScriptPrefixRegex (.+?|) $TagEnd) 1151 } 1152 {}sgxim; 1153 1154 # remove MS CSS expressions (JavaScript embedded in CSS) 1155 $Replaced += $Tag =~ s{ 1156 \sstyle=("|')[^\1]*? $ExpressionPrefixRegex [(].*?\1($TagEnd|\s) 1157 } 1158 { 1159 $2; 1160 }egsxim; 1161 } 1162 1163 # Remove malicious CSS content 1164 $Tag =~ s{ 1165 (\s)style=("|') (.*?) \2 1166 } 1167 { 1168 my ($Space, $Delimiter, $Content) = ($1, $2, $3); 1169 1170 if ( 1171 ($Param{NoIntSrcLoad} && $Content =~ m{url\(}) 1172 || ($Param{NoExtSrcLoad} && $Content =~ m/(http|ftp|https):\//i)) { 1173 $Replaced = 1; 1174 ''; 1175 } 1176 else { 1177 "${Space}style=${Delimiter}${Content}${Delimiter}"; 1178 } 1179 }egsxim; 1180 1181 # remove load tags 1182 if ($Param{NoIntSrcLoad} || $Param{NoExtSrcLoad}) { 1183 $Tag =~ s{ 1184 ($TagStart (.+?) (?: \s | /) (?:src|poster)=(.+?) (\s.+?|) $TagEnd) 1185 } 1186 { 1187 my $URL = $3; 1188 if ($Param{NoIntSrcLoad} || ($Param{NoExtSrcLoad} && $URL =~ /(http|ftp|https):\//i)) { 1189 $Replaced = 1; 1190 ''; 1191 } 1192 else { 1193 $1; 1194 } 1195 }segxim; 1196 } 1197 1198 # replace original tag with clean tag 1199 $Tag; 1200 }segxim; 1201 1202 $Safety{Replace} += $Replaced; 1203 1204 } while ($Replaced); ## no critic 1205 1206 # check ref && return result like called 1207 if ( defined $StringScalar ) { 1208 $Safety{String} = ${$String}; 1209 } 1210 else { 1211 $Safety{String} = $String; 1212 } 1213 return %Safety; 1214} 1215 1216=head2 EmbeddedImagesExtract() 1217 1218extracts embedded images with data-URLs from an HTML document. 1219 1220 $HTMLUtilsObject->EmbeddedImagesExtract( 1221 DocumentRef => \$Body, 1222 AttachmentsRef => \@Attachments, 1223 ); 1224 1225Returns nothing. If embedded images were found, these will be appended 1226to the attachments list, and the image data URL will be replaced with a 1227C<cid:> URL in the document. 1228 1229=cut 1230 1231sub EmbeddedImagesExtract { 1232 my ( $Self, %Param ) = @_; 1233 1234 if ( ref $Param{DocumentRef} ne 'SCALAR' || !defined ${ $Param{DocumentRef} } ) { 1235 $Kernel::OM->Get('Kernel::System::Log')->Log( 1236 Priority => 'error', 1237 Message => "Need DocumentRef!" 1238 ); 1239 return; 1240 } 1241 if ( ref $Param{AttachmentsRef} ne 'ARRAY' ) { 1242 $Kernel::OM->Get('Kernel::System::Log')->Log( 1243 Priority => 'error', 1244 Message => "Need AttachmentsRef!" 1245 ); 1246 return; 1247 } 1248 1249 my $FQDN = $Kernel::OM->Get('Kernel::Config')->Get('FQDN'); 1250 ${ $Param{DocumentRef} } =~ s{(src=")(data:image/)(png|gif|jpg|jpeg|bmp)(;base64,)(.+?)(")}{ 1251 1252 my $Base64String = $5; 1253 1254 my $FileName = 'pasted-' . time() . '-' . int(rand(1000000)) . '.' . $3; 1255 my $ContentType = "image/$3; name=\"$FileName\""; 1256 my $ContentID = 'pasted.' . time() . '.' . int(rand(1000000)) . '@' . $FQDN; 1257 1258 my $AttachmentData = { 1259 Content => decode_base64($Base64String), 1260 ContentType => $ContentType, 1261 ContentID => $ContentID, 1262 Filename => $FileName, 1263 Disposition => 'inline', 1264 }; 1265 push @{$Param{AttachmentsRef}}, $AttachmentData; 1266 1267 # compose new image tag 1268 $1 . "cid:$ContentID" . $6 1269 1270 }egxi; 1271 1272 return 1; 1273} 1274 12751; 1276 1277=head1 TERMS AND CONDITIONS 1278 1279This software is part of the OTRS project (L<https://otrs.org/>). 1280 1281This software comes with ABSOLUTELY NO WARRANTY. For details, see 1282the enclosed file COPYING for license information (GPL). If you 1283did not receive this file, see L<https://www.gnu.org/licenses/gpl-3.0.txt>. 1284 1285=cut 1286