1# --
2# Copyright (C) 2001-2020 OTRS AG, https://otrs.com/
3# --
4# This software comes with ABSOLUTELY NO WARRANTY. For details, see
5# the enclosed file COPYING for license information (GPL). If you
6# did not receive this file, see https://www.gnu.org/licenses/gpl-3.0.txt.
7# --
8
9package Kernel::System::HTMLUtils;
10
11use strict;
12use warnings;
13
14use utf8;
15
16use MIME::Base64;
17
18our @ObjectDependencies = (
19    'Kernel::Config',
20    'Kernel::System::Encode',
21    'Kernel::System::Log',
22);
23
24=head1 NAME
25
26Kernel::System::HTMLUtils - creating and modifying html strings
27
28=head1 DESCRIPTION
29
30A module for creating and modifying html strings.
31
32=head1 PUBLIC INTERFACE
33
34=head2 new()
35
36Don't use the constructor directly, use the ObjectManager instead:
37
38    my $HTMLUtilsObject = $Kernel::OM->Get('Kernel::System::HTMLUtils');
39
40=cut
41
42sub new {
43    my ( $Type, %Param ) = @_;
44
45    # allocate new hash for object
46    my $Self = {};
47    bless( $Self, $Type );
48
49    # get debug level from parent
50    $Self->{Debug} = $Param{Debug} || 0;
51
52    return $Self;
53}
54
55=head2 ToAscii()
56
57convert an HTML string to an ASCII string
58
59    my $Ascii = $HTMLUtilsObject->ToAscii( String => $String );
60
61=cut
62
63sub ToAscii {
64    my ( $Self, %Param ) = @_;
65
66    # check needed stuff
67    for (qw(String)) {
68        if ( !defined $Param{$_} ) {
69            $Kernel::OM->Get('Kernel::System::Log')->Log(
70                Priority => 'error',
71                Message  => "Need $_!"
72            );
73            return;
74        }
75    }
76
77    # make sure to flag the input string as unicode (utf8) because replacements below can
78    # introduce unicode encoded characters (see bug#10970, bug#11596 and bug#12097 for more info)
79    $Kernel::OM->Get('Kernel::System::Encode')->EncodeInput( \$Param{String} );
80
81    # get length of line for forcing line breakes
82    my $LineLength = $Kernel::OM->Get('Kernel::Config')->Get('Ticket::Frontend::TextAreaNote') || 78;
83
84    # find <a href=....> and replace it with [x]
85    my $LinkList = '';
86    my $Counter  = 0;
87    $Param{String} =~ s{
88        <a\s.*?href=("|')(.+?)("|').*?>
89    }
90    {
91        my $Link = $2;
92        $Counter++;
93        $LinkList .= "[$Counter] $Link\n";
94        "[$Counter]";
95    }egxi;
96
97    # pre-process <blockquote> and <div style=\"cite\"
98    my %Cite;
99    $Counter = 0;
100    $Param{String} =~ s{
101        <blockquote(.*?)>(.+?)</blockquote>
102    }
103    {
104        my $Ascii = $Self->ToAscii(
105            String => $2,
106        );
107        # force line breaking
108        if ( length $Ascii > $LineLength ) {
109            $Ascii =~ s/(.{4,$LineLength})(?:\s|\z)/$1\n/gm;
110        }
111        $Ascii =~ s/^(.*?)$/> $1/gm;
112        $Counter++;
113        my $Key     = "######Cite::$Counter######";
114        $Cite{$Key} = $Ascii;
115        $Key;
116    }segxmi;
117    $Param{String} =~ s{
118        <div.+?type="cite"[^>]*>(.+?)</div>
119    }
120    {
121        my $Ascii = $Self->ToAscii(
122            String => $1,
123        );
124        # force line breaking
125        if ( length $Ascii > $LineLength ) {
126            $Ascii =~ s/(.{4,$LineLength})(?:\s|\z)/$1\n/gm;
127        }
128        $Ascii =~ s/^(.*?)$/> $1/gm;
129        $Counter++;
130        my $Key     = "######Cite::$Counter######";
131        $Cite{$Key} = $Ascii;
132        $Key;
133    }segxmi;
134
135    # remember <pre> and <code> tags
136    my %One2One;
137    $Counter = 0;
138    $Param{String} =~ s{
139        <(pre|code)(.*?)>(.+?)</(pre|code)(.*?)>
140    }
141    {
142        my $Content = $3;
143        $Counter++;
144        my $Key        = "######One2One::$Counter######";
145        $One2One{$Key} = $Content;
146        $Key;
147    }segxmi;
148
149    # remove comments at the first place to avoid to much work
150    # for the regex engine
151    $Param{String} =~ s{<!-- .*? -->}{}xmgsi;
152
153    # remove empty lines
154    $Param{String} =~ s/^\s*//mg;
155
156    # fix some bad stuff from opera and others
157    $Param{String} =~ s/(\n\r|\r\r\n|\r\n)/\n/gs;
158
159    # remove new line after <br>
160    $Param{String} =~ s/(\<br(\s{0,3}|\s{1,3}.+?)(\/|)\>)(\n|\r)/$1/gsi;
161
162    # replace new lines with one space
163    $Param{String} =~ s/\n/ /gs;
164    $Param{String} =~ s/\r/ /gs;
165
166    # remove style tags
167    $Param{String} =~ s{<style [^>]*? />}{}xgsi;
168    $Param{String} =~ s{<style [^>]*? > .*? </style[^>]*>}{}xgsi;
169
170    # remove <br>,<br/>,<br />, <br class="name"/>, tags and replace it with \n
171    $Param{String} =~ s/\<br(\s{0,3}|\s{1,3}.+?)(\/|)\>/\n/gsi;
172
173    # remove </div> tags and replace it with \n
174    $Param{String} =~ s/<\/(\s{0,3})div>/\n/gsi;
175
176    # remove hr tags and replace it with \n
177    $Param{String} =~ s/\<(hr|hr.+?)\>/\n\n/gsi;
178
179    # remove p, table tags and replace it with \n
180    $Param{String} =~ s/\<(\/|)(p|p.+?|table|table.+?)\>/\n\n/gsi;
181
182    # remove opening tr, th tags and replace them with \n
183    $Param{String} =~ s/\<(tr|tr.+?|th|th.+?)\>/\n\n/gsi;
184
185    # convert li tags to \n -
186    $Param{String} =~ s/\<li\>/\n - /gsi;
187
188    # convert </ul> and </ol> tags to \n\n
189    $Param{String} =~ s/\<\/(ul|ol)\>/\n\n/gsi;
190
191    # remove </td> tags and replace them with " "
192    $Param{String} =~ s/<\/td[^>]*>/ /gsi;
193
194    # replace multiple spaces with just one space
195    $Param{String} =~ s/[ ]{2,}/ /mg;
196
197    # remember <pre> and <code> tags and replace it
198    for my $Key ( sort keys %One2One ) {
199        $Param{String} =~ s/$Key/\n\n\n$One2One{$Key}\n\n/g;
200    }
201
202    # strip all other tags
203    $Param{String} =~ s/\<.+?\>//gs;
204
205    # html encode based on cpan's HTML::Entities v1.35
206    my %Entity = (
207
208        # Some normal chars that have special meaning in SGML context
209        amp  => '&',    # ampersand
210        'gt' => '>',    # greater than
211        'lt' => '<',    # less than
212        quot => '"',    # double quote
213        apos => "'",    # single quote
214
215        # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
216        AElig  => chr(198),    # capital AE diphthong (ligature)
217        Aacute => chr(193),    # capital A, acute accent
218        Acirc  => chr(194),    # capital A, circumflex accent
219        Agrave => chr(192),    # capital A, grave accent
220        Aring  => chr(197),    # capital A, ring
221        Atilde => chr(195),    # capital A, tilde
222        Auml   => chr(196),    # capital A, dieresis or umlaut mark
223        Ccedil => chr(199),    # capital C, cedilla
224        ETH    => chr(208),    # capital Eth, Icelandic
225        Eacute => chr(201),    # capital E, acute accent
226        Ecirc  => chr(202),    # capital E, circumflex accent
227        Egrave => chr(200),    # capital E, grave accent
228        Euml   => chr(203),    # capital E, dieresis or umlaut mark
229        Iacute => chr(205),    # capital I, acute accent
230        Icirc  => chr(206),    # capital I, circumflex accent
231        Igrave => chr(204),    # capital I, grave accent
232        Iuml   => chr(207),    # capital I, dieresis or umlaut mark
233        Ntilde => chr(209),    # capital N, tilde
234        Oacute => chr(211),    # capital O, acute accent
235        Ocirc  => chr(212),    # capital O, circumflex accent
236        Ograve => chr(210),    # capital O, grave accent
237        Oslash => chr(216),    # capital O, slash
238        Otilde => chr(213),    # capital O, tilde
239        Ouml   => chr(214),    # capital O, dieresis or umlaut mark
240        THORN  => chr(222),    # capital THORN, Icelandic
241        Uacute => chr(218),    # capital U, acute accent
242        Ucirc  => chr(219),    # capital U, circumflex accent
243        Ugrave => chr(217),    # capital U, grave accent
244        Uuml   => chr(220),    # capital U, dieresis or umlaut mark
245        Yacute => chr(221),    # capital Y, acute accent
246        aacute => chr(225),    # small a, acute accent
247        acirc  => chr(226),    # small a, circumflex accent
248        aelig  => chr(230),    # small ae diphthong (ligature)
249        agrave => chr(224),    # small a, grave accent
250        aring  => chr(229),    # small a, ring
251        atilde => chr(227),    # small a, tilde
252        auml   => chr(228),    # small a, dieresis or umlaut mark
253        ccedil => chr(231),    # small c, cedilla
254        eacute => chr(233),    # small e, acute accent
255        ecirc  => chr(234),    # small e, circumflex accent
256        egrave => chr(232),    # small e, grave accent
257        eth    => chr(240),    # small eth, Icelandic
258        euml   => chr(235),    # small e, dieresis or umlaut mark
259        iacute => chr(237),    # small i, acute accent
260        icirc  => chr(238),    # small i, circumflex accent
261        igrave => chr(236),    # small i, grave accent
262        iuml   => chr(239),    # small i, dieresis or umlaut mark
263        ntilde => chr(241),    # small n, tilde
264        oacute => chr(243),    # small o, acute accent
265        ocirc  => chr(244),    # small o, circumflex accent
266        ograve => chr(242),    # small o, grave accent
267        oslash => chr(248),    # small o, slash
268        otilde => chr(245),    # small o, tilde
269        ouml   => chr(246),    # small o, dieresis or umlaut mark
270        szlig  => chr(223),    # small sharp s, German (sz ligature)
271        thorn  => chr(254),    # small thorn, Icelandic
272        uacute => chr(250),    # small u, acute accent
273        ucirc  => chr(251),    # small u, circumflex accent
274        ugrave => chr(249),    # small u, grave accent
275        uuml   => chr(252),    # small u, dieresis or umlaut mark
276        yacute => chr(253),    # small y, acute accent
277        yuml   => chr(255),    # small y, dieresis or umlaut mark
278
279        # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
280        copy => chr(169),      # copyright sign
281        reg  => chr(174),      # registered sign
282        nbsp => chr(160),      # non breaking space
283
284        # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
285        iexcl   => chr(161),
286        cent    => chr(162),
287        pound   => chr(163),
288        curren  => chr(164),
289        yen     => chr(165),
290        brvbar  => chr(166),
291        sect    => chr(167),
292        uml     => chr(168),
293        ordf    => chr(170),
294        laquo   => chr(171),
295        'not'   => chr(172),    # not is a keyword in perl
296        shy     => chr(173),
297        macr    => chr(175),
298        deg     => chr(176),
299        plusmn  => chr(177),
300        sup1    => chr(185),
301        sup2    => chr(178),
302        sup3    => chr(179),
303        acute   => chr(180),
304        micro   => chr(181),
305        para    => chr(182),
306        middot  => chr(183),
307        cedil   => chr(184),
308        ordm    => chr(186),
309        raquo   => chr(187),
310        frac14  => chr(188),
311        frac12  => chr(189),
312        frac34  => chr(190),
313        iquest  => chr(191),
314        'times' => chr(215),    # times is a keyword in perl
315        divide  => chr(247),
316
317        (
318            $] > 5.007
319            ? (
320                OElig    => chr(338),
321                oelig    => chr(339),
322                Scaron   => chr(352),
323                scaron   => chr(353),
324                Yuml     => chr(376),
325                fnof     => chr(402),
326                circ     => chr(710),
327                tilde    => chr(732),
328                Alpha    => chr(913),
329                Beta     => chr(914),
330                Gamma    => chr(915),
331                Delta    => chr(916),
332                Epsilon  => chr(917),
333                Zeta     => chr(918),
334                Eta      => chr(919),
335                Theta    => chr(920),
336                Iota     => chr(921),
337                Kappa    => chr(922),
338                Lambda   => chr(923),
339                Mu       => chr(924),
340                Nu       => chr(925),
341                Xi       => chr(926),
342                Omicron  => chr(927),
343                Pi       => chr(928),
344                Rho      => chr(929),
345                Sigma    => chr(931),
346                Tau      => chr(932),
347                Upsilon  => chr(933),
348                Phi      => chr(934),
349                Chi      => chr(935),
350                Psi      => chr(936),
351                Omega    => chr(937),
352                alpha    => chr(945),
353                beta     => chr(946),
354                gamma    => chr(947),
355                delta    => chr(948),
356                epsilon  => chr(949),
357                zeta     => chr(950),
358                eta      => chr(951),
359                theta    => chr(952),
360                iota     => chr(953),
361                kappa    => chr(954),
362                lambda   => chr(955),
363                mu       => chr(956),
364                nu       => chr(957),
365                xi       => chr(958),
366                omicron  => chr(959),
367                pi       => chr(960),
368                rho      => chr(961),
369                sigmaf   => chr(962),
370                sigma    => chr(963),
371                tau      => chr(964),
372                upsilon  => chr(965),
373                phi      => chr(966),
374                chi      => chr(967),
375                psi      => chr(968),
376                omega    => chr(969),
377                thetasym => chr(977),
378                upsih    => chr(978),
379                piv      => chr(982),
380                ensp     => chr(8194),
381                emsp     => chr(8195),
382                thinsp   => chr(8201),
383                zwnj     => chr(8204),
384                zwj      => chr(8205),
385                lrm      => chr(8206),
386                rlm      => chr(8207),
387                ndash    => chr(8211),
388                mdash    => chr(8212),
389                lsquo    => chr(8216),
390                rsquo    => chr(8217),
391                sbquo    => chr(8218),
392                ldquo    => chr(8220),
393                rdquo    => chr(8221),
394                bdquo    => chr(8222),
395                dagger   => chr(8224),
396                Dagger   => chr(8225),
397                bull     => chr(8226),
398                hellip   => chr(8230),
399                permil   => chr(8240),
400                prime    => chr(8242),
401                Prime    => chr(8243),
402                lsaquo   => chr(8249),
403                rsaquo   => chr(8250),
404                oline    => chr(8254),
405                frasl    => chr(8260),
406                euro     => chr(8364),
407                image    => chr(8465),
408                weierp   => chr(8472),
409                real     => chr(8476),
410                trade    => chr(8482),
411                alefsym  => chr(8501),
412                larr     => chr(8592),
413                uarr     => chr(8593),
414                rarr     => chr(8594),
415                darr     => chr(8595),
416                harr     => chr(8596),
417                crarr    => chr(8629),
418                lArr     => chr(8656),
419                uArr     => chr(8657),
420                rArr     => chr(8658),
421                dArr     => chr(8659),
422                hArr     => chr(8660),
423                forall   => chr(8704),
424                part     => chr(8706),
425                exist    => chr(8707),
426                empty    => chr(8709),
427                nabla    => chr(8711),
428                isin     => chr(8712),
429                notin    => chr(8713),
430                ni       => chr(8715),
431                prod     => chr(8719),
432                sum      => chr(8721),
433                minus    => chr(8722),
434                lowast   => chr(8727),
435                radic    => chr(8730),
436                prop     => chr(8733),
437                infin    => chr(8734),
438                ang      => chr(8736),
439                'and'    => chr(8743),
440                'or'     => chr(8744),
441                cap      => chr(8745),
442                cup      => chr(8746),
443                'int'    => chr(8747),
444                there4   => chr(8756),
445                sim      => chr(8764),
446                cong     => chr(8773),
447                asymp    => chr(8776),
448                'ne'     => chr(8800),
449                equiv    => chr(8801),
450                'le'     => chr(8804),
451                'ge'     => chr(8805),
452                'sub'    => chr(8834),
453                sup      => chr(8835),
454                nsub     => chr(8836),
455                sube     => chr(8838),
456                supe     => chr(8839),
457                oplus    => chr(8853),
458                otimes   => chr(8855),
459                perp     => chr(8869),
460                sdot     => chr(8901),
461                lceil    => chr(8968),
462                rceil    => chr(8969),
463                lfloor   => chr(8970),
464                rfloor   => chr(8971),
465                lang     => chr(9001),
466                rang     => chr(9002),
467                loz      => chr(9674),
468                spades   => chr(9824),
469                clubs    => chr(9827),
470                hearts   => chr(9829),
471                diams    => chr(9830),
472                )
473            : ()
474        )
475    );
476
477    # encode html entities like "&#8211;"
478    $Param{String} =~ s{
479        (&\#(\d+);?)
480    }
481    {
482        my $ChrOrig = $1;
483        my $Dec = $2;
484
485        # Don't process UTF-16 surrogate pairs. Used on their own, these are not valid UTF-8 code
486        # points and can result in errors in old Perl versions. See bug#12588 for more information.
487        # - High Surrogate codes (U+D800-U+DBFF)
488        # - Low Surrogate codes (U+DC00-U+DFFF)
489        if ( $Dec >= 55296 && $Dec <= 57343 ) {
490            $ChrOrig;
491        }
492        else {
493            my $Chr = chr($Dec);
494
495            # Make sure we get valid UTF8 code points, but skip characters from 128 to 255
496            #   (inclusive), since they are by default internally not encoded as UTF-8 for
497            #   backward compatibility reasons. See bug#12457 for more information.
498            if ( $Dec < 128 || $Dec> 255 ) {
499                Encode::_utf8_off($Chr);
500                $Chr = Encode::decode('utf-8', $Chr, 0);
501            }
502
503            if ( $Chr ) {
504                $Chr;
505            }
506            else {
507                $ChrOrig;
508            }
509        }
510    }egx;
511
512    # encode html entities like "&#x3d;"
513    $Param{String} =~ s{
514        (&\#[xX]([0-9a-fA-F]+);?)
515    }
516    {
517        my $ChrOrig = $1;
518        my $Dec = hex( $2 );
519
520        # Don't process UTF-16 surrogate pairs. Used on their own, these are not valid UTF-8 code
521        # points and can result in errors in old Perl versions. See bug#12588 for more information.
522        # - High Surrogate codes (U+D800-U+DBFF)
523        # - Low Surrogate codes (U+DC00-U+DFFF)
524        if ( $Dec >= 55296 && $Dec <= 57343 ) {
525            $ChrOrig;
526        }
527        else {
528            if ( $Dec ) {
529                my $Chr = chr( $Dec );
530
531                # Make sure we get valid UTF8 code points, but skip characters from 128 to 255
532                #   (inclusive), since they are by default internally not encoded as UTF-8 for
533                #   backward compatibility reasons. See bug#12457 for more information.
534                if ( $Dec < 128 || $Dec > 255 ) {
535                    Encode::_utf8_off($Chr);
536                    $Chr = Encode::decode('utf-8', $Chr, 0);
537                }
538
539                if ( $Chr ) {
540                    $Chr;
541                }
542                else {
543                    $ChrOrig;
544                }
545            }
546            else {
547                $ChrOrig;
548            }
549        }
550    }egx;
551
552    # encode html entities like "&amp;"
553    $Param{String} =~ s{
554        (&(\w+);?)
555    }
556    {
557        if ( $Entity{$2} ) {
558            $Entity{$2};
559        }
560        else {
561            $1;
562        }
563    }egx;
564
565    # remove empty lines
566    $Param{String} =~ s/^\s*\n\s*\n/\n/mg;
567
568    # force line breaking
569    if ( length $Param{String} > $LineLength ) {
570        $Param{String} =~ s/(.{4,$LineLength})(?:\s|\z)/$1\n/gm;
571    }
572
573    # remember <blockquote> and <div style=\"cite\"
574    for my $Key ( sort keys %Cite ) {
575        $Param{String} =~ s/$Key/$Cite{$Key}\n/g;
576    }
577
578    # add extracted links
579    if ($LinkList) {
580        $Param{String} .= "\n\n" . $LinkList;
581    }
582
583    return $Param{String};
584}
585
586=head2 ToHTML()
587
588convert an ASCII string to an HTML string
589
590    my $HTMLString = $HTMLUtilsObject->ToHTML(
591        String             => $String,
592        ReplaceDoubleSpace => 0,        # replace &nbsp;&nbsp; with "  ", optional 1 or 0 (defaults to 1)
593    );
594
595=cut
596
597sub ToHTML {
598    my ( $Self, %Param ) = @_;
599
600    # check needed stuff
601    for (qw(String)) {
602        if ( !defined $Param{$_} ) {
603            $Kernel::OM->Get('Kernel::System::Log')->Log(
604                Priority => 'error',
605                Message  => "Need $_!"
606            );
607            return;
608        }
609    }
610
611    # fix some bad stuff from opera and others
612    $Param{String} =~ s/(\n\r|\r\r\n|\r\n)/\n/gs;
613
614    $Param{String} =~ s/&/&amp;/g;
615    $Param{String} =~ s/</&lt;/g;
616    $Param{String} =~ s/>/&gt;/g;
617    $Param{String} =~ s/"/&quot;/g;
618    $Param{String} =~ s/(\n|\r)/<br\/>\n/g;
619    $Param{String} =~ s/  /&nbsp;&nbsp;/g if $Param{ReplaceDoubleSpace};
620
621    return $Param{String};
622}
623
624=head2 DocumentComplete()
625
626check and e. g. add <html> and <body> tags to given html string
627
628    my $HTMLDocument = $HTMLUtilsObject->DocumentComplete(
629        String  => $String,
630        Charset => $Charset,
631    );
632
633=cut
634
635sub DocumentComplete {
636    my ( $Self, %Param ) = @_;
637
638    # check needed stuff
639    for (qw(String Charset)) {
640        if ( !defined $Param{$_} ) {
641            $Kernel::OM->Get('Kernel::System::Log')->Log(
642                Priority => 'error',
643                Message  => "Need $_!"
644            );
645            return;
646        }
647    }
648
649    return $Param{String} if $Param{String} =~ /<html>/i;
650
651    my $Css = $Kernel::OM->Get('Kernel::Config')->Get('Frontend::RichText::DefaultCSS')
652        || 'font-size: 12px; font-family:Courier,monospace,fixed;';
653
654    # escape special characters like double-quotes, e.g. used in font names with spaces
655    $Css = $Self->ToHTML( String => $Css );
656
657    # Use the HTML5 doctype because it is compatible with HTML4 and causes the browsers
658    #   to render the content in standards mode, which is more safe than quirks mode.
659    my $Body = '<!DOCTYPE html><html><head>';
660    $Body
661        .= '<meta http-equiv="Content-Type" content="text/html; charset=' . $Param{Charset} . '"/>';
662    $Body .= '</head><body style="' . $Css . '">' . $Param{String} . '</body></html>';
663    return $Body;
664}
665
666=head2 DocumentStrip()
667
668remove html document tags from string
669
670    my $HTMLString = $HTMLUtilsObject->DocumentStrip(
671        String  => $String,
672    );
673
674=cut
675
676sub DocumentStrip {
677    my ( $Self, %Param ) = @_;
678
679    # check needed stuff
680    for (qw(String)) {
681        if ( !defined $Param{$_} ) {
682            $Kernel::OM->Get('Kernel::System::Log')->Log(
683                Priority => 'error',
684                Message  => "Need $_!"
685            );
686            return;
687        }
688    }
689
690    $Param{String} =~ s/^<\!DOCTYPE\s+HTML.+?>//gsi;
691    $Param{String} =~ s/<head>.+?<\/head>//gsi;
692    $Param{String} =~ s/<(html|body)(.*?)>//gsi;
693    $Param{String} =~ s/<\/(html|body)>//gsi;
694
695    return $Param{String};
696}
697
698=head2 DocumentCleanup()
699
700perform some sanity checks on HTML content.
701
702 -  Replace MS Word 12 <p|div> with class "MsoNormal" by using <br/> because
703    it's not used as <p><div> (margin:0cm; margin-bottom:.0001pt;).
704
705 -  Replace <blockquote> by using
706    "<div style="border:none;border-left:solid blue 1.5pt;padding:0cm 0cm 0cm 4.0pt" type="cite">"
707    because of cross mail client and browser compatibility.
708
709 -  If there is no HTML doctype present, inject the HTML5 doctype, because it is compatible with HTML4
710    and causes the browsers to render the content in standards mode, which is safer.
711
712    $HTMLBody = $HTMLUtilsObject->DocumentCleanup(
713        String => $HTMLBody,
714    );
715
716=cut
717
718sub DocumentCleanup {
719    my ( $Self, %Param ) = @_;
720
721    # check needed stuff
722    for (qw(String)) {
723        if ( !defined $Param{$_} ) {
724            $Kernel::OM->Get('Kernel::System::Log')->Log(
725                Priority => 'error',
726                Message  => "Need $_!"
727            );
728            return;
729        }
730    }
731
732    # If the string starts with <html> directly, inject the doctype
733    $Param{String} =~ s{ \A \s* <html }{<!DOCTYPE html><html}gsmix;
734
735    # remove <base> tags - see bug#8880
736    $Param{String} =~ s{<base .*?>}{}xmsi;
737
738    # replace MS Word 12 <p|div> with class "MsoNormal" by using <br/> because
739    # it's not used as <p><div> (margin:0cm; margin-bottom:.0001pt;)
740    $Param{String} =~ s{
741        <p\s{1,3}class=(|"|')MsoNormal(|"|')(.*?)>(.+?)</p>
742    }
743    {
744        $4 . '<br/>';
745    }segxmi;
746
747    $Param{String} =~ s{
748        <div\s{1,3}class=(|"|')MsoNormal(|"|')(.*?)>(.+?)</div>
749    }
750    {
751        $4 . '<br/>';
752    }segxmi;
753
754    # replace <blockquote> by using
755    # "<div style="border:none;border-left:solid blue 1.5pt;padding:0cm 0cm 0cm 4.0pt" type="cite">"
756    # because of cross mail client and browser compatability
757    my $Style = "border:none;border-left:solid blue 1.5pt;padding:0cm 0cm 0cm 4.0pt";
758    for ( 1 .. 10 ) {
759        $Param{String} =~ s{
760            <blockquote(.*?)>(.+?)</blockquote>
761        }
762        {
763            "<div $1 style=\"$Style\">$2</div>";
764        }segxmi;
765    }
766
767    return $Param{String};
768}
769
770=head2 LinkQuote()
771
772detect links in HTML code, add C<a href> if missing
773
774    my $HTMLWithLinks = $HTMLUtilsObject->LinkQuote(
775        String    => $HTMLString,
776        Target    => 'TargetName', # content of target="?", e. g. _blank
777        TargetAdd => 1,            # add target="_blank" to all existing "<a href"
778    );
779
780also string ref is possible
781
782    my $HTMLWithLinksRef = $HTMLUtilsObject->LinkQuote(
783        String => \$HTMLStringRef,
784    );
785
786=cut
787
788sub LinkQuote {
789    my ( $Self, %Param ) = @_;
790
791    my $String = $Param{String} || '';
792
793    # check ref
794    my $StringScalar;
795    if ( !ref $String ) {
796        $StringScalar = $String;
797        $String       = \$StringScalar;
798
799        # return if string is not a ref and it is empty
800        return $StringScalar if !$StringScalar;
801    }
802
803    # add target to already existing url of html string
804    if ( $Param{TargetAdd} ) {
805
806        # find target
807        my $Target = $Param{Target};
808        if ( !$Target ) {
809            $Target = '_blank';
810        }
811
812        # add target to existing "<a href"
813        ${$String} =~ s{
814            (<a\s{1,10})([^>]+)>
815        }
816        {
817            my $Start = $1;
818            my $Value = $2;
819            if ( $Value !~ /href=/i || $Value =~ /target=/i ) {
820                "$Start$Value>";
821            }
822            else {
823                "$Start$Value target=\"$Target\">";
824            }
825        }egxsi;
826    }
827
828    my $Marker = "§" x 10;
829
830    # Remove existing <a>...</a> tags and their content to be re-inserted later, this must not be quoted.
831    # Also remove other tags to avoid quoting in tag parameters.
832    my $Counter = 0;
833    my %TagHash;
834    ${$String} =~ s{
835        (<a\s[^>]*?>[^>]*</a>|<[^>]+?>)
836    }
837    {
838        my $Content = $1;
839        my $Key     = "${Marker}TagHash-$Counter${Marker}";
840        $TagHash{$Counter++} = $Content;
841        $Key;
842    }egxism;
843
844    # Add <a> tags for URLs in the content.
845    my $Target = '';
846    if ( $Param{Target} ) {
847        $Target = " target=\"$Param{Target}\"";
848    }
849    ${$String} =~ s{
850        (                                          # $1 greater-than and less-than sign
851            > | < | \s+ | §{10} |
852            (?: &[a-zA-Z0-9]+; )                   # get html entities
853        )
854        (                                          # $2
855            (?:                                    # http or only www
856                (?: (?: http s? | ftp ) :\/\/) |   # http://,https:// and ftp://
857                (?: (?: www | ftp ) \.)            # www. and ftp.
858            )
859        )
860        (                                          # $3
861            (?: [a-z0-9\-]+ \. )*                  # get subdomains, optional
862            [a-z0-9\-]+                            # get top level domain
863            (?:                                    # optional port number
864                [:]
865                [0-9]+
866            )?
867            (?:                                    # file path element
868                [\/\.]
869                | [a-zA-Z0-9\-_=%]
870            )*
871            (?:                                    # param string
872                [\?]                               # if param string is there, "?" must be present
873                [a-zA-Z0-9&;=%\-_:\.\/]*           # param string content, this will also catch entities like &amp;
874            )?
875            (?:                                    # link hash string
876                [\#]                               #
877                [a-zA-Z0-9&;=%\-_:\.\/]*           # hash string content, this will also catch entities like &amp;
878            )?
879        )
880        (?=                                        # $4
881            (?:
882                [\?,;!\.\)] (?: \s | $ )           # \)\s this construct is because of bug# 2450
883                | \"
884                | \]
885                | \s+
886                | '
887                | >                               # greater-than and less-than sign
888                | <                               # "
889                | (?: &[a-zA-Z0-9]+; )+            # html entities
890                | $                                # bug# 2715
891            )
892            | §{10}                                # ending TagHash
893        )
894    }
895    {
896        my $Start    = $1;
897        my $Protocol = $2;
898        my $Link     = $3;
899        my $End      = $4 || '';
900
901        # there may different links for href and link body
902        my $HrefLink;
903        my $DisplayLink;
904
905        if ( $Protocol =~ m{\A ( http | https | ftp ) : \/ \/ }xi ) {
906            $DisplayLink = $Protocol . $Link;
907            $HrefLink    = $DisplayLink;
908        }
909        else {
910            if ($Protocol =~ m{\A ftp }smx ) {
911                $HrefLink = 'ftp://';
912            }
913            else {
914                $HrefLink = 'http://';
915            }
916
917            if ( $Protocol ) {
918                $HrefLink   .= $Protocol;
919                $DisplayLink = $Protocol;
920            }
921
922            $DisplayLink .= $Link;
923            $HrefLink    .= $Link;
924        }
925        $Start . "<a href=\"$HrefLink\"$Target title=\"$HrefLink\">$DisplayLink<\/a>" . $End;
926    }egxism;
927
928    # Re-add previously removed tags.
929    ${$String} =~ s{${Marker}TagHash-(\d+)${Marker}}{$TagHash{$1}}egsxim;
930
931    # check ref && return result like called
932    if ( defined $StringScalar ) {
933        return ${$String};
934    }
935    return $String;
936}
937
938=head2 Safety()
939
940To remove/strip active html tags/addons (javascript, C<applet>s, C<embed>s and C<object>s)
941from html strings.
942
943    my %Safe = $HTMLUtilsObject->Safety(
944        String         => $HTMLString,
945        NoApplet       => 1,
946        NoObject       => 1,
947        NoEmbed        => 1,
948        NoSVG          => 1,
949        NoImg          => 1,
950        NoIntSrcLoad   => 0,
951        NoExtSrcLoad   => 1,
952        NoJavaScript   => 1,
953        ReplacementStr => 'string',          # optional, string to show instead of applet, object, embed, svg and img tags
954    );
955
956also string ref is possible
957
958    my %Safe = $HTMLUtilsObject->Safety(
959        String       => \$HTMLStringRef,
960        NoApplet     => 1,
961        NoObject     => 1,
962        NoEmbed      => 1,
963        NoSVG        => 1,
964        NoImg        => 1,
965        NoIntSrcLoad => 0,
966        NoExtSrcLoad => 1,
967        NoJavaScript => 1,
968    );
969
970returns
971
972    my %Safe = (
973        String  => $HTMLString, # modified html string (scalar or ref)
974        Replace => 1,           # info if something got replaced
975    );
976
977=cut
978
979sub Safety {
980    my ( $Self, %Param ) = @_;
981
982    # check needed stuff
983    for (qw(String)) {
984        if ( !defined $Param{$_} ) {
985            $Kernel::OM->Get('Kernel::System::Log')->Log(
986                Priority => 'error',
987                Message  => "Need $_!"
988            );
989            return;
990        }
991    }
992
993    my $String = $Param{String} || '';
994
995    # check ref
996    my $StringScalar;
997    if ( !ref $String ) {
998        $StringScalar = $String;
999        $String       = \$StringScalar;
1000    }
1001
1002    my %Safety;
1003
1004    my $Replaced;
1005
1006    # In UTF-7, < and > can be encoded to mask them from security filters like this one.
1007    my $TagStart = '(?:<|[+]ADw-)';
1008    my $TagEnd   = '(?:>|[+]AD4-)';
1009
1010    # This can also be entity-encoded to hide it from the parser.
1011    #   Browsers seem to tolerate an omitted ";".
1012    my $JavaScriptPrefixRegex = '
1013        (?: j | &\#106[;]? | &\#x6a[;]? )
1014        (?: a | &\#97[;]?  | &\#x61[;]? )
1015        (?: v | &\#118[;]? | &\#x76[;]? )
1016        (?: a | &\#97[;]?  | &\#x61[;]? )
1017        (?: s | &\#115[;]? | &\#x73[;]? )
1018        (?: c | &\#99[;]?  | &\#x63[;]? )
1019        (?: r | &\#114[;]? | &\#x72[;]? )
1020        (?: i | &\#105[;]? | &\#x69[;]? )
1021        (?: p | &\#112[;]? | &\#x70[;]? )
1022        (?: t | &\#116[;]? | &\#x74[;]? )
1023    ';
1024
1025    my $ExpressionPrefixRegex = '
1026        (?: e | &\#101[;]? | &\#x65[;]? )
1027        (?: x | &\#120[;]? | &\#x78[;]? )
1028        (?: p | &\#112[;]? | &\#x70[;]? )
1029        (?: r | &\#114[;]? | &\#x72[;]? )
1030        (?: e | &\#101[;]? | &\#x65[;]? )
1031        (?: s | &\#115[;]? | &\#x73[;]? )
1032        (?: s | &\#115[;]? | &\#x73[;]? )
1033        (?: i | &\#105[;]? | &\#x69[;]? )
1034        (?: o | &\#111[;]? | &\#x6f[;]? )
1035        (?: n | &\#110[;]? | &\#x6e[;]? )
1036    ';
1037
1038    # Replace as many times as it is needed to avoid nesting tag attacks.
1039    do {
1040        $Replaced = undef;
1041
1042        # remove script tags
1043        if ( $Param{NoJavaScript} ) {
1044            $Replaced += ${$String} =~ s{
1045                $TagStart script.*? $TagEnd .*?  $TagStart /script \s* $TagEnd
1046            }
1047            {}sgxim;
1048            $Replaced += ${$String} =~ s{
1049                $TagStart script.*? $TagEnd .+? ($TagStart|$TagEnd)
1050            }
1051            {}sgxim;
1052
1053            # remove style/javascript parts
1054            $Replaced += ${$String} =~ s{
1055                $TagStart style[^>]+? $JavaScriptPrefixRegex (.+?|) $TagEnd (.*?) $TagStart /style \s* $TagEnd
1056            }
1057            {}sgxim;
1058
1059            # remove MS CSS expressions (JavaScript embedded in CSS)
1060            ${$String} =~ s{
1061                ($TagStart style[^>]+? $TagEnd .*? $TagStart /style \s* $TagEnd)
1062            }
1063            {
1064                if ( index($1, 'expression(' ) > -1 ) {
1065                    $Replaced = 1;
1066                    '';
1067                }
1068                else {
1069                    $1;
1070                }
1071            }egsxim;
1072        }
1073
1074        # remove HTTP redirects
1075        $Replaced += ${$String} =~ s{
1076            $TagStart meta [^>]+? http-equiv=('|"|)refresh [^>]+? $TagEnd
1077        }
1078        {}sgxim;
1079
1080        my $ReplacementStr = $Param{ReplacementStr} // '';
1081
1082        # remove <applet> tags
1083        if ( $Param{NoApplet} ) {
1084            $Replaced += ${$String} =~ s{
1085                $TagStart applet.*? $TagEnd (.*?) $TagStart /applet \s* $TagEnd
1086            }
1087            {$ReplacementStr}sgxim;
1088        }
1089
1090        # remove <Object> tags
1091        if ( $Param{NoObject} ) {
1092            $Replaced += ${$String} =~ s{
1093                $TagStart object.*? $TagEnd (.*?) $TagStart /object \s* $TagEnd
1094            }
1095            {$ReplacementStr}sgxim;
1096        }
1097
1098        # remove <svg> tags
1099        if ( $Param{NoSVG} ) {
1100            $Replaced += ${$String} =~ s{
1101                $TagStart svg.*? $TagEnd (.*?) $TagStart /svg \s* $TagEnd
1102            }
1103            {$ReplacementStr}sgxim;
1104        }
1105
1106        # remove <img> tags
1107        if ( $Param{NoImg} ) {
1108            $Replaced += ${$String} =~ s{
1109                $TagStart img.*? (.*?) \s* $TagEnd
1110            }
1111            {$ReplacementStr}sgxim;
1112        }
1113
1114        # remove <embed> tags
1115        if ( $Param{NoEmbed} ) {
1116            $Replaced += ${$String} =~ s{
1117                $TagStart embed.*? $TagEnd
1118            }
1119            {$ReplacementStr}sgxim;
1120        }
1121
1122        # check each html tag
1123        ${$String} =~ s{
1124            ($TagStart.+?$TagEnd)
1125        }
1126        {
1127            my $Tag = $1;
1128            if ($Param{NoJavaScript}) {
1129
1130                # remove on action attributes
1131                $Replaced += $Tag =~ s{
1132                    (?:\s|/) on[a-z]+\s*=("[^"]+"|'[^']+'|.+?)($TagEnd|\s)
1133                }
1134                {$2}sgxim;
1135
1136                # remove javascript in a href links or src links
1137                $Replaced += $Tag =~ s{
1138                    ((?:\s|;|/)(?:background|url|src|href)=)
1139                    ('|"|)                                  # delimiter, can be empty
1140                    (?:\s* $JavaScriptPrefixRegex .*?)      # javascript, followed by anything but the delimiter
1141                    \2                                      # delimiter again
1142                    (\s|$TagEnd)
1143                }
1144                {
1145                    "$1\"\"$3";
1146                }sgxime;
1147
1148                # remove link javascript tags
1149                $Replaced += $Tag =~ s{
1150                    ($TagStart link .+? $JavaScriptPrefixRegex (.+?|) $TagEnd)
1151                }
1152                {}sgxim;
1153
1154                # remove MS CSS expressions (JavaScript embedded in CSS)
1155                $Replaced += $Tag =~ s{
1156                    \sstyle=("|')[^\1]*? $ExpressionPrefixRegex [(].*?\1($TagEnd|\s)
1157                }
1158                {
1159                    $2;
1160                }egsxim;
1161            }
1162
1163            # Remove malicious CSS content
1164            $Tag =~ s{
1165                (\s)style=("|') (.*?) \2
1166            }
1167            {
1168                my ($Space, $Delimiter, $Content) = ($1, $2, $3);
1169
1170                if (
1171                    ($Param{NoIntSrcLoad} && $Content =~ m{url\(})
1172                    || ($Param{NoExtSrcLoad} && $Content =~ m/(http|ftp|https):\//i)) {
1173                    $Replaced = 1;
1174                    '';
1175                }
1176                else {
1177                    "${Space}style=${Delimiter}${Content}${Delimiter}";
1178                }
1179            }egsxim;
1180
1181            # remove load tags
1182            if ($Param{NoIntSrcLoad} || $Param{NoExtSrcLoad}) {
1183                $Tag =~ s{
1184                    ($TagStart (.+?) (?: \s | /) (?:src|poster)=(.+?) (\s.+?|) $TagEnd)
1185                }
1186                {
1187                    my $URL = $3;
1188                    if ($Param{NoIntSrcLoad} || ($Param{NoExtSrcLoad} && $URL =~ /(http|ftp|https):\//i)) {
1189                        $Replaced = 1;
1190                        '';
1191                    }
1192                    else {
1193                        $1;
1194                    }
1195                }segxim;
1196            }
1197
1198            # replace original tag with clean tag
1199            $Tag;
1200        }segxim;
1201
1202        $Safety{Replace} += $Replaced;
1203
1204    } while ($Replaced);    ## no critic
1205
1206    # check ref && return result like called
1207    if ( defined $StringScalar ) {
1208        $Safety{String} = ${$String};
1209    }
1210    else {
1211        $Safety{String} = $String;
1212    }
1213    return %Safety;
1214}
1215
1216=head2 EmbeddedImagesExtract()
1217
1218extracts embedded images with data-URLs from an HTML document.
1219
1220    $HTMLUtilsObject->EmbeddedImagesExtract(
1221        DocumentRef    => \$Body,
1222        AttachmentsRef => \@Attachments,
1223    );
1224
1225Returns nothing. If embedded images were found, these will be appended
1226to the attachments list, and the image data URL will be replaced with a
1227C<cid:> URL in the document.
1228
1229=cut
1230
1231sub EmbeddedImagesExtract {
1232    my ( $Self, %Param ) = @_;
1233
1234    if ( ref $Param{DocumentRef} ne 'SCALAR' || !defined ${ $Param{DocumentRef} } ) {
1235        $Kernel::OM->Get('Kernel::System::Log')->Log(
1236            Priority => 'error',
1237            Message  => "Need DocumentRef!"
1238        );
1239        return;
1240    }
1241    if ( ref $Param{AttachmentsRef} ne 'ARRAY' ) {
1242        $Kernel::OM->Get('Kernel::System::Log')->Log(
1243            Priority => 'error',
1244            Message  => "Need AttachmentsRef!"
1245        );
1246        return;
1247    }
1248
1249    my $FQDN = $Kernel::OM->Get('Kernel::Config')->Get('FQDN');
1250    ${ $Param{DocumentRef} } =~ s{(src=")(data:image/)(png|gif|jpg|jpeg|bmp)(;base64,)(.+?)(")}{
1251
1252        my $Base64String = $5;
1253
1254        my $FileName     = 'pasted-' . time() . '-' . int(rand(1000000)) . '.' . $3;
1255        my $ContentType  = "image/$3; name=\"$FileName\"";
1256        my $ContentID    = 'pasted.' . time() . '.' . int(rand(1000000)) . '@' . $FQDN;
1257
1258        my $AttachmentData = {
1259            Content     => decode_base64($Base64String),
1260            ContentType => $ContentType,
1261            ContentID   => $ContentID,
1262            Filename    => $FileName,
1263            Disposition => 'inline',
1264        };
1265        push @{$Param{AttachmentsRef}}, $AttachmentData;
1266
1267        # compose new image tag
1268        $1 . "cid:$ContentID" . $6
1269
1270    }egxi;
1271
1272    return 1;
1273}
1274
12751;
1276
1277=head1 TERMS AND CONDITIONS
1278
1279This software is part of the OTRS project (L<https://otrs.org/>).
1280
1281This software comes with ABSOLUTELY NO WARRANTY. For details, see
1282the enclosed file COPYING for license information (GPL). If you
1283did not receive this file, see L<https://www.gnu.org/licenses/gpl-3.0.txt>.
1284
1285=cut
1286