1# -*- indent-tabs-mode: nil; -*-
2# vim:ft=perl:et:sw=4
3# $Id$
4
5# Sympa - SYsteme de Multi-Postage Automatique
6#
7# Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel
8# Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
9# 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites
10# Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER
11# Copyright 2018, 2020 The Sympa Community. See the AUTHORS.md
12# file at the top-level directory of this distribution and at
13# <https://github.com/sympa-community/sympa.git>.
14#
15# This program is free software; you can redistribute it and/or modify
16# it under the terms of the GNU General Public License as published by
17# the Free Software Foundation; either version 2 of the License, or
18# (at your option) any later version.
19#
20# This program is distributed in the hope that it will be useful,
21# but WITHOUT ANY WARRANTY; without even the implied warranty of
22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23# GNU General Public License for more details.
24#
25# You should have received a copy of the GNU General Public License
26# along with this program.  If not, see <http://www.gnu.org/licenses/>.
27
28package Sympa::Tools::Text;
29
30use strict;
31use warnings;
32use feature qw(fc);
33use Encode qw();
34use English qw(-no_match_vars);
35use Encode::MIME::Header;    # 'MIME-Q' encoding.
36use HTML::Entities qw();
37use MIME::EncWords;
38use Text::LineFold;
39use Unicode::GCString;
40use URI::Escape qw();
41BEGIN { eval 'use Unicode::Normalize qw()'; }
42BEGIN { eval 'use Unicode::UTF8 qw()'; }
43
44use Sympa::Language;
45use Sympa::Regexps;
46
47# Old name: tools::addrencode().
48sub addrencode {
49    my $addr    = shift;
50    my $phrase  = (shift || '');
51    my $charset = (shift || 'utf8');
52    my $comment = (shift || '');
53
54    return undef unless $addr =~ /\S/;
55
56    if ($phrase =~ /[^\s\x21-\x7E]/) {
57        $phrase = MIME::EncWords::encode_mimewords(
58            Encode::decode('utf8', $phrase),
59            'Encoding'    => 'A',
60            'Charset'     => $charset,
61            'Replacement' => 'FALLBACK',
62            'Field'       => 'Resent-Sender', # almost longest
63            'Minimal'     => 'DISPNAME',      # needs MIME::EncWords >= 1.012.
64        );
65    } elsif ($phrase =~ /\S/) {
66        $phrase =~ s/([\\\"])/\\$1/g;
67        $phrase = '"' . $phrase . '"';
68    }
69    if ($comment =~ /[^\s\x21-\x27\x2A-\x5B\x5D-\x7E]/) {
70        $comment = MIME::EncWords::encode_mimewords(
71            Encode::decode('utf8', $comment),
72            'Encoding'    => 'A',
73            'Charset'     => $charset,
74            'Replacement' => 'FALLBACK',
75            'Minimal'     => 'DISPNAME',
76        );
77    } elsif ($comment =~ /\S/) {
78        $comment =~ s/([\\\"])/\\$1/g;
79    }
80
81    return
82          ($phrase =~ /\S/  ? "$phrase "    : '')
83        . ($comment =~ /\S/ ? "($comment) " : '')
84        . "<$addr>";
85}
86
87# Old names: tools::clean_email(), tools::get_canonical_email().
88sub canonic_email {
89    my $email = shift;
90
91    return undef unless defined $email;
92
93    # Remove leading and trailing white spaces.
94    $email =~ s/\A\s+//;
95    $email =~ s/\s+\z//;
96
97    # Lower-case.
98    $email =~ tr/A-Z/a-z/;
99
100    return (length $email) ? $email : undef;
101}
102
103# Old name: tools::clean_msg_id().
104sub canonic_message_id {
105    my $msg_id = shift;
106
107    return $msg_id unless defined $msg_id;
108
109    chomp $msg_id;
110
111    if ($msg_id =~ /\<(.+)\>/) {
112        $msg_id = $1;
113    }
114
115    return $msg_id;
116}
117
118sub canonic_text {
119    my $text = shift;
120
121    return undef unless defined $text;
122
123    # Normalize text. See also discussion on
124    # https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html
125    #
126    # N.B.: Corresponding modules are optional by now, and should be
127    # mandatory in the future.
128    my $utext;
129    if (Encode::is_utf8($text)) {
130        $utext = $text;
131    } elsif ($Unicode::UTF8::VERSION) {
132        no warnings 'utf8';
133        $utext = Unicode::UTF8::decode_utf8($text);
134    } else {
135        $utext = Encode::decode_utf8($text);
136    }
137    if ($Unicode::Normalize::VERSION) {
138        $utext = Unicode::Normalize::normalize('NFC', $utext);
139    }
140
141    # Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL,
142    # and EIMS:
143    $utext =~ s/\r\n|\r/\n/g;
144
145    if (Encode::is_utf8($text)) {
146        return $utext;
147    } else {
148        return Encode::encode_utf8($utext);
149    }
150}
151
152sub slurp {
153    my $path = shift;
154
155    my $ifh;
156    return undef unless open $ifh, '<', $path;
157    my $text = do { local $RS; <$ifh> };
158    close $ifh;
159
160    return canonic_text($text);
161}
162
163sub wrap_text {
164    my $text = shift;
165    my $init = shift;
166    my $subs = shift;
167    my $cols = shift;
168
169    $init //= '';
170    $subs //= '';
171    $cols //= 78;
172    return $text unless $cols;
173
174    my $email_re = Sympa::Regexps::email();
175    my $linefold = Text::LineFold->new(
176        Language   => Sympa::Language->instance->get_lang,
177        Prep       => 'NONBREAKURI',
178        prep       => [$email_re, sub { shift; @_ }],
179        ColumnsMax => $cols,
180        Format     => sub {
181            shift;
182            my $event = shift;
183            my $str   = shift;
184            if ($event =~ /^eo/)     { return "\n"; }
185            if ($event =~ /^so[tp]/) { return $init . $str; }
186            if ($event eq 'sol')     { return $subs . $str; }
187            undef;
188        },
189    );
190
191    my $t = Encode::is_utf8($text) ? $text : Encode::decode_utf8($text);
192
193    my $ret = '';
194    while (1000 < length $t) {
195        my $s = substr $t, 0, 1000;
196        $ret .= $linefold->break_partial($s);
197        $t = substr $t, 1000;
198    }
199    $ret .= $linefold->break_partial($t) if length $t;
200    $ret .= $linefold->break_partial(undef);
201
202    return Encode::is_utf8($text) ? $ret : Encode::encode_utf8($ret);
203}
204
205sub decode_filesystem_safe {
206    my $str = shift;
207    return '' unless defined $str and length $str;
208
209    $str = Encode::encode_utf8($str) if Encode::is_utf8($str);
210    # On case-insensitive filesystem "_XX" along with "_xx" should be decoded.
211    $str =~ s/_([0-9A-Fa-f]{2})/chr hex "0x$1"/eg;
212    return $str;
213}
214
215sub decode_html {
216    my $str = shift;
217
218    Encode::encode_utf8(
219        HTML::Entities::decode_entities(Encode::decode_utf8($str)));
220}
221
222sub encode_filesystem_safe {
223    my $str = shift;
224    return '' unless defined $str and length $str;
225
226    $str = Encode::encode_utf8($str) if Encode::is_utf8($str);
227    $str =~ s/([^-+.0-9\@A-Za-z])/sprintf '_%02x', ord $1/eg;
228    return $str;
229}
230
231sub encode_html {
232    my $str = shift;
233    my $additional_unsafe = shift || '';
234
235    HTML::Entities::encode_entities($str, '<>&"' . $additional_unsafe);
236}
237
238sub encode_uri {
239    my $str     = shift;
240    my %options = @_;
241
242    # Note: URI-1.35 (URI::Escape 3.28) or later is required.
243    return Encode::encode_utf8(
244        URI::Escape::uri_escape_utf8(
245            Encode::decode_utf8($str),
246            '^-A-Za-z0-9._~' . (exists $options{omit} ? $options{omit} : '')
247        )
248    );
249}
250
251# Old name: tools::escape_chars().
252sub escape_chars {
253    my $s          = shift;
254    my $except     = shift;                            ## Exceptions
255    my $ord_except = ord $except if defined $except;
256
257    ## Escape chars
258    ##  !"#$%&'()+,:;<=>?[] AND accented chars
259    ## escape % first
260    foreach my $i (
261        0x25,
262        0x20 .. 0x24,
263        0x26 .. 0x2c,
264        0x3a .. 0x3f,
265        0x5b, 0x5d,
266        0x80 .. 0x9f,
267        0xa0 .. 0xff
268    ) {
269        next if defined $ord_except and $i == $ord_except;
270        my $hex_i = sprintf "%lx", $i;
271        $s =~ s/\x$hex_i/%$hex_i/g;
272    }
273    ## Special traetment for '/'
274    $s =~ s/\//%a5/g unless defined $except and $except eq '/';
275
276    return $s;
277}
278
279# Old name: tt2::escape_url().
280# DEPRECATED.  Use Sympa::Tools::Text::escape_uri() or
281# Sympa::Tools::Text::mailtourl().
282#sub escape_url;
283
284sub foldcase {
285    my $str = shift;
286
287    return '' unless defined $str and length $str;
288    return Encode::encode_utf8(fc(Encode::decode_utf8($str)));
289}
290
291my %legacy_charsets = (
292    'ar'    => [qw(iso-8859-6)],
293    'bs'    => [qw(iso-8859-2)],
294    'cs'    => [qw(iso-8859-2)],
295    'eo'    => [qw(iso-8859-3)],
296    'et'    => [qw(iso-8859-4)],
297    'he'    => [qw(iso-8859-8)],
298    'hr'    => [qw(iso-8859-2)],
299    'hu'    => [qw(iso-8859-2)],
300    'ja'    => [qw(euc-jp cp932 MacJapanese)],
301    'kl'    => [qw(iso-8859-4)],
302    'ko'    => [qw(cp949)],
303    'lt'    => [qw(iso-8859-4)],
304    'lv'    => [qw(iso-8859-4)],
305    'mt'    => [qw(iso-8859-3)],
306    'pl'    => [qw(iso-8859-2)],
307    'ro'    => [qw(iso-8859-2)],
308    'ru'    => [qw(koi8-r cp1251)],               # cp866? MacCyrillic?
309    'sk'    => [qw(iso-8859-2)],
310    'sl'    => [qw(iso-8859-2)],
311    'th'    => [qw(iso-8859-11 cp874 MacThai)],
312    'tr'    => [qw(iso-8859-9)],
313    'uk'    => [qw(koi8-u)],                      # MacUkrainian?
314    'zh-CN' => [qw(euc-cn)],
315    'zh-TW' => [qw(big5-eten)],
316);
317
318sub guessed_to_utf8 {
319    my $text  = shift;
320    my @langs = @_;
321
322    return Encode::encode_utf8($text) if Encode::is_utf8($text);
323    return $text
324        unless defined $text
325        and length $text
326        and $text =~ /[^\x00-\x7F]/;
327
328    my $utf8;
329    if ($Unicode::UTF8::VERSION) {
330        $utf8 =
331            eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) };
332    }
333    unless (defined $utf8) {
334        foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) {
335            $utf8 =
336                eval { Encode::decode($charset, $text, Encode::FB_CROAK()) };
337            last if defined $utf8;
338        }
339    }
340    unless (defined $utf8) {
341        $utf8 = Encode::decode('iso-8859-1', $text);
342    }
343
344    # Apply NFC: e.g. for modified-NFD by Mac OS X.
345    $utf8 = Unicode::Normalize::normalize('NFC', $utf8)
346        if $Unicode::Normalize::VERSION;
347
348    return Encode::encode_utf8($utf8);
349}
350
351sub mailtourl {
352    my $text    = shift;
353    my %options = @_;
354
355    my $dtext =
356          (not defined $text)   ? ''
357        : $options{decode_html} ? Sympa::Tools::Text::decode_html($text)
358        :                         $text;
359    $dtext =~ s/\A\s+//;
360    $dtext =~ s/\s+\z//;
361    $dtext =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
362    $dtext =~ s/\r\n|\r|\n/ /g;
363
364    # The ``@'' in email address should not be encoded because some MUAs
365    # aren't able to decode ``%40'' in e-mail address of mailto: URL.
366    # Contrary, ``@'' in query component should be encoded because some
367    # MUAs take it for a delimiter to separate URL from the rest.
368    my ($format, $utext, $qsep);
369    if ($dtext =~ /[()<>\[\]:;,\"\s]/) {
370        # Use "to" header if source text includes any of RFC 5322
371        # "specials", minus ``@'' and ``\'', plus whitespaces.
372        $format = 'mailto:?to=%s%s';
373        $utext  = Sympa::Tools::Text::encode_uri($dtext);
374        $qsep   = '&';
375    } else {
376        $format = 'mailto:%s%s';
377        $utext  = Sympa::Tools::Text::encode_uri($dtext, omit => '@');
378        $qsep   = '?';
379    }
380    my $qstring = _url_query_string(
381        $options{query},
382        decode_html => $options{decode_html},
383        leadchar    => $qsep,
384        sepchar     => '&',
385        trim_values => 1,
386    );
387
388    return sprintf $format, $utext, $qstring;
389}
390
391sub _url_query_string {
392    my $query   = shift;
393    my %options = @_;
394
395    unless (ref $query eq 'HASH' and %$query) {
396        return '';
397    } else {
398        my $decode_html = $options{decode_html};
399        my $trim_values = $options{trim_values};
400        return ($options{leadchar} || '?') . join(
401            ($options{sepchar} || ';'),
402            map {
403                my ($dkey, $dval) = map {
404                          (not defined $_) ? ''
405                        : $decode_html ? Sympa::Tools::Text::decode_html($_)
406                        :                $_;
407                } ($_, $query->{$_});
408                if ($trim_values and lc $dkey ne 'body') {
409                    $dval =~ s/\A\s+//;
410                    $dval =~ s/\s+\z//;
411                    $dval =~ s/(?:\r\n|\r|\n)(?=[ \t])//g;
412                    $dval =~ s/\r\n|\r|\n/ /g;
413                }
414
415                sprintf '%s=%s',
416                    Sympa::Tools::Text::encode_uri($dkey),
417                    Sympa::Tools::Text::encode_uri($dval);
418            } sort keys %$query
419        );
420    }
421}
422
423sub pad {
424    my $str   = shift;
425    my $width = shift;
426
427    return $str unless $width and defined $str;
428
429    my $ustr = Encode::is_utf8($str) ? $str : Encode::decode_utf8($str);
430    my $cols = Unicode::GCString->new($ustr)->columns;
431
432    unless ($cols < abs $width) {
433        return $str;
434    } elsif ($width < 0) {
435        return $str . (' ' x (-$width - $cols));
436    } else {
437        return (' ' x ($width - $cols)) . $str;
438    }
439}
440
441# Old name: tools::qdecode_filename().
442sub qdecode_filename {
443    my $filename = shift;
444
445    ## We don't use MIME::Words here because it does not encode properly
446    ## Unicode
447    ## Check if string is already Q-encoded first
448    #if ($filename =~ /\=\?UTF-8\?/) {
449    $filename = Encode::encode_utf8(Encode::decode('MIME-Q', $filename));
450    #}
451
452    return $filename;
453}
454
455# Old name: tools::qencode_filename().
456sub qencode_filename {
457    my $filename = shift;
458
459    ## We don't use MIME::Words here because it does not encode properly
460    ## Unicode
461    ## Check if string is already Q-encoded first
462    ## Also check if the string contains 8bit chars
463    unless ($filename =~ /\=\?UTF-8\?/
464        || $filename =~ /^[\x00-\x7f]*$/) {
465
466        ## Don't encode elements such as .desc. or .url or .moderate
467        ## or .extension
468        my $part = $filename;
469        my ($leading, $trailing);
470        $leading  = $1 if ($part =~ s/^(\.desc\.)//);    ## leading .desc
471        $trailing = $1 if ($part =~ s/((\.\w+)+)$//);    ## trailing .xx
472
473        my $encoded_part = MIME::EncWords::encode_mimewords(
474            $part,
475            Charset    => 'utf8',
476            Encoding   => 'q',
477            MaxLineLen => 1000,
478            Minimal    => 'NO'
479        );
480
481        $filename = $leading . $encoded_part . $trailing;
482    }
483
484    return $filename;
485}
486
487sub clip {
488    my $string = shift;
489    return undef unless @_;
490    my $length = shift;
491
492    my ($gcstr, $blen);
493    if (ref $string eq 'Unicode::GCString') {
494        $gcstr = $string;
495        $blen  = length Encode::encode_utf8($string->as_string);
496    } elsif (Encode::is_utf8($string)) {
497        $gcstr = Unicode::GCString->new($string);
498        $blen  = length Encode::encode_utf8($string);
499    } else {
500        $gcstr = Unicode::GCString->new(Encode::decode_utf8($string));
501        $blen  = length $string;
502    }
503
504    $length += $blen if $length < 0;
505    return '' if $length < 0;             # out of range
506    return $string if $blen <= $length;
507
508    my $result = $gcstr->substr(0, _gc_length($gcstr, $length));
509
510    if (ref $string eq 'Unicode::GCString') {
511        return $result;
512    } elsif (Encode::is_utf8($string)) {
513        return $result->as_string;
514    } else {
515        return Encode::encode_utf8($result->as_string);
516    }
517}
518
519sub _gc_length {
520    my $gcstr  = shift;
521    my $length = shift;
522
523    return 0 unless $gcstr->length;
524    return 0 unless $length;
525
526    my ($shorter, $longer) = (0, $gcstr->length);
527    while ($shorter < $longer) {
528        my $cur = ($shorter + $longer + 1) >> 1;
529        my $elen =
530            length Encode::encode_utf8($gcstr->substr(0, $cur)->as_string);
531        if ($elen <= $length) {
532            $shorter = $cur;
533        } else {
534            $longer = $cur - 1;
535        }
536    }
537
538    return $shorter;
539}
540
541# Old name: tools::unescape_chars().
542sub unescape_chars {
543    my $s = shift;
544
545    $s =~ s/%a5/\//g;    ## Special traetment for '/'
546    foreach my $i (0x20 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f,
547        0xa0 .. 0xff) {
548        my $hex_i = sprintf "%lx", $i;
549        my $hex_s = sprintf "%c",  $i;
550        $s =~ s/%$hex_i/$hex_s/g;
551    }
552
553    return $s;
554}
555
556# Old name: tools::valid_email().
557sub valid_email {
558    my $email = shift;
559
560    my $email_re = Sympa::Regexps::email();
561    return undef unless $email =~ /^${email_re}$/;
562
563    # Forbidden characters.
564    return undef if $email =~ /[\|\$\*\?\!]/;
565
566    return 1;
567}
568
569sub weburl {
570    my $base    = shift;
571    my $paths   = shift;
572    my %options = @_;
573
574    my @paths = map {
575        Sympa::Tools::Text::encode_uri(
576              (not defined $_)      ? ''
577            : $options{decode_html} ? Sympa::Tools::Text::decode_html($_)
578            :                         $_
579        );
580    } @{$paths || []};
581
582    my $qstring = _url_query_string(
583        $options{query},
584        decode_html => $options{decode_html},
585        sepchar     => '&',
586    );
587
588    my $fstring;
589    my $fragment = $options{fragment};
590    if (defined $fragment) {
591        $fstring = '#'
592            . Sympa::Tools::Text::encode_uri(
593            $options{decode_html}
594            ? Sympa::Tools::Text::decode_html($fragment)
595            : $fragment
596            );
597    } else {
598        $fstring = '';
599    }
600
601    return sprintf '%s%s%s', join('/', grep { defined $_ } ($base, @paths)),
602        $qstring, $fstring;
603}
604
6051;
606__END__
607
608=encoding utf-8
609
610=head1 NAME
611
612Sympa::Tools::Text - Text-related functions
613
614=head1 DESCRIPTION
615
616This package provides some text-related functions.
617
618=head2 Functions
619
620=over
621
622=item addrencode ( $addr, [ $phrase, [ $charset, [ $comment ] ] ] )
623
624Returns formatted (and encoded) name-addr as RFC5322 3.4.
625
626=item canonic_email ( $email )
627
628I<Function>.
629Returns canonical form of e-mail address.
630
631Leading and trailing white spaces are removed.
632Latin letters without accents are lower-cased.
633
634For malformed inputs returns C<undef>.
635
636=item canonic_message_id ( $message_id )
637
638Returns canonical form of message ID without trailing or leading whitespaces
639or C<E<lt>>, C<E<gt>>.
640
641=item canonic_text ( $text )
642
643Canonicalizes text.
644C<$text> should be a binary string encoded by UTF-8 character set or
645a Unicode string.
646Forbidden sequences in binary string will be replaced by
647U+FFFD REPLACEMENT CHARACTERs, and Normalization Form C (NFC) will be applied.
648
649=item clip ( $string, $length )
650
651I<Function>.
652Clips $string according to $length by bytes,
653considering boundary of grapheme clusters.
654UTF-8 is assumed for $string as bytestring.
655
656=item decode_filesystem_safe ( $str )
657
658I<Function>.
659Decodes a string encoded by encode_filesystem_safe().
660
661Parameter:
662
663=over
664
665=item $str
666
667String to be decoded.
668
669=back
670
671Returns:
672
673Decoded string, stripped C<utf8> flag if any.
674
675=item decode_html ( $str )
676
677I<Function>.
678Decodes HTML entities in a string encoded by UTF-8 or a Unicode string.
679
680Parameter:
681
682=over
683
684=item $str
685
686String to be decoded.
687
688=back
689
690Returns:
691
692Decoded string, stripped C<utf8> flag if any.
693
694=item encode_filesystem_safe ( $str )
695
696I<Function>.
697Encodes a string $str to be suitable for filesystem.
698
699Parameter:
700
701=over
702
703=item $str
704
705String to be encoded.
706
707=back
708
709Returns:
710
711Encoded string, stripped C<utf8> flag if any.
712All bytes except C<'-'>, C<'+'>, C<'.'>, C<'@'>
713and alphanumeric characters are encoded to sequences C<'_'> followed by
714two hexdigits.
715
716Note that C<'/'> will also be encoded.
717
718=item encode_html ( $str, [ $additional_unsafe ] )
719
720I<Function>.
721Encodes characters in a string $str to HTML entities.
722By default
723C<'E<lt>'>, C<'E<gt>'>, C<'E<amp>'> and C<'E<quot>'> are encoded.
724
725Parameter:
726
727=over
728
729=item $str
730
731String to be encoded.
732
733=item $additional_unsafe
734
735Character or range of characters additionally encoded as entity references.
736
737This optional parameter was introduced on Sympa 6.2.37b.3.
738
739=back
740
741Returns:
742
743Encoded string, I<not> stripping utf8 flag if any.
744
745=item encode_uri ( $str, [ omit => $chars ] )
746
747I<Function>.
748Encodes potentially unsafe characters in the string using "percent" encoding
749suitable for URIs.
750
751Parameters:
752
753=over
754
755=item $str
756
757String to be encoded.
758
759=item omit =E<gt> $chars
760
761By default, all characters except those defined as "unreserved" in RFC 3986
762are encoded, that is, C<[^-A-Za-z0-9._~]>.
763If this parameter is given, it will prevent encoding additional characters.
764
765=back
766
767Returns:
768
769Encoded string, stripped C<utf8> flag if any.
770
771=item escape_chars ( $str )
772
773Escape weird characters.
774
775ToDo: This should be obsoleted in the future release: Would be better to use
776L</encode_filesystem_safe>.
777
778=item escape_url ( $str )
779
780DEPRECATED.
781Would be better to use L</"encode_uri"> or L</"mailtourl">.
782
783=item foldcase ( $str )
784
785I<Function>.
786Returns "fold-case" string suitable for case-insensitive match.
787For example, a code below looks for a needle in haystack not regarding case,
788even if they are non-ASCII UTF-8 strings.
789
790  $haystack = Sympa::Tools::Text::foldcase($HayStack);
791  $needle   = Sympa::Tools::Text::foldcase($NeedLe);
792  if (index $haystack, $needle >= 0) {
793      ...
794  }
795
796Parameter:
797
798=over
799
800=item $str
801
802A string.
803
804=back
805
806=item guessed_to_utf8( $text, [ lang, ... ] )
807
808I<Function>.
809Guesses text charset considering language context
810and returns the text reencoded by UTF-8.
811
812Parameters:
813
814=over
815
816=item $text
817
818Text to be reencoded.
819
820=item lang, ...
821
822Language tag(s) which may be given by L<Sympa::Language/"implicated_langs">.
823
824=back
825
826Returns:
827
828Reencoded text.
829If any charsets could not be guessed, C<iso-8859-1> will be used
830as the last resort, just because it covers full range of 8-bit.
831
832=item mailtourl ( $email, [ decode_html =E<gt> 1 ],
833[ query =E<gt> {key =E<gt> val, ...} ] )
834
835I<Function>.
836Constructs a C<mailto:> URL for given e-mail.
837
838Parameters:
839
840=over
841
842=item $email
843
844E-mail address.
845
846=item decode_html =E<gt> 1
847
848If set, arguments are assumed to include HTML entities.
849
850=item query =E<gt> {key =E<gt> val, ...}
851
852Optional query.
853
854=back
855
856Returns:
857
858Constructed URL.
859
860=item pad ( $str, $width )
861
862Pads space a string so that result will not be narrower than given width.
863
864Parameters:
865
866=over
867
868=item $str
869
870A string.
871
872=item $width
873
874If $width is false value or width of $str is not less than $width,
875does nothing.
876If $width is less than C<0>, pads right.
877Otherwise, pads left.
878
879=back
880
881Returns:
882
883Padded string.
884
885=item qdecode_filename ( $filename )
886
887Q-Decodes web file name.
888
889ToDo:
890This should be obsoleted in the future release: Would be better to use
891L</decode_filesystem_safe>.
892
893=item qencode_filename ( $filename )
894
895Q-Encodes web file name.
896
897ToDo:
898This should be obsoleted in the future release: Would be better to use
899L</encode_filesystem_safe>.
900
901=item slurp ( $file )
902
903Get entire content of the file.
904Normalization by canonic_text() is applied.
905C<$file> is the path to text file.
906
907=item unescape_chars ( $str )
908
909Unescape weird characters.
910
911ToDo: This should be obsoleted in the future release: Would be better to use
912L</decode_filesystem_safe>.
913
914=item valid_email ( $string )
915
916Basic check of an email address.
917
918=item weburl ( $base, \@paths, [ decode_html =E<gt> 1 ],
919[ fragment =E<gt> $fragment ], [ query =E<gt> \%query ] )
920
921Constructs a C<http:> or C<https:> URL under given base URI.
922
923Parameters:
924
925=over
926
927=item $base
928
929Base URI.
930
931=item \@paths
932
933Additional path components.
934
935=item decode_html =E<gt> 1
936
937If set, arguments are assumed to include HTML entities.
938Exception is $base:
939It is assumed not to include entities.
940
941=item fragment =E<gt> $fragment
942
943Optional fragment.
944
945=item query =E<gt> \%query
946
947Optional query.
948
949=back
950
951Returns:
952
953A URI.
954
955=item wrap_text ( $text, [ $init_tab, [ $subsequent_tab, [ $cols ] ] ] )
956
957I<Function>.
958Returns line-wrapped text.
959
960Parameters:
961
962=over
963
964=item $text
965
966The text to be folded.
967
968=item $init_tab
969
970Indentation prepended to the first line of paragraph.
971Default is C<''>, no indentation.
972
973=item $subsequent_tab
974
975Indentation prepended to each subsequent line of folded paragraph.
976Default is C<''>, no indentation.
977
978=item $cols
979
980Max number of columns of folded text.
981Default is C<78>.
982
983=back
984
985=back
986
987=head1 HISTORY
988
989L<Sympa::Tools::Text> appeared on Sympa 6.2a.41.
990
991decode_filesystem_safe() and encode_filesystem_safe() were added
992on Sympa 6.2.10.
993
994decode_html(), encode_html(), encode_uri() and mailtourl()
995were added on Sympa 6.2.14, and escape_url() was deprecated.
996
997guessed_to_utf8() and pad() were added on Sympa 6.2.17.
998
999canonic_text() and slurp() were added on Sympa 6.2.53b.
1000
1001clip() was added on Sympa 6.2.61b.
1002
1003=cut
1004