1# -*- indent-tabs-mode: nil; -*- 2# vim:ft=perl:et:sw=4 3# $Id$ 4 5# Sympa - SYsteme de Multi-Postage Automatique 6# 7# Copyright (c) 1997, 1998, 1999 Institut Pasteur & Christophe Wolfhugel 8# Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 9# 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites 10# Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER 11# Copyright 2018, 2020 The Sympa Community. See the AUTHORS.md 12# file at the top-level directory of this distribution and at 13# <https://github.com/sympa-community/sympa.git>. 14# 15# This program is free software; you can redistribute it and/or modify 16# it under the terms of the GNU General Public License as published by 17# the Free Software Foundation; either version 2 of the License, or 18# (at your option) any later version. 19# 20# This program is distributed in the hope that it will be useful, 21# but WITHOUT ANY WARRANTY; without even the implied warranty of 22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23# GNU General Public License for more details. 24# 25# You should have received a copy of the GNU General Public License 26# along with this program. If not, see <http://www.gnu.org/licenses/>. 27 28package Sympa::Tools::Text; 29 30use strict; 31use warnings; 32use feature qw(fc); 33use Encode qw(); 34use English qw(-no_match_vars); 35use Encode::MIME::Header; # 'MIME-Q' encoding. 36use HTML::Entities qw(); 37use MIME::EncWords; 38use Text::LineFold; 39use Unicode::GCString; 40use URI::Escape qw(); 41BEGIN { eval 'use Unicode::Normalize qw()'; } 42BEGIN { eval 'use Unicode::UTF8 qw()'; } 43 44use Sympa::Language; 45use Sympa::Regexps; 46 47# Old name: tools::addrencode(). 48sub addrencode { 49 my $addr = shift; 50 my $phrase = (shift || ''); 51 my $charset = (shift || 'utf8'); 52 my $comment = (shift || ''); 53 54 return undef unless $addr =~ /\S/; 55 56 if ($phrase =~ /[^\s\x21-\x7E]/) { 57 $phrase = MIME::EncWords::encode_mimewords( 58 Encode::decode('utf8', $phrase), 59 'Encoding' => 'A', 60 'Charset' => $charset, 61 'Replacement' => 'FALLBACK', 62 'Field' => 'Resent-Sender', # almost longest 63 'Minimal' => 'DISPNAME', # needs MIME::EncWords >= 1.012. 64 ); 65 } elsif ($phrase =~ /\S/) { 66 $phrase =~ s/([\\\"])/\\$1/g; 67 $phrase = '"' . $phrase . '"'; 68 } 69 if ($comment =~ /[^\s\x21-\x27\x2A-\x5B\x5D-\x7E]/) { 70 $comment = MIME::EncWords::encode_mimewords( 71 Encode::decode('utf8', $comment), 72 'Encoding' => 'A', 73 'Charset' => $charset, 74 'Replacement' => 'FALLBACK', 75 'Minimal' => 'DISPNAME', 76 ); 77 } elsif ($comment =~ /\S/) { 78 $comment =~ s/([\\\"])/\\$1/g; 79 } 80 81 return 82 ($phrase =~ /\S/ ? "$phrase " : '') 83 . ($comment =~ /\S/ ? "($comment) " : '') 84 . "<$addr>"; 85} 86 87# Old names: tools::clean_email(), tools::get_canonical_email(). 88sub canonic_email { 89 my $email = shift; 90 91 return undef unless defined $email; 92 93 # Remove leading and trailing white spaces. 94 $email =~ s/\A\s+//; 95 $email =~ s/\s+\z//; 96 97 # Lower-case. 98 $email =~ tr/A-Z/a-z/; 99 100 return (length $email) ? $email : undef; 101} 102 103# Old name: tools::clean_msg_id(). 104sub canonic_message_id { 105 my $msg_id = shift; 106 107 return $msg_id unless defined $msg_id; 108 109 chomp $msg_id; 110 111 if ($msg_id =~ /\<(.+)\>/) { 112 $msg_id = $1; 113 } 114 115 return $msg_id; 116} 117 118sub canonic_text { 119 my $text = shift; 120 121 return undef unless defined $text; 122 123 # Normalize text. See also discussion on 124 # https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html 125 # 126 # N.B.: Corresponding modules are optional by now, and should be 127 # mandatory in the future. 128 my $utext; 129 if (Encode::is_utf8($text)) { 130 $utext = $text; 131 } elsif ($Unicode::UTF8::VERSION) { 132 no warnings 'utf8'; 133 $utext = Unicode::UTF8::decode_utf8($text); 134 } else { 135 $utext = Encode::decode_utf8($text); 136 } 137 if ($Unicode::Normalize::VERSION) { 138 $utext = Unicode::Normalize::normalize('NFC', $utext); 139 } 140 141 # Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL, 142 # and EIMS: 143 $utext =~ s/\r\n|\r/\n/g; 144 145 if (Encode::is_utf8($text)) { 146 return $utext; 147 } else { 148 return Encode::encode_utf8($utext); 149 } 150} 151 152sub slurp { 153 my $path = shift; 154 155 my $ifh; 156 return undef unless open $ifh, '<', $path; 157 my $text = do { local $RS; <$ifh> }; 158 close $ifh; 159 160 return canonic_text($text); 161} 162 163sub wrap_text { 164 my $text = shift; 165 my $init = shift; 166 my $subs = shift; 167 my $cols = shift; 168 169 $init //= ''; 170 $subs //= ''; 171 $cols //= 78; 172 return $text unless $cols; 173 174 my $email_re = Sympa::Regexps::email(); 175 my $linefold = Text::LineFold->new( 176 Language => Sympa::Language->instance->get_lang, 177 Prep => 'NONBREAKURI', 178 prep => [$email_re, sub { shift; @_ }], 179 ColumnsMax => $cols, 180 Format => sub { 181 shift; 182 my $event = shift; 183 my $str = shift; 184 if ($event =~ /^eo/) { return "\n"; } 185 if ($event =~ /^so[tp]/) { return $init . $str; } 186 if ($event eq 'sol') { return $subs . $str; } 187 undef; 188 }, 189 ); 190 191 my $t = Encode::is_utf8($text) ? $text : Encode::decode_utf8($text); 192 193 my $ret = ''; 194 while (1000 < length $t) { 195 my $s = substr $t, 0, 1000; 196 $ret .= $linefold->break_partial($s); 197 $t = substr $t, 1000; 198 } 199 $ret .= $linefold->break_partial($t) if length $t; 200 $ret .= $linefold->break_partial(undef); 201 202 return Encode::is_utf8($text) ? $ret : Encode::encode_utf8($ret); 203} 204 205sub decode_filesystem_safe { 206 my $str = shift; 207 return '' unless defined $str and length $str; 208 209 $str = Encode::encode_utf8($str) if Encode::is_utf8($str); 210 # On case-insensitive filesystem "_XX" along with "_xx" should be decoded. 211 $str =~ s/_([0-9A-Fa-f]{2})/chr hex "0x$1"/eg; 212 return $str; 213} 214 215sub decode_html { 216 my $str = shift; 217 218 Encode::encode_utf8( 219 HTML::Entities::decode_entities(Encode::decode_utf8($str))); 220} 221 222sub encode_filesystem_safe { 223 my $str = shift; 224 return '' unless defined $str and length $str; 225 226 $str = Encode::encode_utf8($str) if Encode::is_utf8($str); 227 $str =~ s/([^-+.0-9\@A-Za-z])/sprintf '_%02x', ord $1/eg; 228 return $str; 229} 230 231sub encode_html { 232 my $str = shift; 233 my $additional_unsafe = shift || ''; 234 235 HTML::Entities::encode_entities($str, '<>&"' . $additional_unsafe); 236} 237 238sub encode_uri { 239 my $str = shift; 240 my %options = @_; 241 242 # Note: URI-1.35 (URI::Escape 3.28) or later is required. 243 return Encode::encode_utf8( 244 URI::Escape::uri_escape_utf8( 245 Encode::decode_utf8($str), 246 '^-A-Za-z0-9._~' . (exists $options{omit} ? $options{omit} : '') 247 ) 248 ); 249} 250 251# Old name: tools::escape_chars(). 252sub escape_chars { 253 my $s = shift; 254 my $except = shift; ## Exceptions 255 my $ord_except = ord $except if defined $except; 256 257 ## Escape chars 258 ## !"#$%&'()+,:;<=>?[] AND accented chars 259 ## escape % first 260 foreach my $i ( 261 0x25, 262 0x20 .. 0x24, 263 0x26 .. 0x2c, 264 0x3a .. 0x3f, 265 0x5b, 0x5d, 266 0x80 .. 0x9f, 267 0xa0 .. 0xff 268 ) { 269 next if defined $ord_except and $i == $ord_except; 270 my $hex_i = sprintf "%lx", $i; 271 $s =~ s/\x$hex_i/%$hex_i/g; 272 } 273 ## Special traetment for '/' 274 $s =~ s/\//%a5/g unless defined $except and $except eq '/'; 275 276 return $s; 277} 278 279# Old name: tt2::escape_url(). 280# DEPRECATED. Use Sympa::Tools::Text::escape_uri() or 281# Sympa::Tools::Text::mailtourl(). 282#sub escape_url; 283 284sub foldcase { 285 my $str = shift; 286 287 return '' unless defined $str and length $str; 288 return Encode::encode_utf8(fc(Encode::decode_utf8($str))); 289} 290 291my %legacy_charsets = ( 292 'ar' => [qw(iso-8859-6)], 293 'bs' => [qw(iso-8859-2)], 294 'cs' => [qw(iso-8859-2)], 295 'eo' => [qw(iso-8859-3)], 296 'et' => [qw(iso-8859-4)], 297 'he' => [qw(iso-8859-8)], 298 'hr' => [qw(iso-8859-2)], 299 'hu' => [qw(iso-8859-2)], 300 'ja' => [qw(euc-jp cp932 MacJapanese)], 301 'kl' => [qw(iso-8859-4)], 302 'ko' => [qw(cp949)], 303 'lt' => [qw(iso-8859-4)], 304 'lv' => [qw(iso-8859-4)], 305 'mt' => [qw(iso-8859-3)], 306 'pl' => [qw(iso-8859-2)], 307 'ro' => [qw(iso-8859-2)], 308 'ru' => [qw(koi8-r cp1251)], # cp866? MacCyrillic? 309 'sk' => [qw(iso-8859-2)], 310 'sl' => [qw(iso-8859-2)], 311 'th' => [qw(iso-8859-11 cp874 MacThai)], 312 'tr' => [qw(iso-8859-9)], 313 'uk' => [qw(koi8-u)], # MacUkrainian? 314 'zh-CN' => [qw(euc-cn)], 315 'zh-TW' => [qw(big5-eten)], 316); 317 318sub guessed_to_utf8 { 319 my $text = shift; 320 my @langs = @_; 321 322 return Encode::encode_utf8($text) if Encode::is_utf8($text); 323 return $text 324 unless defined $text 325 and length $text 326 and $text =~ /[^\x00-\x7F]/; 327 328 my $utf8; 329 if ($Unicode::UTF8::VERSION) { 330 $utf8 = 331 eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) }; 332 } 333 unless (defined $utf8) { 334 foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) { 335 $utf8 = 336 eval { Encode::decode($charset, $text, Encode::FB_CROAK()) }; 337 last if defined $utf8; 338 } 339 } 340 unless (defined $utf8) { 341 $utf8 = Encode::decode('iso-8859-1', $text); 342 } 343 344 # Apply NFC: e.g. for modified-NFD by Mac OS X. 345 $utf8 = Unicode::Normalize::normalize('NFC', $utf8) 346 if $Unicode::Normalize::VERSION; 347 348 return Encode::encode_utf8($utf8); 349} 350 351sub mailtourl { 352 my $text = shift; 353 my %options = @_; 354 355 my $dtext = 356 (not defined $text) ? '' 357 : $options{decode_html} ? Sympa::Tools::Text::decode_html($text) 358 : $text; 359 $dtext =~ s/\A\s+//; 360 $dtext =~ s/\s+\z//; 361 $dtext =~ s/(?:\r\n|\r|\n)(?=[ \t])//g; 362 $dtext =~ s/\r\n|\r|\n/ /g; 363 364 # The ``@'' in email address should not be encoded because some MUAs 365 # aren't able to decode ``%40'' in e-mail address of mailto: URL. 366 # Contrary, ``@'' in query component should be encoded because some 367 # MUAs take it for a delimiter to separate URL from the rest. 368 my ($format, $utext, $qsep); 369 if ($dtext =~ /[()<>\[\]:;,\"\s]/) { 370 # Use "to" header if source text includes any of RFC 5322 371 # "specials", minus ``@'' and ``\'', plus whitespaces. 372 $format = 'mailto:?to=%s%s'; 373 $utext = Sympa::Tools::Text::encode_uri($dtext); 374 $qsep = '&'; 375 } else { 376 $format = 'mailto:%s%s'; 377 $utext = Sympa::Tools::Text::encode_uri($dtext, omit => '@'); 378 $qsep = '?'; 379 } 380 my $qstring = _url_query_string( 381 $options{query}, 382 decode_html => $options{decode_html}, 383 leadchar => $qsep, 384 sepchar => '&', 385 trim_values => 1, 386 ); 387 388 return sprintf $format, $utext, $qstring; 389} 390 391sub _url_query_string { 392 my $query = shift; 393 my %options = @_; 394 395 unless (ref $query eq 'HASH' and %$query) { 396 return ''; 397 } else { 398 my $decode_html = $options{decode_html}; 399 my $trim_values = $options{trim_values}; 400 return ($options{leadchar} || '?') . join( 401 ($options{sepchar} || ';'), 402 map { 403 my ($dkey, $dval) = map { 404 (not defined $_) ? '' 405 : $decode_html ? Sympa::Tools::Text::decode_html($_) 406 : $_; 407 } ($_, $query->{$_}); 408 if ($trim_values and lc $dkey ne 'body') { 409 $dval =~ s/\A\s+//; 410 $dval =~ s/\s+\z//; 411 $dval =~ s/(?:\r\n|\r|\n)(?=[ \t])//g; 412 $dval =~ s/\r\n|\r|\n/ /g; 413 } 414 415 sprintf '%s=%s', 416 Sympa::Tools::Text::encode_uri($dkey), 417 Sympa::Tools::Text::encode_uri($dval); 418 } sort keys %$query 419 ); 420 } 421} 422 423sub pad { 424 my $str = shift; 425 my $width = shift; 426 427 return $str unless $width and defined $str; 428 429 my $ustr = Encode::is_utf8($str) ? $str : Encode::decode_utf8($str); 430 my $cols = Unicode::GCString->new($ustr)->columns; 431 432 unless ($cols < abs $width) { 433 return $str; 434 } elsif ($width < 0) { 435 return $str . (' ' x (-$width - $cols)); 436 } else { 437 return (' ' x ($width - $cols)) . $str; 438 } 439} 440 441# Old name: tools::qdecode_filename(). 442sub qdecode_filename { 443 my $filename = shift; 444 445 ## We don't use MIME::Words here because it does not encode properly 446 ## Unicode 447 ## Check if string is already Q-encoded first 448 #if ($filename =~ /\=\?UTF-8\?/) { 449 $filename = Encode::encode_utf8(Encode::decode('MIME-Q', $filename)); 450 #} 451 452 return $filename; 453} 454 455# Old name: tools::qencode_filename(). 456sub qencode_filename { 457 my $filename = shift; 458 459 ## We don't use MIME::Words here because it does not encode properly 460 ## Unicode 461 ## Check if string is already Q-encoded first 462 ## Also check if the string contains 8bit chars 463 unless ($filename =~ /\=\?UTF-8\?/ 464 || $filename =~ /^[\x00-\x7f]*$/) { 465 466 ## Don't encode elements such as .desc. or .url or .moderate 467 ## or .extension 468 my $part = $filename; 469 my ($leading, $trailing); 470 $leading = $1 if ($part =~ s/^(\.desc\.)//); ## leading .desc 471 $trailing = $1 if ($part =~ s/((\.\w+)+)$//); ## trailing .xx 472 473 my $encoded_part = MIME::EncWords::encode_mimewords( 474 $part, 475 Charset => 'utf8', 476 Encoding => 'q', 477 MaxLineLen => 1000, 478 Minimal => 'NO' 479 ); 480 481 $filename = $leading . $encoded_part . $trailing; 482 } 483 484 return $filename; 485} 486 487sub clip { 488 my $string = shift; 489 return undef unless @_; 490 my $length = shift; 491 492 my ($gcstr, $blen); 493 if (ref $string eq 'Unicode::GCString') { 494 $gcstr = $string; 495 $blen = length Encode::encode_utf8($string->as_string); 496 } elsif (Encode::is_utf8($string)) { 497 $gcstr = Unicode::GCString->new($string); 498 $blen = length Encode::encode_utf8($string); 499 } else { 500 $gcstr = Unicode::GCString->new(Encode::decode_utf8($string)); 501 $blen = length $string; 502 } 503 504 $length += $blen if $length < 0; 505 return '' if $length < 0; # out of range 506 return $string if $blen <= $length; 507 508 my $result = $gcstr->substr(0, _gc_length($gcstr, $length)); 509 510 if (ref $string eq 'Unicode::GCString') { 511 return $result; 512 } elsif (Encode::is_utf8($string)) { 513 return $result->as_string; 514 } else { 515 return Encode::encode_utf8($result->as_string); 516 } 517} 518 519sub _gc_length { 520 my $gcstr = shift; 521 my $length = shift; 522 523 return 0 unless $gcstr->length; 524 return 0 unless $length; 525 526 my ($shorter, $longer) = (0, $gcstr->length); 527 while ($shorter < $longer) { 528 my $cur = ($shorter + $longer + 1) >> 1; 529 my $elen = 530 length Encode::encode_utf8($gcstr->substr(0, $cur)->as_string); 531 if ($elen <= $length) { 532 $shorter = $cur; 533 } else { 534 $longer = $cur - 1; 535 } 536 } 537 538 return $shorter; 539} 540 541# Old name: tools::unescape_chars(). 542sub unescape_chars { 543 my $s = shift; 544 545 $s =~ s/%a5/\//g; ## Special traetment for '/' 546 foreach my $i (0x20 .. 0x2c, 0x3a .. 0x3f, 0x5b, 0x5d, 0x80 .. 0x9f, 547 0xa0 .. 0xff) { 548 my $hex_i = sprintf "%lx", $i; 549 my $hex_s = sprintf "%c", $i; 550 $s =~ s/%$hex_i/$hex_s/g; 551 } 552 553 return $s; 554} 555 556# Old name: tools::valid_email(). 557sub valid_email { 558 my $email = shift; 559 560 my $email_re = Sympa::Regexps::email(); 561 return undef unless $email =~ /^${email_re}$/; 562 563 # Forbidden characters. 564 return undef if $email =~ /[\|\$\*\?\!]/; 565 566 return 1; 567} 568 569sub weburl { 570 my $base = shift; 571 my $paths = shift; 572 my %options = @_; 573 574 my @paths = map { 575 Sympa::Tools::Text::encode_uri( 576 (not defined $_) ? '' 577 : $options{decode_html} ? Sympa::Tools::Text::decode_html($_) 578 : $_ 579 ); 580 } @{$paths || []}; 581 582 my $qstring = _url_query_string( 583 $options{query}, 584 decode_html => $options{decode_html}, 585 sepchar => '&', 586 ); 587 588 my $fstring; 589 my $fragment = $options{fragment}; 590 if (defined $fragment) { 591 $fstring = '#' 592 . Sympa::Tools::Text::encode_uri( 593 $options{decode_html} 594 ? Sympa::Tools::Text::decode_html($fragment) 595 : $fragment 596 ); 597 } else { 598 $fstring = ''; 599 } 600 601 return sprintf '%s%s%s', join('/', grep { defined $_ } ($base, @paths)), 602 $qstring, $fstring; 603} 604 6051; 606__END__ 607 608=encoding utf-8 609 610=head1 NAME 611 612Sympa::Tools::Text - Text-related functions 613 614=head1 DESCRIPTION 615 616This package provides some text-related functions. 617 618=head2 Functions 619 620=over 621 622=item addrencode ( $addr, [ $phrase, [ $charset, [ $comment ] ] ] ) 623 624Returns formatted (and encoded) name-addr as RFC5322 3.4. 625 626=item canonic_email ( $email ) 627 628I<Function>. 629Returns canonical form of e-mail address. 630 631Leading and trailing white spaces are removed. 632Latin letters without accents are lower-cased. 633 634For malformed inputs returns C<undef>. 635 636=item canonic_message_id ( $message_id ) 637 638Returns canonical form of message ID without trailing or leading whitespaces 639or C<E<lt>>, C<E<gt>>. 640 641=item canonic_text ( $text ) 642 643Canonicalizes text. 644C<$text> should be a binary string encoded by UTF-8 character set or 645a Unicode string. 646Forbidden sequences in binary string will be replaced by 647U+FFFD REPLACEMENT CHARACTERs, and Normalization Form C (NFC) will be applied. 648 649=item clip ( $string, $length ) 650 651I<Function>. 652Clips $string according to $length by bytes, 653considering boundary of grapheme clusters. 654UTF-8 is assumed for $string as bytestring. 655 656=item decode_filesystem_safe ( $str ) 657 658I<Function>. 659Decodes a string encoded by encode_filesystem_safe(). 660 661Parameter: 662 663=over 664 665=item $str 666 667String to be decoded. 668 669=back 670 671Returns: 672 673Decoded string, stripped C<utf8> flag if any. 674 675=item decode_html ( $str ) 676 677I<Function>. 678Decodes HTML entities in a string encoded by UTF-8 or a Unicode string. 679 680Parameter: 681 682=over 683 684=item $str 685 686String to be decoded. 687 688=back 689 690Returns: 691 692Decoded string, stripped C<utf8> flag if any. 693 694=item encode_filesystem_safe ( $str ) 695 696I<Function>. 697Encodes a string $str to be suitable for filesystem. 698 699Parameter: 700 701=over 702 703=item $str 704 705String to be encoded. 706 707=back 708 709Returns: 710 711Encoded string, stripped C<utf8> flag if any. 712All bytes except C<'-'>, C<'+'>, C<'.'>, C<'@'> 713and alphanumeric characters are encoded to sequences C<'_'> followed by 714two hexdigits. 715 716Note that C<'/'> will also be encoded. 717 718=item encode_html ( $str, [ $additional_unsafe ] ) 719 720I<Function>. 721Encodes characters in a string $str to HTML entities. 722By default 723C<'E<lt>'>, C<'E<gt>'>, C<'E<amp>'> and C<'E<quot>'> are encoded. 724 725Parameter: 726 727=over 728 729=item $str 730 731String to be encoded. 732 733=item $additional_unsafe 734 735Character or range of characters additionally encoded as entity references. 736 737This optional parameter was introduced on Sympa 6.2.37b.3. 738 739=back 740 741Returns: 742 743Encoded string, I<not> stripping utf8 flag if any. 744 745=item encode_uri ( $str, [ omit => $chars ] ) 746 747I<Function>. 748Encodes potentially unsafe characters in the string using "percent" encoding 749suitable for URIs. 750 751Parameters: 752 753=over 754 755=item $str 756 757String to be encoded. 758 759=item omit =E<gt> $chars 760 761By default, all characters except those defined as "unreserved" in RFC 3986 762are encoded, that is, C<[^-A-Za-z0-9._~]>. 763If this parameter is given, it will prevent encoding additional characters. 764 765=back 766 767Returns: 768 769Encoded string, stripped C<utf8> flag if any. 770 771=item escape_chars ( $str ) 772 773Escape weird characters. 774 775ToDo: This should be obsoleted in the future release: Would be better to use 776L</encode_filesystem_safe>. 777 778=item escape_url ( $str ) 779 780DEPRECATED. 781Would be better to use L</"encode_uri"> or L</"mailtourl">. 782 783=item foldcase ( $str ) 784 785I<Function>. 786Returns "fold-case" string suitable for case-insensitive match. 787For example, a code below looks for a needle in haystack not regarding case, 788even if they are non-ASCII UTF-8 strings. 789 790 $haystack = Sympa::Tools::Text::foldcase($HayStack); 791 $needle = Sympa::Tools::Text::foldcase($NeedLe); 792 if (index $haystack, $needle >= 0) { 793 ... 794 } 795 796Parameter: 797 798=over 799 800=item $str 801 802A string. 803 804=back 805 806=item guessed_to_utf8( $text, [ lang, ... ] ) 807 808I<Function>. 809Guesses text charset considering language context 810and returns the text reencoded by UTF-8. 811 812Parameters: 813 814=over 815 816=item $text 817 818Text to be reencoded. 819 820=item lang, ... 821 822Language tag(s) which may be given by L<Sympa::Language/"implicated_langs">. 823 824=back 825 826Returns: 827 828Reencoded text. 829If any charsets could not be guessed, C<iso-8859-1> will be used 830as the last resort, just because it covers full range of 8-bit. 831 832=item mailtourl ( $email, [ decode_html =E<gt> 1 ], 833[ query =E<gt> {key =E<gt> val, ...} ] ) 834 835I<Function>. 836Constructs a C<mailto:> URL for given e-mail. 837 838Parameters: 839 840=over 841 842=item $email 843 844E-mail address. 845 846=item decode_html =E<gt> 1 847 848If set, arguments are assumed to include HTML entities. 849 850=item query =E<gt> {key =E<gt> val, ...} 851 852Optional query. 853 854=back 855 856Returns: 857 858Constructed URL. 859 860=item pad ( $str, $width ) 861 862Pads space a string so that result will not be narrower than given width. 863 864Parameters: 865 866=over 867 868=item $str 869 870A string. 871 872=item $width 873 874If $width is false value or width of $str is not less than $width, 875does nothing. 876If $width is less than C<0>, pads right. 877Otherwise, pads left. 878 879=back 880 881Returns: 882 883Padded string. 884 885=item qdecode_filename ( $filename ) 886 887Q-Decodes web file name. 888 889ToDo: 890This should be obsoleted in the future release: Would be better to use 891L</decode_filesystem_safe>. 892 893=item qencode_filename ( $filename ) 894 895Q-Encodes web file name. 896 897ToDo: 898This should be obsoleted in the future release: Would be better to use 899L</encode_filesystem_safe>. 900 901=item slurp ( $file ) 902 903Get entire content of the file. 904Normalization by canonic_text() is applied. 905C<$file> is the path to text file. 906 907=item unescape_chars ( $str ) 908 909Unescape weird characters. 910 911ToDo: This should be obsoleted in the future release: Would be better to use 912L</decode_filesystem_safe>. 913 914=item valid_email ( $string ) 915 916Basic check of an email address. 917 918=item weburl ( $base, \@paths, [ decode_html =E<gt> 1 ], 919[ fragment =E<gt> $fragment ], [ query =E<gt> \%query ] ) 920 921Constructs a C<http:> or C<https:> URL under given base URI. 922 923Parameters: 924 925=over 926 927=item $base 928 929Base URI. 930 931=item \@paths 932 933Additional path components. 934 935=item decode_html =E<gt> 1 936 937If set, arguments are assumed to include HTML entities. 938Exception is $base: 939It is assumed not to include entities. 940 941=item fragment =E<gt> $fragment 942 943Optional fragment. 944 945=item query =E<gt> \%query 946 947Optional query. 948 949=back 950 951Returns: 952 953A URI. 954 955=item wrap_text ( $text, [ $init_tab, [ $subsequent_tab, [ $cols ] ] ] ) 956 957I<Function>. 958Returns line-wrapped text. 959 960Parameters: 961 962=over 963 964=item $text 965 966The text to be folded. 967 968=item $init_tab 969 970Indentation prepended to the first line of paragraph. 971Default is C<''>, no indentation. 972 973=item $subsequent_tab 974 975Indentation prepended to each subsequent line of folded paragraph. 976Default is C<''>, no indentation. 977 978=item $cols 979 980Max number of columns of folded text. 981Default is C<78>. 982 983=back 984 985=back 986 987=head1 HISTORY 988 989L<Sympa::Tools::Text> appeared on Sympa 6.2a.41. 990 991decode_filesystem_safe() and encode_filesystem_safe() were added 992on Sympa 6.2.10. 993 994decode_html(), encode_html(), encode_uri() and mailtourl() 995were added on Sympa 6.2.14, and escape_url() was deprecated. 996 997guessed_to_utf8() and pad() were added on Sympa 6.2.17. 998 999canonic_text() and slurp() were added on Sympa 6.2.53b. 1000 1001clip() was added on Sympa 6.2.61b. 1002 1003=cut 1004