1# Locale::Po4a::Po -- manipulation of PO files
2#
3# This program is free software; you may redistribute it and/or modify it
4# under the terms of GPL (see COPYING).
5
6############################################################################
7# Modules and declarations
8############################################################################
9
10=encoding UTF-8
11
12=head1 NAME
13
14Locale::Po4a::Po - PO file manipulation module
15
16=head1 SYNOPSIS
17
18    use Locale::Po4a::Po;
19    my $pofile=Locale::Po4a::Po->new();
20
21    # Read PO file
22    $pofile->read('file.po');
23
24    # Add an entry
25    $pofile->push('msgid' => 'Hello', 'msgstr' => 'bonjour',
26                  'flags' => "wrap", 'reference'=>'file.c:46');
27
28    # Extract a translation
29    $pofile->gettext("Hello"); # returns 'bonjour'
30
31    # Write back to a file
32    $pofile->write('otherfile.po');
33
34=head1 DESCRIPTION
35
36Locale::Po4a::Po is a module that allows you to manipulate message
37catalogs. You can load and write from/to a file (which extension is often
38I<po>), you can build new entries on the fly or request for the translation
39of a string.
40
41For a more complete description of message catalogs in the PO format and
42their use, please refer to the info documentation of the gettext program (node "`PO Files"').
43
44This module is part of the po4a project, which objective is to use PO files
45(designed at origin to ease the translation of program messages) to
46translate everything, including documentation (man page, info manual),
47package description, debconf templates, and everything which may benefit
48from this.
49
50=head1 OPTIONS ACCEPTED BY THIS MODULE
51
52=over 4
53
54=item B<--porefs> I<type>
55
56Specify the reference format. Argument I<type> can be one of B<never>
57to not produce any reference, B<file> to only specify the file
58without the line number, B<counter> to replace line number by an
59increasing counter, and B<full> to include complete references (default: full).
60
61=item B<--wrap-po> B<no>|B<newlines>|I<number> (default: 76)
62
63Specify how the po file should be wrapped. This gives the choice between either
64files that are nicely wrapped but could lead to git conflicts, or files that are
65easier to handle automatically, but harder to read for humans.
66
67Historically, the gettext suite has reformatted the po files at the 77th column
68for cosmetics. This option specifies the behavior of po4a. If set to a numerical
69value, po4a will wrap the po file after this column and after newlines in the
70content. If set to B<newlines>, po4a will only split the msgid and msgstr after
71newlines in the content. If set to B<no>, po4a will not wrap the po file at all.
72The reference comments are always wrapped by the gettext tools that we use internally.
73
74Note that this option has no impact on how the msgid and msgstr are wrapped, ie
75on how newlines are added to the content of these strings.
76
77=item B<--msgid-bugs-address> I<email@address>
78
79Set the report address for msgid bugs. By default, the created POT files
80have no Report-Msgid-Bugs-To fields.
81
82=item B<--copyright-holder> I<string>
83
84Set the copyright holder in the POT header. The default value is
85"Free Software Foundation, Inc."
86
87=item B<--package-name> I<string>
88
89Set the package name for the POT header. The default is "PACKAGE".
90
91=item B<--package-version> I<string>
92
93Set the package version for the POT header. The default is "VERSION".
94
95=back
96
97=cut
98
99use IO::File;
100
101require Exporter;
102
103package Locale::Po4a::Po;
104use DynaLoader;
105
106use Locale::Po4a::Common qw(wrap_msg wrap_mod wrap_ref_mod dgettext);
107
108use subs qw(makespace);
109use vars qw(@ISA @EXPORT_OK);
110@ISA       = qw(Exporter DynaLoader);
111@EXPORT    = qw(%debug);
112@EXPORT_OK = qw(&move_po_if_needed);
113
114use Locale::Po4a::TransTractor;
115
116# Try to use a C extension if present.
117eval("bootstrap Locale::Po4a::Po $Locale::Po4a::TransTractor::VERSION");
118
119use 5.006;
120use strict;
121use warnings;
122
123use Carp qw(croak);
124use File::Basename;
125use File::Path;    # mkdir before write
126use File::Copy;    # move
127use POSIX qw(strftime floor);
128use Time::Local;
129
130use Encode;
131use Config;
132
133my @known_flags = qw(
134  wrap no-wrap fuzzy
135  c-format no-c-format
136  objc-format no-objc-format
137  sh-format no-sh-format
138  python-format no-python-format
139  python-brace-format no-python-brace-format
140  lisp-format no-lisp-format
141  elisp-format no-elisp-format
142  librep-format no-librep-format
143  scheme-format no-scheme-format
144  smalltalk-format no-smalltalk-format
145  java-format no-java-format
146  csharp-format no-csharp-format
147  awk-format no-awk-format
148  object-pascal-format no-object-pascal-format
149  ycp-format no-ycp-format
150  tcl-format no-tcl-format
151  perl-format no-perl-format
152  perl-brace-format no-perl-brace-format
153  php-format no-php-format
154  gcc-internal-format no-gcc-internal-format
155  gfc-internal-format no-gfc-internal-format
156  qt-format no-qt-format
157  qt-plural-format no-qt-plural-format
158  kde-format no-kde-format
159  boost-format no-boost-format
160  lua-format no-lua-format
161  javascript-format no-javascript-format
162);
163
164# Custom flags, used for example by weblate
165push @known_flags, 'markdown-text';
166
167our %debug = (
168    'canonize' => 0,
169    'quote'    => 0,
170    'escape'   => 0,
171    'encoding' => 0,
172    'filter'   => 0
173);
174
175=head1 Functions concerning entire message catalogs
176
177=over 4
178
179=item new()
180
181Creates a new message catalog. If an argument is provided, it's the name of
182a PO file we should load.
183
184=cut
185
186sub new {
187    my ( $this, $options ) = ( shift, shift );
188    my $class = ref($this) || $this;
189    my $self  = {};
190    bless $self, $class;
191    $self->initialize($options);
192
193    my $filename = shift;
194    $self->read($filename) if length($filename);
195    return $self;
196}
197
198# Return the numerical timezone (e.g. +0200)
199# Neither the %z nor the %s formats of strftime are portable:
200# '%s' is not supported on Solaris and '%z' indicates
201# "2006-10-25 19:36E. Europe Standard Time" on MS Windows.
202sub timezone {
203    my ($time) = @_;
204    my @l = localtime($time);
205
206    my $diff = floor( timegm(@l) / 60 + 0.5 ) - floor( $time / 60 + 0.5 );
207    my $sign = ( $diff >= 0 ? 1 : -1 );
208    $diff = abs($diff);
209
210    my $h = $sign * floor( $diff / 60 );
211    my $m = $diff % 60;
212
213    return sprintf "%+03d%02d\n", $h, $m;
214}
215
216sub initialize {
217    my ( $self, $options ) = ( shift, shift );
218    my $time = time;
219    my $date = strftime( "%Y-%m-%d %H:%M", localtime($time) ) . timezone($time);
220    chomp $date;
221
222    $self->{options}{'porefs'}             = 'full';
223    $self->{options}{'msgid-bugs-address'} = undef;
224    $self->{options}{'copyright-holder'}   = "Free Software Foundation, Inc.";
225    $self->{options}{'package-name'}       = "PACKAGE";
226    $self->{options}{'package-version'}    = "VERSION";
227    $self->{options}{'wrap-po'}            = 76;
228    $self->{options}{'pot-charset'}        = "UTF-8";
229    $self->{options}{'pot-language'}       = "";
230
231    foreach my $opt ( keys %$options ) {
232
233        #        print STDERR "$opt: ".(defined($options->{$opt})?$options->{$opt}:"(undef)")."\n";
234        if ( $options->{$opt} ) {
235            die wrap_mod( "po4a::po", dgettext( "po4a", "Unknown option: %s" ), $opt )
236              unless exists $self->{options}{$opt};
237            $self->{options}{$opt} = $options->{$opt};
238        }
239    }
240    $self->{options}{'wrap-po'} =~ /^(no|newlines|\d+)$/
241      || die wrap_mod(
242        "po4a::po",
243        dgettext( "po4a", "Invalid value for option 'wrap-po' ('%s' is not 'no' nor 'newlines' nor a number)" ),
244        $self->{options}{'wrap-po'}
245      );
246
247    $self->{options}{'porefs'} =~ /^(full|counter|noline|file|none|never)?$/
248      || die wrap_mod(
249        "po4a::po",
250        dgettext(
251            "po4a",
252            "Invalid value for option 'porefs' ('%s' is "
253              . "not one of 'full', 'counter', 'noline', 'file' or 'never')"
254        ),
255        $self->{options}{'porefs'}
256      );
257    $self->{options}{'porefs'} =~ s/noline/file/;    # backward compat. 'file' used to be called 'noline'.
258    $self->{options}{'porefs'} =~ s/none/never/;     # backward compat. 'never' used to be called 'none'.
259    if ( $self->{options}{'porefs'} =~ m/^counter/ ) {
260        $self->{counter} = {};
261    }
262
263    $self->{po}        = ();
264    $self->{count}     = 0;                          # number of msgids in the PO
265                                                     # count_doc: number of strings in the document
266                                                     # (duplicate strings counted multiple times)
267    $self->{count_doc} = 0;
268    $self->{header_comment} =
269        " SOME DESCRIPTIVE TITLE\n"
270      . " Copyright (C) YEAR "
271      . $self->{options}{'copyright-holder'} . "\n"
272      . " This file is distributed under the same license "
273      . "as the "
274      . $self->{options}{'package-name'}
275      . " package.\n"
276      . " FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.\n" . "\n"
277      . ", fuzzy";
278
279    #    $self->header_tag="fuzzy";
280    $self->{header} = escape_text(
281            "Project-Id-Version: "
282          . $self->{options}{'package-name'} . " "
283          . $self->{options}{'package-version'} . "\n"
284          . (
285            ( defined $self->{options}{'msgid-bugs-address'} )
286            ? "Report-Msgid-Bugs-To: " . $self->{options}{'msgid-bugs-address'} . "\n"
287            : ""
288          )
289          . "POT-Creation-Date: $date\n"
290          . "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
291          . "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\n"
292          . "Language-Team: LANGUAGE <LL\@li.org>\n"
293          . "Language: "
294          . $self->{options}{'pot-language'} . "\n"
295          . "MIME-Version: 1.0\n"
296          . "Content-Type: text/plain; charset="
297          . $self->{options}{'pot-charset'} . "\n"
298          . "Content-Transfer-Encoding: 8bit\n"
299    );
300
301    $self->{encoder} = find_encoding("UTF-8");
302    $self->{footer}  = [];
303
304    # To make stats about gettext hits
305    $self->stats_clear();
306}
307
308=item read($)
309
310Reads a PO file (which name is given as argument).  Previously existing
311entries in self are not removed, the new ones are added to the end of the
312catalog.
313
314=cut
315
316sub read {
317    my $self     = shift;
318    my $filename = shift
319      or croak wrap_mod( "po4a::po", dgettext( "po4a", "Please provide a non-null filename" ) );
320
321    my $lang = basename($filename);
322    $lang =~ s/\.po$//;
323    $self->{lang} = $lang;
324
325    my $cmd = "msgfmt" . $Config{_exe} . " --check-format --check-domain -o /dev/null " . $filename;
326
327    my $locale = $ENV{'LC_ALL'};
328    $ENV{'LC_ALL'} = "C";
329    my $out = qx/$cmd 2>&1/;
330    $ENV{'LC_ALL'} = $locale;
331
332    die wrap_msg( dgettext( "po4a", "Invalid po file %s:\n%s" ), $filename, $out )
333      unless ( $? == 0 );
334
335    my $fh;
336    if ( $filename eq '-' ) {
337        $fh = *STDIN;
338    } else {
339        open $fh, "<$filename"
340          or croak wrap_mod( "po4a::po", dgettext( "po4a", "Cannot read from %s: %s" ), $filename, $! );
341    }
342
343    ## Read paragraphs line-by-line
344    my $pofile = "";
345    my $textline;
346    while ( defined( $textline = <$fh> ) ) {
347        $pofile .= $textline;
348    }
349
350    #    close INPUT
351    #        or croak (sprintf(dgettext("po4a",
352    #                                   "Cannot close %s after reading: %s"),
353    #                          $filename,$!)."\n");
354
355    my $linenum = 0;
356
357    foreach my $msg ( split( /\n\n/, $pofile ) ) {
358        my ( $msgid, $msgstr, $comment, $previous, $automatic, $reference, $flags, $buffer );
359        my ( $msgid_plural, $msgstr_plural );
360        if ( $msg =~ m/^#~/m ) {
361            push( @{ $self->{footer} }, $msg );
362            next;
363        }
364        foreach my $line ( split( /\n/, $msg ) ) {
365            $linenum++;
366            if ( $line =~ /^#\. ?(.*)$/ ) {    # Automatic comment
367                $automatic .= ( defined($automatic) ? "\n" : "" ) . $1;
368
369            } elsif ( $line =~ /^#: ?(.*)$/ ) {    # reference
370                $reference .= ( defined($reference) ? "\n" : "" ) . $1;
371
372            } elsif ( $line =~ /^#, ?(.*)$/ ) {    # flags
373                $flags .= ( defined($flags) ? "\n" : "" ) . $1;
374
375            } elsif ( $line =~ /^#\| ?(.*)$/ ) {    # previous translation
376                $previous .= ( defined($previous) ? "\n" : "" ) . ( $1 || "" );
377
378            } elsif ( $line =~ /^#(.*)$/ ) {        # Translator comments
379                $comment .= ( defined($comment) ? "\n" : "" ) . ( $1 || "" );
380
381            } elsif ( $line =~ /^msgid (".*")$/ ) {    # begin of msgid
382                $buffer = $1;
383
384            } elsif ( $line =~ /^msgid_plural (".*")$/ ) {
385
386                # begin of msgid_plural, end of msgid
387
388                $msgid  = $buffer;
389                $buffer = $1;
390
391            } elsif ( $line =~ /^msgstr (".*")$/ ) {
392
393                # begin of msgstr, end of msgid
394
395                $msgid  = $buffer;
396                $buffer = "$1";
397
398            } elsif ( $line =~ /^msgstr\[([0-9]+)\] (".*")$/ ) {
399
400                # begin of msgstr[x], end of msgid_plural or msgstr[x-1]
401
402                # Note: po4a cannot uses plural forms
403                # (no integer to use the plural form)
404                #   * drop the msgstr[x] where x >= 2
405                #   * use msgstr[0] as the translation of msgid
406                #   * use msgstr[1] as the translation of msgid_plural
407
408                if ( $1 eq "0" ) {
409                    $msgid_plural = $buffer;
410                    $buffer       = "$2";
411                } elsif ( $1 eq "1" ) {
412                    $msgstr = $buffer;
413                    $buffer = "$2";
414                } elsif ( $1 eq "2" ) {
415                    $msgstr_plural = $buffer;
416                    warn wrap_ref_mod( "$filename:$linenum", "po4a::po",
417                        dgettext( "po4a", "Messages with more than 2 plural forms are not supported." ) );
418                }
419            } elsif ( $line =~ /^(".*")$/ ) {
420
421                # continuation of a line
422                $buffer .= "\n$1";
423
424            } else {
425                warn wrap_ref_mod( "$filename:$linenum", "po4a::po", dgettext( "po4a", "Parse error at: -->%s<--" ),
426                    $line );
427            }
428        }
429        $linenum++;
430        if ( defined $msgid_plural ) {
431            $msgstr_plural = $buffer;
432
433            $msgid  = unquote_text($msgid)  if ( defined($msgid) );
434            $msgstr = unquote_text($msgstr) if ( defined($msgstr) );
435
436            $self->push_raw(
437                'msgid'     => $msgid,
438                'msgstr'    => $msgstr,
439                'reference' => $reference,
440                'flags'     => $flags,
441                'comment'   => $comment,
442                'previous'  => $previous,
443                'automatic' => $automatic,
444                'plural'    => 0
445            );
446
447            $msgid_plural = unquote_text($msgid_plural)
448              if ( defined($msgid_plural) );
449            $msgstr_plural = unquote_text($msgstr_plural)
450              if ( defined($msgstr_plural) );
451
452            $self->push_raw(
453                'msgid'     => $msgid_plural,
454                'msgstr'    => $msgstr_plural,
455                'reference' => $reference,
456                'flags'     => $flags,
457                'comment'   => $comment,
458                'previous'  => $previous,
459                'automatic' => $automatic,
460                'plural'    => 1
461            );
462        } else {
463            $msgstr = $buffer;
464
465            $msgid  = unquote_text($msgid)  if ( defined($msgid) );
466            $msgstr = unquote_text($msgstr) if ( defined($msgstr) );
467
468            $self->push_raw(
469                'msgid'     => $msgid,
470                'msgstr'    => $msgstr,
471                'reference' => $reference,
472                'flags'     => $flags,
473                'comment'   => $comment,
474                'previous'  => $previous,
475                'automatic' => $automatic
476            );
477        }
478    }
479}
480
481=item write($)
482
483Writes the current catalog to the given file.
484
485=cut
486
487sub write {
488    my $self     = shift;
489    my $filename = shift
490      or croak dgettext( "po4a", "Cannot write to a file without filename" ) . "\n";
491
492    my $fh;
493    if ( $filename eq '-' ) {
494        $fh = \*STDOUT;
495    } else {
496
497        # make sure the directory in which we should write the localized
498        # file exists
499        my $dir = $filename;
500        if ( $dir =~ m|/| ) {
501            $dir =~ s|/[^/]*$||;
502
503            File::Path::mkpath( $dir, 0, 0755 )    # Croaks on error
504              if ( length($dir) && !-e $dir );
505        }
506        open $fh, ">$filename"
507          or croak wrap_mod( "po4a::po", dgettext( "po4a", "Cannot write to %s: %s" ), $filename, $! );
508    }
509
510    print $fh "" . format_comment( $self->{header_comment}, "" )
511      if length( $self->{header_comment} );
512
513    print $fh "msgid \"\"\n";
514    print $fh "msgstr " . quote_text( $self->{header}, $self->{options}{'wrap-po'} ) . "\n\n";
515
516    my $buf_msgstr_plural;    # Used to keep the first msgstr of plural forms
517    my $first = 1;
518    foreach my $msgid ( sort { ( $self->{po}{"$a"}{'pos'} ) <=> ( $self->{po}{"$b"}{'pos'} ) } keys %{ $self->{po} } ) {
519        my $output = "";
520
521        if ($first) {
522            $first = 0;
523        } else {
524            $output .= "\n";
525        }
526
527        $output .= format_comment( $self->{po}{$msgid}{'comment'}, "" )
528          if length( $self->{po}{$msgid}{'comment'} );
529        if ( length( $self->{po}{$msgid}{'automatic'} ) ) {
530            foreach my $comment ( split( /\\n/, $self->{po}{$msgid}{'automatic'} ) ) {
531                $output .= format_comment( $comment, ". " );
532            }
533        }
534        $output .= format_comment( $self->{po}{$msgid}{'type'}, ". type: " )
535          if length( $self->{po}{$msgid}{'type'} );
536
537        if ( length( $self->{po}{$msgid}{'reference'} ) ) {
538            my $output_ref = wrap( $self->{po}{$msgid}{'reference'} );
539            $output_ref =~ s/\s+$//mg;
540            $output .= format_comment( $output_ref, ": " );
541        }
542        $output .= "#, " . join( ", ", sort split( /\s+/, $self->{po}{$msgid}{'flags'} ) ) . "\n"
543          if length( $self->{po}{$msgid}{'flags'} );
544        $output .= format_comment( $self->{po}{$msgid}{'previous'}, "| " )
545          if length( $self->{po}{$msgid}{'previous'} );
546
547        if ( exists $self->{po}{$msgid}{'plural'} ) {
548            if ( $self->{po}{$msgid}{'plural'} == 0 ) {
549                if ( $self->get_charset =~ /^utf-8$/i ) {
550                    my $msgstr = Encode::decode_utf8( $self->{po}{$msgid}{'msgstr'} );
551                    $msgid = Encode::decode_utf8($msgid);
552                    $output .=
553                      Encode::encode_utf8( "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n" );
554                    $buf_msgstr_plural =
555                      Encode::encode_utf8( "msgstr[0] " . quote_text( $msgstr, $self->{options}{'wrap-po'} ) . "\n" );
556                } else {
557                    $output = "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n";
558                    $buf_msgstr_plural =
559                      "msgstr[0] " . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ) . "\n";
560                }
561            } elsif ( $self->{po}{$msgid}{'plural'} == 1 ) {
562
563                # TODO: there may be only one plural form
564                if ( $self->get_charset =~ /^utf-8$/i ) {
565                    my $msgstr = Encode::decode_utf8( $self->{po}{$msgid}{'msgstr'} );
566                    $msgid = Encode::decode_utf8($msgid);
567                    $output =
568                      Encode::encode_utf8( "msgid_plural " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n" );
569                    $output .= $buf_msgstr_plural;
570                    $output .=
571                      Encode::encode_utf8( "msgstr[1] " . quote_text( $msgstr, $self->{options}{'wrap-po'} ) . "\n" );
572                    $buf_msgstr_plural = "";
573                } else {
574                    $output = "msgid_plural " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n";
575                    $output .= $buf_msgstr_plural;
576                    $output .=
577                      "msgstr[1] " . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ) . "\n";
578                }
579            } else {
580                die wrap_msg( dgettext( "po4a", "Cannot write PO files with more than two plural forms." ) );
581            }
582        } else {
583            if ( $self->get_charset =~ /^utf-8$/i ) {
584                my $msgstr = Encode::decode_utf8( $self->{po}{$msgid}{'msgstr'} );
585                $msgid = Encode::decode_utf8($msgid);
586                $output .= Encode::encode_utf8( "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n" );
587                $output .= Encode::encode_utf8( "msgstr " . quote_text( $msgstr, $self->{options}{'wrap-po'} ) . "\n" );
588            } else {
589                $output .= "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n";
590                $output .= "msgstr " . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ) . "\n";
591            }
592        }
593
594        print $fh $output;
595    }
596    print $fh join( "\n\n", @{ $self->{footer} } ) if scalar @{ $self->{footer} };
597
598    #    print STDERR "$fh";
599    #    if ($filename ne '-') {
600    #        close $fh
601    #            or croak (sprintf(dgettext("po4a",
602    #                                       "Cannot close %s after writing: %s\n"),
603    #                              $filename,$!));
604    #    }
605}
606
607=item write_if_needed($$)
608
609Like write, but if the PO or POT file already exists, the object will be
610written in a temporary file which will be compared with the existing file
611to check if the update is needed (this avoids to change a POT just to
612update a line reference or the POT-Creation-Date field).
613
614=cut
615
616sub move_po_if_needed {
617    my ( $new_po, $old_po, $backup ) = ( shift, shift, shift );
618    my $diff;
619
620    if ( -e $old_po ) {
621        $diff = qx(diff -q -I'^#:' -I'^\"POT-Creation-Date:' -I'^\"PO-Revision-Date:' $old_po $new_po);
622        if ( $diff eq "" ) {
623            unlink $new_po
624              or die wrap_msg( dgettext( "po4a", "Cannot unlink %s: %s." ), $new_po, $! );
625
626            # touch the old PO
627            my ( $atime, $mtime ) = ( time, time );
628            utime $atime, $mtime, $old_po;
629        } else {
630            move $new_po, $old_po
631              or die wrap_msg( dgettext( "po4a", "Cannot move %s to %s: %s." ), $new_po, $old_po, $! );
632        }
633    } else {
634        move $new_po, $old_po
635          or die wrap_msg( dgettext( "po4a", "Cannot move %s to %s: %s." ), $new_po, $old_po, $! );
636    }
637}
638
639sub write_if_needed {
640    my $self     = shift;
641    my $filename = shift
642      or croak dgettext( "po4a", "Cannot write to a file without filename" ) . "\n";
643
644    if ( -e $filename ) {
645        my ($tmp_filename);
646        my $basename = basename($filename);
647        ( undef, $tmp_filename ) = File::Temp::tempfile(
648            $basename . "XXXX",
649            DIR    => $ENV{TMPDIR} || "/tmp",
650            OPEN   => 0,
651            UNLINK => 0
652        );
653        $self->write($tmp_filename);
654        move_po_if_needed( $tmp_filename, $filename );
655    } else {
656        $self->write($filename);
657    }
658}
659
660=item gettextize($$)
661
662This function produces one translated message catalog from two catalogs, an
663original and a translation. This process is described in L<po4a(7)|po4a.7>,
664section I<Gettextization: how does it work?>.
665
666=cut
667
668sub gettextize {
669    my $this  = shift;
670    my $class = ref($this) || $this;
671    my ( $poorig, $potrans ) = ( shift, shift );
672
673    my $pores = Locale::Po4a::Po->new();
674
675    my $please_fail = 0;
676    my $toobad      = dgettext( "po4a",
677            "\nThe gettextization failed (once again). Don't give up, "
678          . "gettextizing is a subtle art, but this is only needed once "
679          . "to convert a project to the gorgeous luxus offered by po4a "
680          . "to translators."
681          . "\nPlease refer to the po4a(7) documentation, the section "
682          . "\"HOWTO convert a pre-existing translation to po4a?\" "
683          . "contains several hints to help you in your task" );
684
685    # Don't fail right now when the entry count does not match. Instead, give
686    # it a try so that the user can see where we fail (which is probably where
687    # the problem is).
688    if ( $poorig->count_entries_doc() > $potrans->count_entries_doc() ) {
689        warn wrap_mod(
690            "po4a gettextize",
691            dgettext(
692                "po4a",
693                "Original has more strings than the translation (%d>%d). "
694                  . "Please fix it by editing the translated version to add "
695                  . "some dummy entry."
696            ),
697            $poorig->count_entries_doc(),
698            $potrans->count_entries_doc()
699        );
700        $please_fail = 1;
701    } elsif ( $poorig->count_entries_doc() < $potrans->count_entries_doc() ) {
702        warn wrap_mod(
703            "po4a gettextize",
704            dgettext(
705                "po4a",
706                "Original has less strings than the translation (%d<%d). "
707                  . "Please fix it by removing the extra entry from the "
708                  . "translated file. You may need an addendum (cf po4a(7)) "
709                  . "to reput the chunk in place after gettextization. A "
710                  . "possible cause is that a text duplicated in the original "
711                  . "is not translated the same way each time. Remove one of "
712                  . "the translations, and you're fine."
713            ),
714            $poorig->count_entries_doc(),
715            $potrans->count_entries_doc()
716        );
717        $please_fail = 1;
718    }
719
720    if ( $poorig->get_charset =~ /^utf-8$/i ) {
721        $potrans->to_utf8;
722        $pores->set_charset("UTF-8");
723    } else {
724        my $charset = $potrans->get_charset();
725        $charset = "UTF-8" if $charset eq "CHARSET";
726        $pores->set_charset($charset);
727    }
728    print "Po character sets:\n"
729      . "  original="
730      . $poorig->get_charset . "\n"
731      . "  translated="
732      . $potrans->get_charset . "\n"
733      . "  result="
734      . $pores->get_charset . "\n"
735      if $debug{'encoding'};
736
737    for (
738        my ( $o, $t ) = ( 0, 0 ) ;
739        $o < $poorig->count_entries_doc() && $t < $potrans->count_entries_doc() ;
740        $o++, $t++
741      )
742    {
743        #
744        # Extract some informations
745
746        my ( $orig, $trans ) = ( $poorig->msgid_doc($o), $potrans->msgid_doc($t) );
747
748        #       print STDERR "Matches [[$orig]]<<$trans>>\n";
749
750        my ( $reforig,  $reftrans )  = ( $poorig->{po}{$orig}{'reference'}, $potrans->{po}{$trans}{'reference'} );
751        my ( $typeorig, $typetrans ) = ( $poorig->{po}{$orig}{'type'},      $potrans->{po}{$trans}{'type'} );
752
753        #
754        # Make sure the type of both string exist
755        #
756        die wrap_mod( "po4a gettextize", "Internal error: type of original string number %s " . "isn't provided", $o )
757          if ( $typeorig eq '' );
758
759        die wrap_mod( "po4a gettextize", "Internal error: type of translated string number %s " . "isn't provided", $o )
760          if ( $typetrans eq '' );
761
762        #
763        # Make sure both type are the same
764        #
765        if ( $typeorig ne $typetrans ) {
766            $pores->write("gettextization.failed.po");
767            eval {
768                # Recode $trans into current charset, if possible
769                require I18N::Langinfo;
770                I18N::Langinfo->import(qw(langinfo CODESET));
771                my $codeset = langinfo( CODESET() );
772                Encode::from_to( $trans, $potrans->get_charset, $codeset );
773            };
774            die wrap_msg(
775                dgettext( "po4a",
776                        "po4a gettextization: Structure disparity between "
777                      . "original and translated files:\n"
778                      . "msgid (at %s) is of type '%s' while\n"
779                      . "msgstr (at %s) is of type '%s'.\n"
780                      . "Original text: %s\n"
781                      . "Translated text: %s\n"
782                      . "(result so far dumped to gettextization.failed.po)" )
783                  . "%s",
784                $reforig,
785                $typeorig,
786                $reftrans,
787                $typetrans,
788                $orig, $trans, $toobad
789            );
790        }
791
792        #
793        # Push the entry
794        #
795        my $flags;
796        if ( defined $poorig->{po}{$orig}{'flags'} ) {
797            $flags = $poorig->{po}{$orig}{'flags'} . " fuzzy";
798        } else {
799            $flags = "fuzzy";
800        }
801        $pores->push_raw(
802            'msgid'     => $orig,
803            'msgstr'    => $trans,
804            'flags'     => $flags,
805            'type'      => $typeorig,
806            'reference' => $reforig,
807            'conflict'  => 1,
808            'transref'  => $potrans->{po}{$trans}{'reference'}
809          )
810          unless ( defined( $pores->{po}{$orig} )
811            and ( $pores->{po}{$orig}{'msgstr'} eq $trans ) )
812
813          # FIXME: maybe we should be smarter about what reference should be
814          #        sent to push_raw.
815    }
816
817    # make sure we return a useful error message when entry count differ
818    die "$toobad\n" if $please_fail;
819
820    return $pores;
821}
822
823=item filter($)
824
825This function extracts a catalog from an existing one. Only the entries having
826a reference in the given file will be placed in the resulting catalog.
827
828This function parses its argument, converts it to a Perl function definition,
829evals this definition and filters the fields for which this function returns
830true.
831
832I love Perl sometimes ;)
833
834=cut
835
836sub filter {
837    my $self = shift;
838    our $filter = shift;
839
840    my $res;
841    $res = Locale::Po4a::Po->new();
842
843    # Parse the filter
844    our $code   = "sub apply { return ";
845    our $pos    = 0;
846    our $length = length $filter;
847
848    # explode chars to parts. How to subscript a string in Perl?
849    our @filter = split( //, $filter );
850
851    sub gloups {
852        my $fmt   = shift;
853        my $space = "";
854        for ( 1 .. $pos ) {
855            $space .= ' ';
856        }
857        die wrap_msg("$fmt\n$filter\n$space^ HERE");
858    }
859
860    sub showmethecode {
861        return unless $debug{'filter'};
862        my $fmt   = shift;
863        my $space = "";
864        for ( 1 .. $pos ) {
865            $space .= ' ';
866        }
867        print STDERR "$filter\n$space^ $fmt\n";    #"$code\n";
868    }
869
870    # I dream of a lex in perl :-/
871    sub parse_expression {
872        showmethecode("Begin expression")
873          if $debug{'filter'};
874
875        gloups( "Begin of expression expected, got '%s'", $filter[$pos] )
876          unless ( $filter[$pos] eq '(' );
877        $pos++;                                    # pass the '('
878        if ( $filter[$pos] eq '&' ) {
879
880            # AND
881            $pos++;
882            showmethecode("Begin of AND")
883              if $debug{'filter'};
884            $code .= "(";
885            while (1) {
886                gloups("Unfinished AND statement.")
887                  if ( $pos == $length );
888                parse_expression();
889                if ( $filter[$pos] eq '(' ) {
890                    $code .= " && ";
891                } elsif ( $filter[$pos] eq ')' ) {
892                    last;    # do not eat that char
893                } else {
894                    gloups( "End of AND or begin of sub-expression expected, got '%s'", $filter[$pos] );
895                }
896            }
897            $code .= ")";
898        } elsif ( $filter[$pos] eq '|' ) {
899
900            # OR
901            $pos++;
902            $code .= "(";
903            while (1) {
904                gloups("Unfinished OR statement.")
905                  if ( $pos == $length );
906                parse_expression();
907                if ( $filter[$pos] eq '(' ) {
908                    $code .= " || ";
909                } elsif ( $filter[$pos] eq ')' ) {
910                    last;    # do not eat that char
911                } else {
912                    gloups( "End of OR or begin of sub-expression expected, got '%s'", $filter[$pos] );
913                }
914            }
915            $code .= ")";
916        } elsif ( $filter[$pos] eq '!' ) {
917
918            # NOT
919            $pos++;
920            $code .= "(!";
921            gloups("Missing sub-expression in NOT statement.")
922              if ( $pos == $length );
923            parse_expression();
924            $code .= ")";
925        } else {
926
927            # must be an equal. Let's get field and argument
928            my ( $field, $arg, $done );
929            $field = substr( $filter, $pos );
930            gloups("EQ statement contains no '=' or invalid field name")
931              unless ( $field =~ /([a-z]*)=/i );
932            $field = lc($1);
933            $pos += ( length $field ) + 1;
934
935            # check that we've got a valid field name,
936            # and the number it referes to
937            # DO NOT CHANGE THE ORDER
938            my @names = qw(msgid msgstr reference flags comment previous automatic);
939            my $fieldpos;
940            for ( $fieldpos = 0 ; $fieldpos < scalar @names && $field ne $names[$fieldpos] ; $fieldpos++ ) { }
941            gloups( "Invalid field name: %s", $field )
942              if $fieldpos == scalar @names;    # not found
943
944            # Now, get the argument value. It has to be between quotes,
945            # which can be escaped
946            # We point right on the first char of the argument
947            # (first quote already eaten)
948            my $escaped = 0;
949            my $quoted  = 0;
950            if ( $filter[$pos] eq '"' ) {
951                $pos++;
952                $quoted = 1;
953            }
954            showmethecode( ( $quoted ? "Quoted" : "Unquoted" ) . " argument of field '$field'" )
955              if $debug{'filter'};
956
957            while ( !$done ) {
958                gloups("Unfinished EQ argument.")
959                  if ( $pos == $length );
960
961                if ($quoted) {
962                    if ( $filter[$pos] eq '\\' ) {
963                        if ($escaped) {
964                            $arg .= '\\';
965                            $escaped = 0;
966                        } else {
967                            $escaped = 1;
968                        }
969                    } elsif ($escaped) {
970                        if ( $filter[$pos] eq '"' ) {
971                            $arg .= '"';
972                            $escaped = 0;
973                        } else {
974                            gloups( "Invalid escape sequence in argument: '\\%s'", $filter[$pos] );
975                        }
976                    } else {
977                        if ( $filter[$pos] eq '"' ) {
978                            $done = 1;
979                        } else {
980                            $arg .= $filter[$pos];
981                        }
982                    }
983                } else {
984                    if ( $filter[$pos] eq ')' ) {
985
986                        # counter the next ++ since we don't want to eat
987                        # this char
988                        $pos--;
989                        $done = 1;
990                    } else {
991                        $arg .= $filter[$pos];
992                    }
993                }
994                $pos++;
995            }
996
997            # and now, add the code to check this equality
998            $code .= "(\$_[$fieldpos] =~ m{$arg})";
999
1000        }
1001        showmethecode("End of expression")
1002          if $debug{'filter'};
1003        gloups("Unfinished statement.")
1004          if ( $pos == $length );
1005        gloups( "End of expression expected, got '%s'", $filter[$pos] )
1006          unless ( $filter[$pos] eq ')' );
1007        $pos++;
1008    }
1009
1010    # And now, launch the beast, finish the function and use eval
1011    # to construct this function.
1012    # Ok, the lack of lexer is a fair price for the eval ;)
1013    parse_expression();
1014    gloups("Garbage at the end of the expression")
1015      if ( $pos != $length );
1016    $code .= "; }";
1017    print STDERR "CODE = $code\n"
1018      if $debug{'filter'};
1019    eval $code;
1020    die wrap_mod( "po4a::po", dgettext( "po4a", "Evaluating the provided filter failed: %s" ), $@ )
1021      if $@;
1022
1023    for ( my $cpt = (0) ; $cpt < $self->count_entries() ; $cpt++ ) {
1024
1025        my ( $msgid, $ref, $msgstr, $flags, $type, $comment, $previous, $automatic );
1026
1027        $msgid = $self->msgid($cpt);
1028        $ref   = $self->{po}{$msgid}{'reference'};
1029
1030        $msgstr    = $self->{po}{$msgid}{'msgstr'};
1031        $flags     = $self->{po}{$msgid}{'flags'};
1032        $type      = $self->{po}{$msgid}{'type'};
1033        $comment   = $self->{po}{$msgid}{'comment'};
1034        $previous  = $self->{po}{$msgid}{'previous'};
1035        $automatic = $self->{po}{$msgid}{'automatic'};
1036
1037        # DO NOT CHANGE THE ORDER
1038        $res->push_raw(
1039            'msgid'     => $msgid,
1040            'msgstr'    => $msgstr,
1041            'flags'     => $flags,
1042            'type'      => $type,
1043            'reference' => $ref,
1044            'comment'   => $comment,
1045            'previous'  => $previous,
1046            'automatic' => $automatic
1047        ) if ( apply( $msgid, $msgstr, $ref, $flags, $comment, $previous, $automatic ) );
1048    }
1049
1050    # delete the apply subroutine
1051    # otherwise it will be redefined.
1052    undef &apply;
1053    return $res;
1054}
1055
1056=item to_utf8()
1057
1058Recodes to UTF-8 the PO's msgstrs. Does nothing if the charset is not
1059specified in the PO file ("CHARSET" value), or if it's already UTF-8 or
1060ASCII.
1061
1062=cut
1063
1064sub to_utf8 {
1065    my $this    = shift;
1066    my $charset = $this->get_charset();
1067
1068    unless ( $charset eq "CHARSET"
1069        or $charset =~ /^ascii$/i
1070        or $charset =~ /^utf-8$/i )
1071    {
1072        foreach my $msgid ( keys %{ $this->{po} } ) {
1073            Encode::from_to( $this->{po}{$msgid}{'msgstr'}, $charset, "utf-8" );
1074        }
1075        $this->set_charset("UTF-8");
1076    }
1077}
1078
1079=back
1080
1081=head1 Functions to use a message catalog for translations
1082
1083=over 4
1084
1085=item gettext($%)
1086
1087Request the translation of the string given as argument in the current catalog.
1088The function returns the original (untranslated) string if the string was not
1089found.
1090
1091After the string to translate, you can pass a hash of extra
1092arguments. Here are the valid entries:
1093
1094=over
1095
1096=item B<wrap>
1097
1098boolean indicating whether we can consider that whitespaces in string are
1099not important. If yes, the function canonizes the string before looking for
1100a translation, and wraps the result.
1101
1102=item B<wrapcol>
1103
1104the column at which we should wrap (default: 76).
1105
1106=back
1107
1108=cut
1109
1110sub gettext {
1111    my $self  = shift;
1112    my $text  = shift;
1113    my (%opt) = @_;
1114    my $res;
1115
1116    return "" unless length($text);    # Avoid returning the header.
1117    my $validoption = "reference wrap wrapcol";
1118    my %validoption;
1119
1120    map { $validoption{$_} = 1 } ( split( / /, $validoption ) );
1121    foreach ( keys %opt ) {
1122        Carp::confess "internal error:  unknown arg $_.\n" . "Here are the valid options: $validoption.\n"
1123          unless $validoption{$_};
1124    }
1125
1126    $text = canonize($text)
1127      if ( $opt{'wrap'} );
1128
1129    my $esc_text = escape_text($text);
1130
1131    $self->{gettextqueries}++;
1132
1133    if (
1134            defined $self->{po}{$esc_text}
1135        and defined $self->{po}{$esc_text}{'msgstr'}
1136        and length $self->{po}{$esc_text}{'msgstr'}
1137        and ( not defined $self->{po}{$esc_text}{'flags'}
1138            or $self->{po}{$esc_text}{'flags'} !~ /fuzzy/ )
1139      )
1140    {
1141
1142        $self->{gettexthits}++;
1143        $res = unescape_text( $self->{po}{$esc_text}{'msgstr'} );
1144        if ( defined $self->{po}{$esc_text}{'plural'} ) {
1145            if ( $self->{po}{$esc_text}{'plural'} eq "0" ) {
1146                warn wrap_mod(
1147                    "po4a gettextize",
1148                    dgettext(
1149                        "po4a",
1150                        "'%s' is the singular form of a message, " . "po4a will use the msgstr[0] translation (%s)."
1151                    ),
1152                    $esc_text,
1153                    $res
1154                );
1155            } else {
1156                warn wrap_mod(
1157                    "po4a gettextize",
1158                    dgettext(
1159                        "po4a",
1160                        "'%s' is the plural form of a message, " . "po4a will use the msgstr[1] translation (%s)."
1161                    ),
1162                    $esc_text,
1163                    $res
1164                );
1165            }
1166        }
1167    } else {
1168        $res = $text;
1169    }
1170
1171    if ( $opt{'wrap'} ) {
1172        if ( $self->get_charset =~ /^utf-8$/i ) {
1173            $res = Encode::decode_utf8($res);
1174            $res = wrap( $res, $opt{'wrapcol'} || 76 );
1175            $res = Encode::encode_utf8($res);
1176        } else {
1177            $res = wrap( $res, $opt{'wrapcol'} || 76 );
1178        }
1179    }
1180
1181    #    print STDERR "Gettext >>>$text<<<(escaped=$esc_text)=[[[$res]]]\n\n";
1182    return $res;
1183}
1184
1185=item stats_get()
1186
1187Returns statistics about the hit ratio of gettext since the last time that
1188stats_clear() was called. Please note that it's not the same
1189statistics than the one printed by msgfmt --statistic. Here, it's statistics
1190about recent usage of the PO file, while msgfmt reports the status of the
1191file.  Example of use:
1192
1193    [some use of the PO file to translate stuff]
1194
1195    ($percent,$hit,$queries) = $pofile->stats_get();
1196    print "So far, we found translations for $percent\%  ($hit of $queries) of strings.\n";
1197
1198=cut
1199
1200sub stats_get() {
1201    my $self = shift;
1202    my ( $h, $q ) = ( $self->{gettexthits}, $self->{gettextqueries} );
1203    my $p = ( $q == 0 ? 100 : int( $h / $q * 10000 ) / 100 );
1204
1205    #    $p =~ s/\.00//;
1206    #    $p =~ s/(\..)0/$1/;
1207
1208    return ( $p, $h, $q );
1209}
1210
1211=item stats_clear()
1212
1213Clears the statistics about gettext hits.
1214
1215=cut
1216
1217sub stats_clear {
1218    my $self = shift;
1219    $self->{gettextqueries} = 0;
1220    $self->{gettexthits}    = 0;
1221}
1222
1223=back
1224
1225=head1 Functions to build a message catalog
1226
1227=over 4
1228
1229=item push(%)
1230
1231Push a new entry at the end of the current catalog. The arguments should
1232form a hash table. The valid keys are:
1233
1234=over 4
1235
1236=item B<msgid>
1237
1238the string in original language.
1239
1240=item B<msgstr>
1241
1242the translation.
1243
1244=item B<reference>
1245
1246an indication of where this string was found. Example: file.c:46 (meaning
1247in 'file.c' at line 46). It can be a space-separated list in case of
1248multiple occurrences.
1249
1250=item B<comment>
1251
1252a comment added here manually (by the translators). The format here is free.
1253
1254=item B<automatic>
1255
1256a comment which was automatically added by the string extraction
1257program. See the B<--add-comments> option of the B<xgettext> program for
1258more information.
1259
1260=item B<flags>
1261
1262space-separated list of all defined flags for this entry.
1263
1264Valid flags are: B<c-text>, B<python-text>, B<lisp-text>, B<elisp-text>, B<librep-text>,
1265B<smalltalk-text>, B<java-text>, B<awk-text>, B<object-pascal-text>, B<ycp-text>,
1266B<tcl-text>, B<wrap>, B<no-wrap> and B<fuzzy>.
1267
1268See the gettext documentation for their meaning.
1269
1270=item B<type>
1271
1272this is mostly an internal argument: it is used while gettextizing
1273documents. The idea here is to parse both the original and the translation
1274into a PO object, and merge them, using one's msgid as msgid and the
1275other's msgid as msgstr. To make sure that things get ok, each msgid in PO
1276objects are given a type, based on their structure (like "chapt", "sect1",
1277"p" and so on in DocBook). If the types of strings are not the same, that
1278means that both files do not share the same structure, and the process
1279reports an error.
1280
1281This information is written as automatic comment in the PO file since this
1282gives to translators some context about the strings to translate.
1283
1284=item B<wrap>
1285
1286boolean indicating whether whitespaces can be mangled in cosmetic
1287reformattings. If true, the string is canonized before use.
1288
1289This information is written to the PO file using the B<wrap> or B<no-wrap> flag.
1290
1291=item B<wrapcol>
1292
1293the column at which we should wrap (default: 76).
1294
1295This information is not written to the PO file.
1296
1297=back
1298
1299=cut
1300
1301sub push {
1302    my $self  = shift;
1303    my %entry = @_;
1304
1305    my $validoption = "wrap wrapcol type msgid msgstr automatic previous flags reference";
1306    my %validoption;
1307
1308    map { $validoption{$_} = 1 } ( split( / /, $validoption ) );
1309    foreach ( keys %entry ) {
1310        Carp::confess "internal error:  unknown arg $_.\n" . "Here are the valid options: $validoption.\n"
1311          unless $validoption{$_};
1312    }
1313
1314    unless ( $entry{'wrap'} ) {
1315        $entry{'flags'} .= " no-wrap";
1316    }
1317    if ( defined( $entry{'msgid'} ) ) {
1318        $entry{'msgid'} = canonize( $entry{'msgid'} )
1319          if ( $entry{'wrap'} );
1320
1321        $entry{'msgid'} = escape_text( $entry{'msgid'} );
1322    }
1323    if ( defined( $entry{'msgstr'} ) ) {
1324        $entry{'msgstr'} = canonize( $entry{'msgstr'} )
1325          if ( $entry{'wrap'} );
1326
1327        $entry{'msgstr'} = escape_text( $entry{'msgstr'} );
1328    }
1329
1330    $self->push_raw(%entry);
1331}
1332
1333# The same as push(), but assuming that msgid and msgstr are already escaped
1334sub push_raw {
1335    my $self  = shift;
1336    my %entry = @_;
1337    my ( $msgid, $msgstr, $reference, $comment, $automatic, $previous, $flags, $type, $transref ) = (
1338        $entry{'msgid'},    $entry{'msgstr'}, $entry{'reference'}, $entry{'comment'}, $entry{'automatic'},
1339        $entry{'previous'}, $entry{'flags'},  $entry{'type'},      $entry{'transref'}
1340    );
1341    my $keep_conflict = $entry{'conflict'};
1342
1343    #    print STDERR "Push_raw\n";
1344    #    print STDERR " msgid=>>>$msgid<<<\n" if $msgid;
1345    #    print STDERR " msgstr=[[[$msgstr]]]\n" if $msgstr;
1346    #    Carp::cluck " flags=$flags\n" if $flags;
1347
1348    return unless defined( $entry{'msgid'} );
1349
1350    # no msgid => header definition
1351    unless ( length( $entry{'msgid'} ) ) {
1352
1353        #       if (defined($self->{header}) && $self->{header} =~ /\S/) {
1354        #           warn dgettext("po4a","Redefinition of the header. ".
1355        #                                "The old one will be discarded\n");
1356        #       } FIXME: do that iff the header isn't the default one.
1357        $self->{header}         = $msgstr;
1358        $self->{header_comment} = $comment;
1359        my $charset = $self->get_charset;
1360        if ( $charset ne "CHARSET" ) {
1361            $self->{encoder} = find_encoding($charset);
1362        } else {
1363            $self->{encoder} = find_encoding("UTF-8");
1364        }
1365        return;
1366    }
1367
1368    if ( $self->{options}{'porefs'} =~ m/^never/ ) {
1369        $reference = "";
1370    } elsif ( $self->{options}{'porefs'} =~ m/^counter/ ) {
1371        if ( $reference =~ m/^(.+?)(?=\S+:\d+)/g ) {
1372            my $new_ref = $1;
1373            1 while $reference =~ s{  # x modifier is added to add formatting and improve readability
1374              \G(\s*)(\S+):\d+        # \G is the last match in m//g (see also the (?=) syntax above)
1375                                      # $2 is the file name
1376            }{
1377                 $self->{counter}{$2} ||= 0, # each file has its own counter
1378                 ++$self->{counter}{$2},     # increment it
1379                 $new_ref .= "$1$2:".$self->{counter}{$2} # replace line number by this counter
1380            }gex && pos($reference);
1381            $reference = $new_ref;
1382        }
1383    } elsif ( $self->{options}{'porefs'} =~ m/^file/ ) {
1384        $reference =~ s/:\d+//g;
1385    }
1386
1387    if ( defined( $self->{po}{$msgid} ) ) {
1388        warn wrap_mod( "po4a::po", dgettext( "po4a", "msgid defined twice: %s" ), $msgid )
1389          if (0);    # FIXME: put a verbose stuff
1390        if (    defined $msgstr
1391            and defined $self->{po}{$msgid}{'msgstr'}
1392            and $self->{po}{$msgid}{'msgstr'} ne $msgstr )
1393        {
1394            my $txt = quote_text( $msgid, $self->{options}{'wrap-po'} );
1395            my ( $first, $second ) = (
1396                    format_comment( ". ", $self->{po}{$msgid}{'reference'} )
1397                  . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ),
1398
1399                format_comment( ". ", $reference ) . quote_text($msgstr), $self->{options}{'wrap-po'}
1400            );
1401
1402            if ($keep_conflict) {
1403                if ( $self->{po}{$msgid}{'msgstr'} =~ m/^#-#-#-#-#  .*  #-#-#-#-#\\n/s ) {
1404                    $msgstr = $self->{po}{$msgid}{'msgstr'} . "\\n#-#-#-#-#  $transref  #-#-#-#-#\\n" . $msgstr;
1405                } else {
1406                    $msgstr =
1407                        "#-#-#-#-#  "
1408                      . $self->{po}{$msgid}{'transref'}
1409                      . "  #-#-#-#-#\\n"
1410                      . $self->{po}{$msgid}{'msgstr'} . "\\n"
1411                      . "#-#-#-#-#  $transref  #-#-#-#-#\\n"
1412                      . $msgstr;
1413                }
1414
1415                # Every msgid will have the same list of references.
1416                # Only keep the last list.
1417                $self->{po}{$msgid}{'reference'} = "";
1418            } else {
1419                warn wrap_msg(
1420                    dgettext(
1421                        "po4a",
1422                        "Translations don't match for:\n" . "%s\n"
1423                          . "-->First translation:\n" . "%s\n"
1424                          . " Second translation:\n" . "%s\n"
1425                          . " Old translation discarded."
1426                    ),
1427                    $txt, $first, $second
1428                );
1429            }
1430        }
1431    }
1432    if ( defined $transref ) {
1433        $self->{po}{$msgid}{'transref'} = $transref;
1434    }
1435    if ( length($reference) ) {
1436        if ( defined $self->{po}{$msgid}{'reference'} ) {
1437
1438            # Only add the new reference if it's not already included in the existing string
1439            # It'd be much easier if $self->{po}{$msgid}{'reference'} were an array instead of a joined string...
1440            my $oldref = $self->{po}{$msgid}{'reference'};
1441            $self->{po}{$msgid}{'reference'} .= " " . $reference
1442              unless ( ( $oldref =~ m/ $reference / )
1443                || ( $oldref =~ m/ $reference$/ )
1444                || ( $oldref =~ m/^$reference$/ )
1445                || ( $oldref =~ m/^$reference / ) );
1446        } else {
1447            $self->{po}{$msgid}{'reference'} = $reference;
1448        }
1449    }
1450    $self->{po}{$msgid}{'msgstr'}    = $msgstr;
1451    $self->{po}{$msgid}{'comment'}   = $comment;
1452    $self->{po}{$msgid}{'automatic'} = $automatic;
1453    $self->{po}{$msgid}{'previous'}  = $previous;
1454    if ( defined( $self->{po}{$msgid}{'pos_doc'} ) ) {
1455        $self->{po}{$msgid}{'pos_doc'} .= " " . $self->{count_doc}++;
1456    } else {
1457        $self->{po}{$msgid}{'pos_doc'} = $self->{count_doc}++;
1458    }
1459    unless ( defined( $self->{po}{$msgid}{'pos'} ) ) {
1460        $self->{po}{$msgid}{'pos'} = $self->{count}++;
1461    }
1462    $self->{po}{$msgid}{'type'}   = $type;
1463    $self->{po}{$msgid}{'plural'} = $entry{'plural'}
1464      if defined $entry{'plural'};
1465
1466    if ( defined($flags) ) {
1467        $flags = " $flags ";
1468        $flags =~ s/,/ /g;
1469        foreach my $flag (@known_flags) {
1470            if ( index( $flags, " $flag " ) != -1 ) {    # if flag to be set
1471                unless ( defined( $self->{po}{$msgid}{'flags'} )
1472                    && $self->{po}{$msgid}{'flags'} =~ /\b$flag\b/ )
1473                {
1474                    # flag not already set
1475                    if ( defined $self->{po}{$msgid}{'flags'} ) {
1476                        $self->{po}{$msgid}{'flags'} .= " " . $flag;
1477                    } else {
1478                        $self->{po}{$msgid}{'flags'} = $flag;
1479                    }
1480                }
1481            }
1482        }
1483    }
1484
1485    #    print STDERR "stored ((($msgid)))=>(((".$self->{po}{$msgid}{'msgstr'}.")))\n\n";
1486
1487}
1488
1489=back
1490
1491=head1 Miscellaneous functions
1492
1493=over 4
1494
1495=item count_entries()
1496
1497Returns the number of entries in the catalog (without the header).
1498
1499=cut
1500
1501sub count_entries($) {
1502    my $self = shift;
1503    return $self->{count};
1504}
1505
1506=item count_entries_doc()
1507
1508Returns the number of entries in document. If a string appears multiple times
1509in the document, it will be counted multiple times.
1510
1511=cut
1512
1513sub count_entries_doc($) {
1514    my $self = shift;
1515    return $self->{count_doc};
1516}
1517
1518=item equals_msgid(po)
1519
1520Returns ($uptodate, $diagnostic) with $uptodate indicating whether all msgid of the current po file are
1521also present in the one passed as parameter (all other fields are ignored in the file comparison).
1522Informally, if $uptodate returns false, then the po files would be changed when going through B<po4a-updatepo>.
1523
1524If $uptodate is false, then $diagnostic contains a diagnostic of why this is so.
1525
1526=cut
1527
1528sub equals_msgid($$) {
1529    my ( $self, $other ) = ( shift, shift );
1530
1531    unless ( $self->count_entries() == $other->count_entries() ) {
1532        return (
1533            0,
1534            wrap_msg(
1535                dgettext( "po4a", "The amount of entries differ between files: %d is not %d" ),
1536                $self->count_entries(),
1537                $other->count_entries()
1538            )
1539        );
1540    }
1541    foreach my $msgid ( keys %{ $self->{po} } ) {
1542        unless ( defined( $self->{po}{$msgid} ) && defined( $other->{po}{$msgid} ) ) {
1543            return ( 0, wrap_msg( dgettext( "po4a", "msgid declared in one file only: %s\n" ), $msgid ) );
1544        }
1545    }
1546    return ( 1, "" );
1547}
1548
1549=item msgid($)
1550
1551Returns the msgid of the given number.
1552
1553=cut
1554
1555sub msgid($$) {
1556    my $self = shift;
1557    my $num  = shift;
1558
1559    foreach my $msgid ( keys %{ $self->{po} } ) {
1560        return $msgid if ( $self->{po}{$msgid}{'pos'} eq $num );
1561    }
1562    return undef;
1563}
1564
1565=item msgid_doc($)
1566
1567Returns the msgid with the given position in the document.
1568
1569=cut
1570
1571sub msgid_doc($$) {
1572    my $self = shift;
1573    my $num  = shift;
1574
1575    foreach my $msgid ( keys %{ $self->{po} } ) {
1576        foreach my $pos ( split / /, $self->{po}{$msgid}{'pos_doc'} ) {
1577            return $msgid if ( $pos eq $num );
1578        }
1579    }
1580    return undef;
1581}
1582
1583=item get_charset()
1584
1585Returns the character set specified in the PO header. If it hasn't been
1586set, it will return "UTF-8".
1587
1588=cut
1589
1590sub get_charset() {
1591    my $self = shift;
1592
1593    $self->{header} =~ /charset=(.*?)[\s\\]/;
1594
1595    if ( defined $1 ) {
1596        return $1;
1597    } else {
1598        return "UTF-8";
1599    }
1600}
1601
1602=item set_charset($)
1603
1604This sets the character set of the PO header to the value specified in its
1605first argument. If you never call this function (and no file with a specified
1606character set is read), the default value is left to "UTF-8". This value
1607doesn't change the behavior of this module, it's just used to fill that field
1608in the header, and to return it in get_charset().
1609
1610=cut
1611
1612sub set_charset() {
1613    my $self = shift;
1614
1615    my ( $newchar, $oldchar );
1616    $newchar = shift;
1617    $oldchar = $self->get_charset();
1618
1619    $self->{header} =~ s/$oldchar/$newchar/;
1620    $self->{encoder} = find_encoding($newchar);
1621}
1622
1623#----[ helper functions ]---------------------------------------------------
1624
1625# transforme the string from its PO file representation to the form which
1626#   should be used to print it
1627sub unescape_text {
1628    my $text = shift;
1629
1630    print STDERR "\nunescape [$text]====" if $debug{'escape'};
1631    $text = join( "", split( /\n/, $text ) );
1632    $text =~ s/\\"/"/g;
1633
1634    # unescape newlines
1635    #   NOTE on \G:
1636    #   The following regular expression introduce newlines.
1637    #   Thus, ^ doesn't match all beginnings of lines.
1638    #   \G is a zero-width assertion that matches the position
1639    #   of the previous substitution with s///g. As every
1640    #   substitution ends by a newline, it always matches a
1641    #   position just after a newline.
1642    $text =~ s/(           # $1:
1643                (\G|[^\\]) #    beginning of the line or any char
1644                           #    different from '\'
1645                (\\\\)*    #    followed by any even number of '\'
1646               )\\n        # and followed by an escaped newline
1647              /$1\n/sgx;    # single string, match globally, allow comments
1648                            # unescape carriage returns
1649    $text =~ s/(           # $1:
1650                (\G|[^\\]) #    beginning of the line or any char
1651                           #    different from '\'
1652                (\\\\)*    #    followed by any even number of '\'
1653               )\\r        # and followed by an escaped carriage return
1654              /$1\r/sgx;    # single string, match globally, allow comments
1655                            # unescape tabulations
1656    $text =~ s/(          # $1:
1657                (\G|[^\\])#    beginning of the line or any char
1658                          #    different from '\'
1659                (\\\\)*   #    followed by any even number of '\'
1660               )\\t       # and followed by an escaped tabulation
1661              /$1\t/mgx;    # multilines string, match globally, allow comments
1662                            # and unescape the escape character
1663    $text =~ s/\\\\/\\/g;
1664    print STDERR ">$text<\n" if $debug{'escape'};
1665
1666    return $text;
1667}
1668
1669# transform the string to its representation as it should be written in PO
1670# files
1671sub escape_text {
1672    my $text = shift;
1673
1674    print STDERR "\nescape [$text]====" if $debug{'escape'};
1675    $text =~ s/\\/\\\\/g;
1676    $text =~ s/"/\\"/g;
1677    $text =~ s/\n/\\n/g;
1678    $text =~ s/\r/\\r/g;
1679    $text =~ s/\t/\\t/g;
1680    print STDERR ">$text<\n" if $debug{'escape'};
1681
1682    return $text;
1683}
1684
1685# put quotes around the string on each lines (without escaping it)
1686# It does also normalize the text (ie, make sure its representation is wrapped
1687#   on the 80th char, but without changing the meaning of the string)
1688sub quote_text {
1689    my $string  = shift;
1690    my $do_wrap = shift;    # either 'no' or 'newlines', or column at which we should wrap
1691
1692    return '""' unless length($string);
1693
1694    return "\"$string\"" if ( $do_wrap eq 'no' );
1695
1696    print STDERR "\nquote $do_wrap [$string]====" if $debug{'quote'};
1697
1698    # break lines on newlines, if any
1699    # see unescape_text for an explanation on \G
1700    $string =~ s/(           # $1:
1701                  (\G|[^\\]) #    beginning of the line or any char
1702                             #    different from '\'
1703                  (\\\\)*    #    followed by any even number of '\'
1704                 \\n)        # and followed by an escaped newline
1705                /$1\n/sgx;    # single string, match globally, allow comments
1706
1707    $string = wrap( $string, $do_wrap ) if ( $do_wrap ne 'newlines' );
1708    my @string = split( /\n/, $string );
1709    $string = join( "\"\n\"", @string );
1710    $string = "\"$string\"";
1711    if ( scalar @string > 1 && $string[0] ne '' ) {
1712        $string = "\"\"\n" . $string;
1713    }
1714
1715    print STDERR ">$string<\n" if $debug{'quote'};
1716    return $string;
1717}
1718
1719# undo the work of the quote_text function
1720sub unquote_text {
1721    my $string = shift;
1722    print STDERR "\nunquote [$string]====" if $debug{'quote'};
1723    $string =~ s/^""\\n//s;
1724    $string =~ s/^"(.*)"$/$1/s;
1725    $string =~ s/"\n"//gm;
1726
1727    # Note: an even number of '\' could precede \\n, but I could not build a
1728    # document to test this
1729    $string =~ s/([^\\])\\n\n/$1!!DUMMYPOPM!!/gm;
1730    $string =~ s|!!DUMMYPOPM!!|\\n|gm;
1731    print STDERR ">$string<\n" if $debug{'quote'};
1732    return $string;
1733}
1734
1735# canonize the string: write it on only one line, changing consecutive
1736# whitespace to only one space.
1737# Warning, it changes the string and should only be called if the string is
1738# plain text
1739sub canonize {
1740    my $text = shift;
1741    print STDERR "\ncanonize [$text]====" if $debug{'canonize'};
1742    $text =~ s/^ *//s;
1743    $text =~ s/^[ \t]+/  /gm;
1744
1745    # if ($text eq "\n"), it messed up the first string (header)
1746    $text =~ s/\n/  /gm if ( $text ne "\n" );
1747    $text =~ s/([.)])  +/$1  /gm;
1748    $text =~ s/([^.)])  */$1 /gm;
1749    $text =~ s/ *$//s;
1750    print STDERR ">$text<\n" if $debug{'canonize'};
1751    return $text;
1752}
1753
1754# wraps the string. We don't use Text::Wrap since it mangles whitespace at the end of the split line
1755sub wrap {
1756    my $text = shift;
1757    return "0" if ( $text eq '0' );
1758    my $col   = shift || 76;
1759    my @lines = split( /\n/, "$text" );
1760    my $res   = "";
1761    my $first = 1;
1762    while ( defined( my $line = shift @lines ) ) {
1763        if ( $first && length($line) > $col - 10 ) {
1764            unshift @lines, $line;
1765            $first = 0;
1766            next;
1767        }
1768        if ( length($line) > $col ) {
1769            my $pos = rindex( $line, " ", $col );
1770            while ( substr( $line, $pos - 1, 1 ) eq '.' && $pos != -1 ) {
1771                $pos = rindex( $line, " ", $pos - 1 );
1772            }
1773            if ( $pos == -1 ) {
1774
1775                # There are no spaces in the first $col chars, pick-up the
1776                # first space
1777                $pos = index( $line, " " );
1778            }
1779            if ( $pos != -1 ) {
1780                my $end = substr( $line, $pos + 1 );
1781                $line = substr( $line, 0, $pos + 1 );
1782                if ( $end =~ s/^( +)// ) {
1783                    $line .= $1;
1784                }
1785                unshift @lines, $end;
1786            }
1787        }
1788        $first = 0;
1789        $res .= "$line\n";
1790    }
1791
1792    # Restore the original trailing spaces
1793    $res =~ s/\s+$//s;
1794    if ( $text =~ m/(\s+)$/s ) {
1795        $res .= $1;
1796    }
1797    return $res;
1798}
1799
1800# outputs properly a '# ... ' line to be put in the PO file
1801sub format_comment {
1802    my $comment = shift;
1803    my $char    = shift;
1804    my $result  = "#" . $char . $comment;
1805    $result =~ s/\n/\n#$char/gs;
1806    $result =~ s/^#$char$/#/gm;
1807    $result .= "\n";
1808    return $result;
1809}
1810
18111;
1812__END__
1813
1814=back
1815
1816=head1 AUTHORS
1817
1818 Denis Barbier <barbier@linuxfr.org>
1819 Martin Quinson (mquinson#debian.org)
1820
1821=cut
1822