1#
2# Copyright 2002-2011 Zuza Software Foundation
3#
4# This file is part of the Translate Toolkit.
5#
6# This program is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation; either version 2 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, see <http://www.gnu.org/licenses/>.
18
19"""Classes for the support of Gettext .po and .pot files.
20
21This implementation assumes that cpo is working. This should not be used
22directly, but can be used once cpo has been established to work.
23"""
24
25# TODO:
26# - handle headerless PO files better
27# - previous msgid and msgctxt
28# - accept only unicodes everywhere
29
30import copy
31import logging
32import re
33
34from translate.misc.multistring import multistring
35from translate.storage import base, cpo, pocommon
36
37
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# NOTE(review): lsep is not referenced anywhere else in the visible part of
# this file -- presumably kept for parity with sibling PO implementations.
lsep = " "
"""Separator for #: entries"""

# Minimal PO header entry (empty msgid/msgstr plus two header fields) as text.
# The raw-string prefix keeps each "\n" as a literal backslash followed by "n",
# i.e. the escaped form in which it is written inside a .po file.
# NOTE(review): not referenced elsewhere in the visible part of this file.
basic_header = r"""msgid ""
msgstr ""
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"""
49
50
class pounit(pocommon.pounit):
    """A PO unit that keeps its data in plain Python attributes.

    Conversion to text is delegated to ``cpo`` (see :meth:`__str__`).
    """

    # othercomments = []      #   # this is another comment
    # automaticcomments = []  #   #. comment extracted from the source code
    # sourcecomments = []     #   #: sourcefile.xxx:35
    # prev_msgctxt = []       #   #| The previous values that msgctxt and msgid held
    # prev_msgid = []         #
    # prev_msgid_plural = []  #
    # typecomments = []       #   #, fuzzy
    # msgidcomment = ""      #   _: within msgid
    # msgctxt
    # msgid = []
    # msgstr = []

    # Our homegrown way to indicate what must be copied in a shallow
    # fashion
    __shallow__ = ["_store"]

    def __init__(self, source=None, **kwargs):
        """Create a unit with empty comments, context and target.

        :param source: The msgid, passed on to the parent constructor.
        :param kwargs: Accepted for call compatibility; not used here.
        """
        super().__init__(source)
        self._initallcomments(blankall=True)
        self._msgctxt = ""

        # The target setter consults hasplural(), which reads self.source.
        self.target = ""

    def _initallcomments(self, blankall=False):
        """Reset every comment attribute, but only when ``blankall`` is true."""
        if blankall:
            self.othercomments = []
            self.automaticcomments = []
            self.sourcecomments = []
            self.typecomments = []
            self.msgidcomment = ""

    @property
    def source(self):
        """The msgid as stored by the setter."""
        return self._source

    @source.setter
    def source(self, source):
        # Any cached rich representation is stale once the source changes.
        self._rich_source = None
        source = source or ""
        if isinstance(source, (multistring, str)):
            self._source = source
        else:  # e.g. a list or dict of plural forms
            self._source = multistring(source)

    @property
    def target(self):
        """Returns the unescaped msgstr"""
        return self._target

    @target.setter
    def target(self, target):
        """Sets the msgstr to the given (unescaped) value

        :raises ValueError: if the source has no plural but ``target`` is a
            list or dict whose length is not exactly one.
        """
        self._rich_target = None
        if self.hasplural():
            if isinstance(target, multistring):
                self._target = target
            else:  # e.g. a plain string, list or dict
                self._target = multistring(target)
        elif isinstance(target, (dict, list)):
            if len(target) == 1:
                # NOTE(review): indexing with 0 only works for a dict that
                # has the key 0 -- confirm what dict callers pass here.
                self._target = target[0]
            else:
                raise ValueError(
                    "po msgid element has no plural but msgstr "
                    "has %d elements (%s)" % (len(target), target)
                )
        else:
            self._target = target

    def getnotes(self, origin=None):
        """Return comments based on origin value.

        :param origin: ``None`` for translator comments followed by automatic
            comments, ``"translator"`` for ``othercomments`` only, or one of
            ``"programmer"``, ``"developer"``, ``"source code"`` for
            ``automaticcomments`` only.
        :return: The selected comment lines joined with newlines.
        :raises ValueError: for any other ``origin``.
        """
        if origin is None:
            # Join both lists in one go so that the last translator comment
            # and the first automatic comment stay on separate lines.
            return "\n".join([*self.othercomments, *self.automaticcomments])
        if origin == "translator":
            return "\n".join(self.othercomments)
        if origin in ["programmer", "developer", "source code"]:
            return "\n".join(self.automaticcomments)
        raise ValueError("Comment type not valid")

    def addnote(self, text, origin=None, position="append"):
        """This is modeled on the XLIFF method. See xliff.py::xliffunit.addnote

        :param text: Note text; one trailing newline is dropped and the rest
            is split into lines. Empty or whitespace-only text is ignored.
        :param origin: ``"programmer"``, ``"developer"`` or ``"source code"``
            stores the note in ``automaticcomments``; anything else stores it
            in ``othercomments``.
        :param position: ``"append"`` or ``"prepend"`` relative to the
            existing notes; any other value replaces the existing notes.
        """
        # ignore empty strings and strings without non-space characters
        if not (text and text.strip()):
            return
        autocomments = origin in ["programmer", "developer", "source code"]
        commentlist = self.automaticcomments if autocomments else self.othercomments
        if text.endswith("\n"):
            text = text[:-1]
        newcomments = text.split("\n")
        if position == "append":
            newcomments = commentlist + newcomments
        elif position == "prepend":
            newcomments = newcomments + commentlist

        if autocomments:
            self.automaticcomments = newcomments
        else:
            self.othercomments = newcomments

    def removenotes(self, origin=None):
        """Remove all the translator's notes (other comments)

        ``origin`` is accepted but ignored: only ``othercomments`` is cleared.
        """
        self.othercomments = []

    def __deepcopy__(self, memo=None):
        """Deep-copy every attribute except those named in ``__shallow__``.

        :param memo: The memo dictionary supplied by :func:`copy.deepcopy`;
            a fresh one is created when the method is called directly.
        :return: A new unit of the same class.
        """
        if memo is None:
            memo = {}
        # Make an instance to serve as the copy
        new_unit = self.__class__()
        # Register the copy before descending, so that anything referring
        # back to this unit resolves to the copy instead of recursing.
        memo[id(self)] = new_unit
        # We'll be testing membership frequently, so make a set from
        # self.__shallow__
        shallow = set(self.__shallow__)
        # Make deep copies of all members which are not in shallow
        for key, value in self.__dict__.items():
            if key not in shallow:
                setattr(new_unit, key, copy.deepcopy(value, memo))
        # Share (do not copy) all members which are in shallow
        for key in shallow:
            setattr(new_unit, key, getattr(self, key))
        return new_unit

    def copy(self):
        """Return a copy of this unit made with :func:`copy.deepcopy`."""
        return copy.deepcopy(self)

    def _msgidlen(self):
        """Return the source length; plural forms are added together."""
        if self.hasplural():
            return len("".join(self.source.strings))
        return len(self.source)

    def _msgstrlen(self):
        """Return the target length; plural forms are added together."""
        if self.hasplural():
            return len("".join(self.target.strings))
        return len(self.target)

    def merge(self, otherpo, overwrite=False, comments=True, authoritative=False):
        """Merges the otherpo (with the same msgid) into this one.

        Overwrite non-blank self.msgstr only if overwrite is True
        merge comments only if comments is True

        :param otherpo: The unit to merge from. If it is not a ``pounit`` of
            this module the call is handed to the parent class (without
            ``authoritative``).
        :param overwrite: Take ``otherpo.target`` even if this unit is
            already translated.
        :param comments: Merge translator and type comments.
        :param authoritative: When true, automatic and source comments of
            ``otherpo`` are not merged in.

        Note that ``otherpo.target`` is modified when it carries a KDE style
        comment that has to be removed before it is taken over.
        """

        def mergelists(list1, list2, split=False):
            """Extend ``list1`` in place with the entries of ``list2`` it lacks."""
            # Determine the newline style of list2 from its first entry.
            # "\r\n" is tried first so that it is not mistaken for "\n".
            lineend = ""
            if list2 and list2[0]:
                for candidate in ("\r\n", "\n", "\r"):
                    if list2[0].endswith(candidate):
                        lineend = candidate
                        break

            # Split if directed to do so:
            if split:
                splitlist1 = []
                splitlist2 = []
                for item in list1:
                    splitlist1.extend(item.split())
                for item in list2:
                    splitlist2.extend(item.split())
                list1.extend([item for item in splitlist2 if item not in splitlist1])
            elif list1 != list2:
                # Normal merge, but conform to list1 newline style
                for item in list2:
                    item = item.rstrip(lineend)
                    # avoid duplicate comment lines (this might cause some problems)
                    if item not in list1 or len(item) < 5:
                        list1.append(item)

        if not isinstance(otherpo, pounit):
            super().merge(otherpo, overwrite, comments)
            return
        if comments:
            mergelists(self.othercomments, otherpo.othercomments)
            mergelists(self.typecomments, otherpo.typecomments)
            if not authoritative:
                # When we are the authority we keep our own automaticcomments
                # and sourcecomments; otherwise otherpo's are brought across.
                mergelists(self.automaticcomments, otherpo.automaticcomments)
                #                mergelists(self.msgidcomments, otherpo.msgidcomments) #XXX?
                mergelists(self.sourcecomments, otherpo.sourcecomments, split=True)
        if not self.istranslated() or overwrite:
            # Remove kde-style comments from the translation (if any). XXX - remove
            if pocommon.extract_msgid_comment(otherpo.target):
                otherpo.target = otherpo.target.replace(
                    "_: " + otherpo._extract_msgidcomments() + "\n", ""
                )
            self.target = otherpo.target
            if (
                self.source != otherpo.source
                or self.getcontext() != otherpo.getcontext()
            ):
                self.markfuzzy()
            else:
                self.markfuzzy(otherpo.isfuzzy())
        elif not otherpo.istranslated():
            if self.source != otherpo.source:
                self.markfuzzy()
        elif self.target != otherpo.target:
            self.markfuzzy()

    def isheader(self):
        """Return whether the unit has an empty id and a non-empty target."""
        # TODO: fix up nicely
        return not self.getid() and len(self.target) > 0

    def isblank(self):
        """Return whether source, target and msgctxt are all empty.

        Headers and units carrying a msgidcomment are never blank.
        """
        if self.isheader() or self.msgidcomment:
            return False
        return (
            self._msgidlen() == 0
            and self._msgstrlen() == 0
            and len(self._msgctxt) == 0
        )

    def hastypecomment(self, typecomment):
        """Check whether the given type comment is present

        :param typecomment: The flag to look for, e.g. ``"fuzzy"``. It is
            matched literally as a whole word.
        """
        # Word boundaries keep "fuzzy" from matching inside a longer word;
        # escaping keeps flags such as "c++-format" from being read as a
        # regular expression.
        pattern = re.compile(r"\b%s\b" % re.escape(typecomment))
        return any(pattern.search(tcline) for tcline in self.typecomments)

    def hasmarkedcomment(self, commentmarker):
        """Check whether the given comment marker is present as # (commentmarker) ..."""
        commentmarker = "(%s)" % commentmarker
        return any(
            comment.startswith(commentmarker) for comment in self.othercomments
        )

    def settypecomment(self, typecomment, present=True):
        """Alters whether a given typecomment is present

        :param typecomment: The flag to add or remove, e.g. ``"fuzzy"``.
        :param present: ``True`` to make sure the flag is there, ``False`` to
            make sure it is not.
        """
        if self.hastypecomment(typecomment) == present:
            return
        if present:
            self.typecomments.append("#, %s\n" % typecomment)
            return
        # Remove the flag together with the separators that follow it ...
        pattern = re.compile(r"\b%s\b[ \t,]*" % re.escape(typecomment))
        stripped = (pattern.sub("", tcline) for tcline in self.typecomments)
        # ... and drop lines that held nothing else. This must stay a real
        # list: other methods append to and re-iterate self.typecomments.
        self.typecomments = [
            tcline for tcline in stripped if tcline.strip() != "#,"
        ]

    def istranslated(self):
        """Return whether the parent class sees a translation and the unit is not obsolete."""
        return super().istranslated() and not self.isobsolete()

    def istranslatable(self):
        """Return whether the unit is neither a header, blank nor obsolete."""
        return not (self.isheader() or self.isblank() or self.isobsolete())

    def isfuzzy(self):
        """Return whether the ``fuzzy`` type comment is present."""
        return self.hastypecomment("fuzzy")

    def _domarkfuzzy(self, present=True):
        """Add or remove the ``fuzzy`` type comment."""
        self.settypecomment("fuzzy", present)

    def makeobsolete(self):
        """Makes this unit obsolete"""
        # Locations and automatic comments are dropped before the parent
        # class does the rest.
        self.sourcecomments = []
        self.automaticcomments = []
        super().makeobsolete()

    def hasplural(self):
        """returns whether this pounit contains plural strings..."""
        source = self.source
        return isinstance(source, multistring) and len(source.strings) > 1

    def __str__(self):
        """convert to a string. double check that unicode is handled somehow here"""
        _cpo_unit = cpo.pounit.buildfromunit(self)
        return str(_cpo_unit)

    def getlocations(self):
        """Get a list of locations from sourcecomments in the PO unit.

        The list returned is the unit's own ``sourcecomments`` list, not a
        copy.

        rtype: List
        return: A list of the locations with '#: ' stripped

        """
        # TODO: rename to .locations
        return self.sourcecomments

    def addlocation(self, location):
        """Add a location to sourcecomments in the PO unit.

        :param location: Text location e.g. 'file.c:23' does not include #:
        :type location: String
        """
        self.sourcecomments.append(location)

    def _extract_msgidcomments(self, text=None):
        """Extract KDE style msgid comments from the unit.

        :param text: If given and non-empty, the comment is extracted from
            this text instead of being read from the unit.
        :rtype: String
        :return: Returns the extracted msgidcomments found in this unit's msgid.
        """
        if text:
            return pocommon.extract_msgid_comment(text)
        return self.msgidcomment

    def getcontext(self):
        """Get the message context."""
        return self._msgctxt + self.msgidcomment

    def setcontext(self, context):
        """Set the msgctxt; a false value is stored as the empty string."""
        self._msgctxt = context or ""

    def getid(self):
        """Returns a unique identifier for this unit."""
        context = self.getcontext()
        # Gettext does not consider the plural to determine duplicates, only
        # the msgid. For generation of .mo files, we might want to use this
        # code to generate the entry for the hash table, but for now, it is
        # commented out for conformance to gettext.
        #        id = '\0'.join(self.source.strings)
        unit_id = self.source
        if self.msgidcomment:
            unit_id = f"_: {context}\n{unit_id}"
        elif context:
            unit_id = f"{context}\04{unit_id}"
        return unit_id

    @classmethod
    def buildfromunit(cls, unit):
        """Build a native unit from a foreign unit, preserving as much
        information as possible.

        A unit of exactly this class is copied. Any other
        ``pocommon.pounit`` is converted field by field; of the format flags
        ``python-format``, ``c-format`` and ``php-format`` only the first one
        found is carried over. Everything else is handed to
        ``base.TranslationUnit.buildfromunit``.
        """
        if type(unit) is cls and hasattr(unit, "copy") and callable(unit.copy):
            return unit.copy()
        if isinstance(unit, pocommon.pounit):
            newunit = cls(unit.source)
            newunit.target = unit.target
            # context
            newunit.msgidcomment = unit._extract_msgidcomments()
            if not newunit.msgidcomment:
                newunit.setcontext(unit.getcontext())

            locations = unit.getlocations()
            if locations:
                newunit.addlocations(locations)
            notes = unit.getnotes("developer")
            if notes:
                newunit.addnote(notes, "developer")
            notes = unit.getnotes("translator")
            if notes:
                newunit.addnote(notes, "translator")
            newunit.markfuzzy(unit.isfuzzy())
            if unit.isobsolete():
                newunit.makeobsolete()
            for tc in ["python-format", "c-format", "php-format"]:
                if unit.hastypecomment(tc):
                    newunit.settypecomment(tc)
                    break
            return newunit
        # NOTE(review): this is called on the base class rather than through
        # cls, so the result is presumably not an instance of cls -- confirm
        # that callers expect this.
        return base.TranslationUnit.buildfromunit(unit)
430
431
class pofile(pocommon.pofile):
    """A .po file containing various units"""

    UnitClass = pounit

    def _build_self_from_cpo(self):
        """Builds up this store from the internal cpo store.

        A user must ensure that self._cpo_store already exists, and that it is
        deleted afterwards.
        """
        for unit in self._cpo_store.units:
            self.addunit(self.UnitClass.buildfromunit(unit))
        self.encoding = self._cpo_store.encoding

    def _build_cpo_from_self(self):
        """Builds the internal cpo store from the data in self.

        Blank units are left out.

        A user must ensure that self._cpo_store does not exist, and should
        delete it after using it.
        """
        self._cpo_store = cpo.pofile(noheader=True)
        for unit in self.units:
            if not unit.isblank():
                self._cpo_store.addunit(
                    cpo.pofile.UnitClass.buildfromunit(unit, self.encoding)
                )
        if not self._cpo_store.header():
            # only add a temporary header
            self._cpo_store.makeheader(charset=self.encoding, encoding="8bit")

    def parse(self, input):
        """Parses the given file or file source string.

        Existing units are discarded. ``self.filename`` is taken from
        ``input.name`` when there is one.

        :raises base.ParseError: wrapping any exception raised while parsing.
        """
        try:
            if hasattr(input, "name"):
                self.filename = input.name
            elif not getattr(self, "filename", ""):
                self.filename = ""
            self.units = []
            self._cpo_store = cpo.pofile(input, noheader=True)
            try:
                self._build_self_from_cpo()
            finally:
                # Do not leave the temporary store behind, even on failure.
                del self._cpo_store
        except Exception as e:
            raise base.ParseError(e) from e

    def removeduplicates(self, duplicatestyle="merge"):
        """Make sure each msgid is unique ; merge comments etc from duplicates into original

        :param duplicatestyle: ``"merge"`` merges a duplicate into the first
            unit with the same id; ``"msgctxt"`` keeps duplicates apart by
            appending their locations to their msgctxt. With any other value
            later duplicates are dropped.
        """
        # TODO: can we handle consecutive calls to removeduplicates()? What
        # about files already containing msgctxt? - test
        id_dict = {}
        uniqueunits = []
        # TODO: this is using a list as the pos aren't hashable, but this is slow.
        # probably not used frequently enough to worry about it, though.
        markedpos = []

        def addcomment(thepo):
            """Use the unit's locations as its msgidcomment and record it."""
            thepo.msgidcomment = " ".join(thepo.getlocations())
            markedpos.append(thepo)

        for thepo in self.units:
            unit_id = thepo.getid()
            if thepo.isheader() and not thepo.getlocations():
                # header msgids shouldn't be merged...
                uniqueunits.append(thepo)
            elif unit_id in id_dict:
                if duplicatestyle == "merge":
                    if unit_id:
                        id_dict[unit_id].merge(thepo)
                    else:
                        addcomment(thepo)
                        uniqueunits.append(thepo)
                elif duplicatestyle == "msgctxt":
                    origpo = id_dict[unit_id]
                    if origpo not in markedpos and unit_id:
                        # if it doesn't have an id, we already added msgctxt
                        origpo._msgctxt += " ".join(origpo.getlocations())
                        # Record the original, which is the unit the check
                        # above looks for, so that a further duplicate does
                        # not extend its context a second time.
                        markedpos.append(origpo)
                    thepo._msgctxt += " ".join(thepo.getlocations())
                    if thepo._msgctxt != origpo._msgctxt:
                        uniqueunits.append(thepo)
                    else:
                        logger.warning(
                            "Duplicate unit found with msgctx of '%s' and source '%s'",
                            thepo._msgctxt,
                            thepo.source,
                        )
            else:
                if not unit_id:
                    if duplicatestyle == "merge":
                        addcomment(thepo)
                    else:
                        thepo._msgctxt += " ".join(thepo.getlocations())
                id_dict[unit_id] = thepo
                uniqueunits.append(thepo)
        self.units = uniqueunits

    def serialize(self, out):
        """Write content to file

        If the units cannot be encoded in ``self.encoding`` the store is
        switched to UTF-8, the header is updated to match, and the
        conversion is retried.

        :param out: Passed on to the ``serialize`` method of the cpo store.
        """
        # NOTE(review): _build_cpo_from_self() rebinds self._cpo_store right
        # away, so this instance appears to be discarded -- confirm before
        # removing it.
        self._cpo_store = cpo.pofile(encoding=self.encoding, noheader=True)
        try:
            try:
                self._build_cpo_from_self()
            except UnicodeEncodeError:
                self.encoding = "utf-8"
                self.updateheader(add=True, Content_Type="text/plain; charset=UTF-8")
                self._build_cpo_from_self()
            self._cpo_store.serialize(out)
        finally:
            # Do not leave the temporary store behind, even on failure.
            del self._cpo_store
539