1# This file is part of Pyphen
2#
3# Copyright 2008 - Wilbert Berendsen <info@wilbertberendsen.nl>
4# Copyright 2012-2013 - Guillaume Ayoub <guillaume.ayoub@kozea.fr>
5#
6# This library is free software.  It is released under the
7# GPL 2.0+/LGPL 2.1+/MPL 1.1 tri-license.  See COPYING.GPL, COPYING.LGPL and
8# COPYING.MPL for more details.
9#
10# This library is distributed in the hope that it will be useful, but WITHOUT
11# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
13# details.
14
15"""
16
17Pyphen
18======
19
20Pure Python module to hyphenate text, inspired by Ruby's Text::Hyphen.
21
22"""
23
24import os
25import re
26
__all__ = ('Pyphen', 'LANGUAGES', 'language_fallback')

# HyphDict objects already built, keyed by dictionary file path
hdcache = {}

# regular expression helpers, compiled once when the module is imported:
# - parse_hex substitutes "^^hh" sequences (two lowercase hex digits)
# - parse splits a string into (optional digit, optional non-digit) pairs
parse_hex = re.compile(r'\^{2}([0-9a-f]{2})').sub
parse = re.compile(r'(\d?)(\D?)').findall

# locate the folder holding the dictionaries, falling back to the folder next
# to this file when pkg_resources (or the lookup it does) fails to import
try:
    from pkg_resources import resource_filename
    dictionaries_root = resource_filename('pyphen', 'dictionaries')
except ImportError:
    dictionaries_root = os.path.join(os.path.dirname(__file__), 'dictionaries')

# map each language name to the path of its dictionary; the name is the
# filename without its first 5 characters and its ".dic" extension
LANGUAGES = {}
for filename in sorted(os.listdir(dictionaries_root)):
    if not filename.endswith('.dic'):
        continue
    name = filename[5:-4]
    full_path = os.path.join(dictionaries_root, filename)
    short_name = name.split('_')[0]
    LANGUAGES[name] = full_path
    # the first dictionary found for a short name (the part before the first
    # underscore) is the one kept for that short name
    LANGUAGES.setdefault(short_name, full_path)
51
52
def language_fallback(language):
    """Find the closest language available in ``LANGUAGES``.

    http://www.unicode.org/reports/tr35/#Locale_Inheritance

    Plain truncation inheritance is used: hyphens are treated as underscores,
    then the trailing subtags are dropped one at a time until the remaining
    name is a key of ``LANGUAGES``. This function needs aliases including
    scripts for languages with multiple regions available.

    :param language: language code, such as ``'en'``, ``'en_US'`` or
        ``'en-US'``
    :returns: the matching key of ``LANGUAGES``, or ``None`` if even the
        first subtag alone is unknown

    """
    subtags = language.replace('-', '_').split('_')
    # try the longest prefix first, then shorter and shorter ones
    for length in range(len(subtags), 0, -1):
        candidate = '_'.join(subtags[:length])
        if candidate in LANGUAGES:
            return candidate
    return None
68
69
class AlternativeParser(object):
    """Parser of the nonstandard hyphenation alternative of a pattern.

    An instance is called once per value of the pattern, in order. Each call
    moves the stored index one step back; odd values are returned as a special
    int holding ``(change, index, cut)`` for the current position, even values
    are returned as plain ints.

    """
    def __init__(self, pattern, alternative):
        """Split the ``change,index,cut`` alternative of a pattern.

        :param pattern: pattern string, without its alternative part
        :param alternative: comma-separated string giving the change, the
            index and the cut

        """
        fields = alternative.split(',')
        self.change, self.index, self.cut = (
            fields[0], int(fields[1]), int(fields[2]))
        # a leading dot in the pattern shifts the index by one
        if pattern[:1] == '.':
            self.index += 1

    def __call__(self, value):
        """Convert a pattern value, attaching the alternative to odd ones.

        :param value: value to convert with ``int``

        """
        self.index -= 1
        number = int(value)
        if not number & 1:
            return number
        return DataInt(number, (self.change, self.index, self.cut))
92
93
class DataInt(int):
    """``int`` that carries extra information in a ``data`` attribute."""
    def __new__(cls, value, data=None, reference=None):
        """Create a new ``DataInt``.

        :param value: value given to ``int`` to build the number
        :param data: object stored in the ``data`` attribute
        :param reference: if it is a ``DataInt``, its ``data`` is used and the
            ``data`` parameter is ignored; any other object is ignored

        Call with ``reference=dataint_object`` to use the data from another
        ``DataInt``.

        """
        obj = int.__new__(cls, value)
        # Only the type of the reference is tested: a ``DataInt`` equal to 0
        # is falsy, but its data must be copied like any other.
        if isinstance(reference, DataInt):
            obj.data = reference.data
        else:
            obj.data = data
        return obj
109
110
class HyphDict(object):
    """Hyphenation patterns read from a ``hyph_*.dic`` file.

    patterns
      dict mapping the letters of each pattern to a ``(start, values)``
      tuple: ``values`` are the numbers of the pattern without the leading
      and trailing zeros, ``start`` is the number of leading zeros removed.

    cache
      dict mapping lowercased words to their already computed positions.

    maxlen
      length of the longest key of ``patterns``, or ``0`` when the file
      holds no usable pattern.

    """

    def __init__(self, filename):
        """Read a ``hyph_*.dic`` and parse its patterns.

        :param filename: filename of hyph_*.dic to read

        The first line of the file is the name of the charset used to decode
        all the other lines. Empty lines, comments and the ``*HYPHENMIN``
        settings are ignored, and so are the patterns with no nonzero value.

        """
        self.patterns = {}

        with open(filename, 'rb') as stream:
            # see "man 4 hunspell", iscii-devanagari is not supported by python
            charset = stream.readline().strip().decode('ascii')
            if charset.lower() == 'microsoft-cp1251':
                charset = 'cp1251'
            for pattern in stream:
                pattern = pattern.decode(charset).strip()
                if not pattern or pattern.startswith((
                        '%', '#', 'LEFTHYPHENMIN', 'RIGHTHYPHENMIN',
                        'COMPOUNDLEFTHYPHENMIN', 'COMPOUNDRIGHTHYPHENMIN')):
                    continue

                # replace ^^hh with the real character
                pattern = parse_hex(
                    lambda match: chr(int(match.group(1), 16)), pattern)

                # read nonstandard hyphen alternatives
                if '/' in pattern:
                    pattern, alternative = pattern.split('/', 1)
                    factory = AlternativeParser(pattern, alternative)
                else:
                    factory = int

                # split the pattern into its letters (tags) and the number
                # found before each of them (values), 0 when there is none
                tags, values = zip(*[
                    (string, factory(i or '0'))
                    for i, string in parse(pattern)])

                # if only zeros, skip this pattern
                if max(values) == 0:
                    continue

                # chop zeros from beginning and end, and store start offset
                start, end = 0, len(values)
                while not values[start]:
                    start += 1
                while not values[end - 1]:
                    end -= 1

                self.patterns[''.join(tags)] = start, values[start:end]

        self.cache = {}
        # A file without any usable pattern is accepted: max() would raise
        # ValueError on an empty sequence, and with a maximum length of 0
        # ``positions`` looks no substring up and finds no point.
        self.maxlen = max(map(len, self.patterns)) if self.patterns else 0

    def positions(self, word):
        """Get a list of positions where the word can be hyphenated.

        :param word: unicode string of the word to hyphenate

        E.g. for the dutch word 'lettergrepen' this method returns ``[3, 6,
        9]``.

        Each position is a ``DataInt`` with a data attribute.

        If the data attribute is not ``None``, it contains a tuple with
        information about nonstandard hyphenation at that point: ``(change,
        index, cut)``.

        change
          a string like ``'ff=f'``, that describes how hyphenation should
          take place.

        index
          where to substitute the change, counting from the current point

        cut
          how many characters to remove while substituting the nonstandard
          hyphenation

        The word is lowercased before the lookup. The list is stored in the
        cache of this object and the same list object is returned by later
        calls with the same word, so it should not be modified in place.

        """
        word = word.lower()
        points = self.cache.get(word)
        if points is None:
            # the dots mark the beginning and the end of the word, as in the
            # patterns
            pointed_word = '.%s.' % word
            references = [0] * (len(pointed_word) + 1)

            # look every substring of at most maxlen characters up in the
            # patterns, keeping the highest value found at each place
            for i in range(len(pointed_word) - 1):
                for j in range(
                        i + 1, min(i + self.maxlen, len(pointed_word)) + 1):
                    pattern = self.patterns.get(pointed_word[i:j])
                    if pattern:
                        offset, values = pattern
                        slice_ = slice(i + offset, i + offset + len(values))
                        references[slice_] = map(
                            max, values, references[slice_])

            # odd values are hyphenation points; 1 is subtracted to ignore
            # the leading dot added to the word
            points = [
                DataInt(i - 1, reference=reference)
                for i, reference in enumerate(references) if reference % 2]
            self.cache[word] = points
        return points
212
213
class Pyphen(object):
    """Hyphenator offering several ways to split or mark words."""

    def __init__(self, filename=None, lang=None, left=2, right=2, cache=True):
        """Create a hyphenation instance for the given lang or filename.

        :param filename: filename of hyph_*.dic to read
        :param lang: lang of the included dict to use if no filename is given
        :param left: minimum number of characters of the first syllable
        :param right: minimum number of characters of the last syllable
        :param cache: if ``True``, use cached copy of the hyphenation patterns

        """
        if not filename:
            filename = LANGUAGES[language_fallback(lang)]
        self.left = left
        self.right = right
        if cache and filename in hdcache:
            patterns = hdcache[filename]
        else:
            # parse the file (again), and share the result with later users
            patterns = hdcache[filename] = HyphDict(filename)
        self.hd = patterns

    def positions(self, word):
        """Get a list of positions where the word can be hyphenated.

        :param word: unicode string of the word to hyphenate

        The positions come from the ``positions`` method of the dictionary.
        Those leaving less than ``left`` characters before the hyphen or less
        than ``right`` characters after it are removed.

        """
        last = len(word) - self.right
        return [
            point for point in self.hd.positions(word)
            if self.left <= point <= last]

    def iterate(self, word):
        """Iterate over all hyphenation possibilities, the longest first.

        :param word: unicode string of the word to hyphenate

        Each possibility is a ``(first part, last part)`` tuple, without
        hyphen.

        """
        for point in reversed(self.positions(word)):
            if not point.data:
                yield word[:point], word[point:]
                continue

            # nonstandard hyphenation: letters change around the hyphen
            change, index, cut = point.data
            if word.isupper():
                change = change.upper()
            before, after = change.split('=')
            start = index + point
            yield word[:start] + before, after + word[start + cut:]

    def wrap(self, word, width, hyphen='-'):
        """Get the longest possible first part and the last part of a word.

        :param word: unicode string of the word to hyphenate
        :param width: maximum length of the first part
        :param hyphen: unicode string used as hyphen character

        The first part has the hyphen already attached.

        Returns ``None`` if there is no hyphenation point before ``width``, or
        if the word could not be hyphenated.

        """
        # room left for the letters once the hyphen is attached
        available = width - len(hyphen)
        for head, tail in self.iterate(word):
            if len(head) <= available:
                return head + hyphen, tail
        return None

    def inserted(self, word, hyphen='-'):
        """Get the word as a string with all the possible hyphens inserted.

        :param word: unicode string of the word to hyphenate
        :param hyphen: unicode string used as hyphen character

        E.g. for the dutch word ``'lettergrepen'``, this method returns the
        unicode string ``'let-ter-gre-pen'``. The hyphen string to use can be
        given as the second parameter, that defaults to ``'-'``.

        """
        letters = list(word)
        # go from the end, so that the positions not handled yet stay valid
        for point in reversed(self.positions(word)):
            if point.data:
                # nonstandard hyphenation: replace ``cut`` letters by the
                # change, whose ``=`` marks the place of the hyphen
                change, index, cut = point.data
                if word.isupper():
                    change = change.upper()
                start = index + point
                letters[start:start + cut] = change.replace('=', hyphen)
            else:
                letters[point:point] = [hyphen]

        return ''.join(letters)

    __call__ = iterate
309