# This file is part of Pyphen
#
# Copyright 2008 - Wilbert Berendsen <info@wilbertberendsen.nl>
# Copyright 2012-2013 - Guillaume Ayoub <guillaume.ayoub@kozea.fr>
#
# This library is free software.  It is released under the
# GPL 2.0+/LGPL 2.1+/MPL 1.1 tri-license.  See COPYING.GPL, COPYING.LGPL and
# COPYING.MPL for more details.
#
# This library is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.

"""

Pyphen
======

Pure Python module to hyphenate text, inspired by Ruby's Text::Hyphen.

"""

import os
import re

__all__ = ('Pyphen', 'LANGUAGES', 'language_fallback')

# cache of per-file HyphDict objects
hdcache = {}

# precompile some stuff
# ``parse_hex(repl, string)`` substitutes ``^^hh`` escapes (two lowercase hex
# digits); ``parse(pattern)`` splits a pattern into (digit, character) pairs.
parse_hex = re.compile(r'\^{2}([0-9a-f]{2})').sub
parse = re.compile(r'(\d?)(\D?)').findall

try:
    from pkg_resources import resource_filename
    dictionaries_root = resource_filename('pyphen', 'dictionaries')
except ImportError:
    dictionaries_root = os.path.join(os.path.dirname(__file__), 'dictionaries')

# Map of language codes (``'nl_NL'``) and short codes (``'nl'``) to the full
# path of the matching ``hyph_*.dic`` file.
LANGUAGES = {}
# A missing dictionaries folder must not make the import fail: ``Pyphen`` can
# still be used with an explicit ``filename``.
if os.path.isdir(dictionaries_root):
    for filename in sorted(os.listdir(dictionaries_root)):
        if filename.endswith('.dic'):
            # strip the 5-character prefix ('hyph_') and the '.dic' suffix
            name = filename[5:-4]
            full_path = os.path.join(dictionaries_root, filename)
            LANGUAGES[name] = full_path
            # the first regional variant (in sorted order) becomes the
            # dictionary used for the bare language code
            short_name = name.split('_')[0]
            if short_name not in LANGUAGES:
                LANGUAGES[short_name] = full_path


def language_fallback(language):
    """Get a fallback language available in our dictionaries.

    http://www.unicode.org/reports/tr35/#Locale_Inheritance

    We use the normal truncation inheritance. This function needs aliases
    including scripts for languages with multiple regions available.

    :param language: language code, with parts separated by ``-`` or ``_``
    :returns: the longest truncation of ``language`` that is a key of
        ``LANGUAGES``, or ``None`` if there is none

    """
    parts = language.replace('-', '_').split('_')
    while parts:
        language = '_'.join(parts)
        if language in LANGUAGES:
            return language
        # drop the most specific part and try again
        parts.pop()
    return None


class AlternativeParser(object):
    """Parser of nonstandard hyphen pattern alternative.

    The instance returns a special int with data about the current position in
    the pattern when called with an odd value.

    """
    def __init__(self, pattern, alternative):
        """Parse the ``change,index,cut`` alternative of a pattern.

        :param pattern: pattern text found before the ``/``
        :param alternative: text found after the ``/``

        """
        alternative = alternative.split(',')
        self.change = alternative[0]
        self.index = int(alternative[1])
        self.cut = int(alternative[2])
        # the leading word-boundary dot counts as one more position
        if pattern.startswith('.'):
            self.index += 1

    def __call__(self, value):
        """Convert one pattern value, called once per position, in order.

        :param value: string of the value at the current position
        :returns: a ``DataInt`` carrying ``(change, index, cut)`` when the
            value is odd, a plain ``int`` otherwise

        """
        # each call moves one position forward, so the stored index stays
        # relative to the current position
        self.index -= 1
        value = int(value)
        if value & 1:
            return DataInt(value, (self.change, self.index, self.cut))
        else:
            return value


class DataInt(int):
    """``int`` with some other data can be stuck to in a ``data`` attribute."""
    def __new__(cls, value, data=None, reference=None):
        """Create a new ``DataInt``.

        Call with ``reference=dataint_object`` to use the data from another
        ``DataInt``.

        :param value: integer value of the new object
        :param data: data to attach when no usable reference is given
        :param reference: object whose ``data`` is copied if it is a truthy
            ``DataInt``

        """
        obj = int.__new__(cls, value)
        if reference and isinstance(reference, DataInt):
            obj.data = reference.data
        else:
            obj.data = data
        return obj


class HyphDict(object):
    """Hyphenation patterns."""

    def __init__(self, filename):
        """Read a ``hyph_*.dic`` and parse its patterns.

        :param filename: filename of hyph_*.dic to read

        The first line of the file is read as the name of its charset.

        """
        # maps the letters of a pattern to ``(start offset, values)``
        self.patterns = {}

        with open(filename, 'rb') as stream:
            # see "man 4 hunspell", iscii-devanagari is not supported by python
            charset = stream.readline().strip().decode('ascii')
            if charset.lower() == 'microsoft-cp1251':
                charset = 'cp1251'
            for pattern in stream:
                pattern = pattern.decode(charset).strip()
                # skip blank lines, comments and option lines
                if not pattern or pattern.startswith((
                        '%', '#', 'LEFTHYPHENMIN', 'RIGHTHYPHENMIN',
                        'COMPOUNDLEFTHYPHENMIN', 'COMPOUNDRIGHTHYPHENMIN')):
                    continue

                # replace ^^hh with the real character
                pattern = parse_hex(
                    lambda match: chr(int(match.group(1), 16)), pattern)

                # read nonstandard hyphen alternatives
                if '/' in pattern:
                    pattern, alternative = pattern.split('/', 1)
                    factory = AlternativeParser(pattern, alternative)
                else:
                    factory = int

                # the factory must be called once per position, in order
                tags, values = zip(*[
                    (string, factory(i or '0'))
                    for i, string in parse(pattern)])

                # if only zeros, skip this pattern
                if max(values) == 0:
                    continue

                # chop zeros from beginning and end, and store start offset
                start, end = 0, len(values)
                while not values[start]:
                    start += 1
                while not values[end - 1]:
                    end -= 1

                self.patterns[''.join(tags)] = start, values[start:end]

        # per-word cache of the computed points
        self.cache = {}
        # a dictionary without any usable pattern is valid: it simply never
        # hyphenates, so do not call max() on an empty sequence
        if self.patterns:
            self.maxlen = max(len(key) for key in self.patterns)
        else:
            self.maxlen = 0

    def positions(self, word):
        """Get a list of positions where the word can be hyphenated.

        :param word: unicode string of the word to hyphenate

        E.g. for the dutch word 'lettergrepen' this method returns ``[3, 6,
        9]``.

        Each position is a ``DataInt`` with a data attribute.

        If the data attribute is not ``None``, it contains a tuple with
        information about nonstandard hyphenation at that point: ``(change,
        index, cut)``.

        change
          a string like ``'ff=f'``, that describes how hyphenation should
          take place.

        index
          where to substitute the change, counting from the current point

        cut
          how many characters to remove while substituting the nonstandard
          hyphenation

        The returned list is the cached one: do not modify it in place.

        """
        word = word.lower()
        points = self.cache.get(word)
        if points is None:
            # dots mark the word boundaries, as in the patterns
            pointed_word = '.%s.' % word
            references = [0] * (len(pointed_word) + 1)

            # try each substring no longer than the longest pattern
            for i in range(len(pointed_word) - 1):
                for j in range(
                        i + 1, min(i + self.maxlen, len(pointed_word)) + 1):
                    pattern = self.patterns.get(pointed_word[i:j])
                    if pattern:
                        offset, values = pattern
                        slice_ = slice(i + offset, i + offset + len(values))
                        # keep the highest value at each position; on ties
                        # ``max`` returns the pattern value (first argument)
                        references[slice_] = [
                            max(value, reference)
                            for value, reference
                            in zip(values, references[slice_])]

            # odd values are hyphenation points; ``i - 1`` undoes the shift
            # caused by the leading dot
            points = [
                DataInt(i - 1, reference=reference)
                for i, reference in enumerate(references) if reference % 2]
            self.cache[word] = points
        return points


class Pyphen(object):
    """Hyphenation class, with methods to hyphenate strings in various ways."""

    def __init__(self, filename=None, lang=None, left=2, right=2, cache=True):
        """Create a hyphenation instance for given lang or filename.

        :param filename: filename of hyph_*.dic to read
        :param lang: lang of the included dict to use if no filename is given
        :param left: minimum number of characters of the first syllable
        :param right: minimum number of characters of the last syllable
        :param cache: if ``True``, use cached copy of the hyphenation patterns
        :raises KeyError: if no filename is given and no included dictionary
            matches ``lang``

        """
        if not filename:
            fallback = language_fallback(lang)
            if fallback is None:
                # same exception type as a failed LANGUAGES lookup, but with
                # a message naming the requested language
                raise KeyError(
                    'No dictionary found for language %r' % (lang,))
            filename = LANGUAGES[fallback]
        self.left = left
        self.right = right
        # with ``cache=False`` the file is read again and the shared cache
        # entry is refreshed
        if not cache or filename not in hdcache:
            hdcache[filename] = HyphDict(filename)
        self.hd = hdcache[filename]

    def positions(self, word):
        """Get a list of positions where the word can be hyphenated.

        :param word: unicode string of the word to hyphenate

        See also ``HyphDict.positions``. The points that are too far to the
        left or right are removed.

        """
        right = len(word) - self.right
        return [i for i in self.hd.positions(word) if self.left <= i <= right]

    def iterate(self, word):
        """Iterate over all hyphenation possibilities, the longest first.

        :param word: unicode string of the word to hyphenate

        Each possibility is a ``(first part, last part)`` tuple, without
        hyphen.

        """
        for position in reversed(self.positions(word)):
            if position.data:
                # get the nonstandard hyphenation data
                change, index, cut = position.data
                # the stored index is relative to the hyphenation point
                index += position
                if word.isupper():
                    change = change.upper()
                c1, c2 = change.split('=')
                yield word[:index] + c1, c2 + word[index + cut:]
            else:
                yield word[:position], word[position:]

    def wrap(self, word, width, hyphen='-'):
        """Get the longest possible first part and the last part of a word.

        :param word: unicode string of the word to hyphenate
        :param width: maximum length of the first part
        :param hyphen: unicode string used as hyphen character

        The first part has the hyphen already attached.

        Returns ``None`` if there is no hyphenation point before ``width``, or
        if the word could not be hyphenated.

        """
        # keep room for the hyphen in the first part
        width -= len(hyphen)
        for w1, w2 in self.iterate(word):
            if len(w1) <= width:
                return w1 + hyphen, w2
        return None

    def inserted(self, word, hyphen='-'):
        """Get the word as a string with all the possible hyphens inserted.

        :param word: unicode string of the word to hyphenate
        :param hyphen: unicode string used as hyphen character

        E.g. for the dutch word ``'lettergrepen'``, this method returns the
        unicode string ``'let-ter-gre-pen'``.  The hyphen string to use can be
        given as the second parameter, that defaults to ``'-'``.

        """
        word_list = list(word)
        # go from right to left so that earlier positions stay valid
        for position in reversed(self.positions(word)):
            if position.data:
                # get the nonstandard hyphenation data
                change, index, cut = position.data
                index += position
                if word.isupper():
                    change = change.upper()
                word_list[index:index + cut] = change.replace('=', hyphen)
            else:
                word_list.insert(position, hyphen)

        return ''.join(word_list)

    __call__ = iterate