1# -*- coding: utf-8 -*- 2 3# This work is licensed under the MIT License. 4# To view a copy of this license, visit https://opensource.org/licenses/MIT 5 6# Written by Abdullah Diab (mpcabd) 7# Email: mpcabd@gmail.com 8# Website: http://mpcabd.xyz 9 10from __future__ import unicode_literals 11 12import re 13 14from itertools import repeat 15 16from .ligatures import LIGATURES 17from .reshaper_config import auto_config 18from .letters import (UNSHAPED, ISOLATED, TATWEEL, ZWJ, LETTERS_ARABIC, 19 LETTERS_ARABIC_V2, LETTERS_KURDISH, FINAL, 20 INITIAL, MEDIAL, connects_with_letters_before_and_after, 21 connects_with_letter_before, connects_with_letter_after) 22 23HARAKAT_RE = re.compile( 24 '[' 25 '\u0610-\u061a' 26 '\u064b-\u065f' 27 '\u0670' 28 '\u06d6-\u06dc' 29 '\u06df-\u06e8' 30 '\u06ea-\u06ed' 31 '\u08d4-\u08e1' 32 '\u08d4-\u08ed' 33 '\u08e3-\u08ff' 34 ']', 35 36 re.UNICODE | re.X 37) 38 39 40class ArabicReshaper(object): 41 """ 42 A class for Arabic reshaper, it allows for fine-tune configuration over the 43 API. 44 45 If no configuration is passed to the constructor, the class will check for 46 an environment variable :envvar:`PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE` 47 , if the variable is available, the class will load the file pointed to by 48 the variable, and will read it as an ini file. 49 If the variable doesn't exist, the class will load with the default 50 configuration file :file:`default-config.ini` 51 52 Check these links for information on the configuration files format: 53 54 * Python 3: https://docs.python.org/3/library/configparser.html 55 * Python 2: https://docs.python.org/2/library/configparser.html 56 57 See the default configuration file :file:`default-config.ini` for details 58 on how to configure your reshaper. 59 """ 60 61 def __init__(self, configuration=None, configuration_file=None): 62 super(ArabicReshaper, self).__init__() 63 64 self.configuration = auto_config(configuration, configuration_file) 65 self.language = self.configuration.get('language') 66 67 if self.language == 'ArabicV2': 68 self.letters = LETTERS_ARABIC_V2 69 elif self.language == 'Kurdish': 70 self.letters = LETTERS_KURDISH 71 else: 72 self.letters = LETTERS_ARABIC 73 74 @property 75 def _ligatures_re(self): 76 if not hasattr(self, '__ligatures_re'): 77 patterns = [] 78 re_group_index_to_ligature_forms = {} 79 index = 0 80 FORMS = 1 81 MATCH = 0 82 for ligature_record in LIGATURES: 83 ligature, replacement = ligature_record 84 if not self.configuration.getboolean(ligature): 85 continue 86 re_group_index_to_ligature_forms[index] = replacement[FORMS] 87 patterns.append('({})'.format(replacement[MATCH])) 88 index += 1 89 self._re_group_index_to_ligature_forms = ( 90 re_group_index_to_ligature_forms 91 ) 92 self.__ligatures_re = re.compile('|'.join(patterns), re.UNICODE) 93 return self.__ligatures_re 94 95 def _get_ligature_forms_from_re_group_index(self, group_index): 96 if not hasattr(self, '_re_group_index_to_ligature_forms'): 97 return self._ligatures_re 98 return self._re_group_index_to_ligature_forms[group_index] 99 100 def reshape(self, text): 101 if not text: 102 return '' 103 104 output = [] 105 106 LETTER = 0 107 FORM = 1 108 NOT_SUPPORTED = -1 109 110 delete_harakat = self.configuration.getboolean('delete_harakat') 111 delete_tatweel = self.configuration.getboolean('delete_tatweel') 112 support_zwj = self.configuration.getboolean('support_zwj') 113 shift_harakat_position = self.configuration.getboolean( 114 'shift_harakat_position' 115 ) 116 use_unshaped_instead_of_isolated = self.configuration.getboolean( 117 'use_unshaped_instead_of_isolated' 118 ) 119 120 positions_harakat = {} 121 122 isolated_form = (UNSHAPED 123 if use_unshaped_instead_of_isolated else ISOLATED) 124 125 for letter in text: 126 if HARAKAT_RE.match(letter): 127 if not delete_harakat: 128 position = len(output) - 1 129 if shift_harakat_position: 130 position -= 1 131 if position not in positions_harakat: 132 positions_harakat[position] = [] 133 if shift_harakat_position: 134 positions_harakat[position].insert(0, letter) 135 else: 136 positions_harakat[position].append(letter) 137 elif letter == TATWEEL and delete_tatweel: 138 pass 139 elif letter == ZWJ and not support_zwj: 140 pass 141 elif letter not in self.letters: 142 output.append((letter, NOT_SUPPORTED)) 143 elif not output: # first letter 144 output.append((letter, isolated_form)) 145 else: 146 previous_letter = output[-1] 147 if previous_letter[FORM] == NOT_SUPPORTED: 148 output.append((letter, isolated_form)) 149 elif not connects_with_letter_before(letter, self.letters): 150 output.append((letter, isolated_form)) 151 elif not connects_with_letter_after( 152 previous_letter[LETTER], self.letters): 153 output.append((letter, isolated_form)) 154 elif (previous_letter[FORM] == FINAL and not 155 connects_with_letters_before_and_after( 156 previous_letter[LETTER], self.letters 157 )): 158 output.append((letter, isolated_form)) 159 elif previous_letter[FORM] == isolated_form: 160 output[-1] = ( 161 previous_letter[LETTER], 162 INITIAL 163 ) 164 output.append((letter, FINAL)) 165 # Otherwise, we will change the previous letter to connect 166 # to the current letter 167 else: 168 output[-1] = ( 169 previous_letter[LETTER], 170 MEDIAL 171 ) 172 output.append((letter, FINAL)) 173 174 # Remove ZWJ if it's the second to last item as it won't be useful 175 if support_zwj and len(output) > 1 and output[-2][LETTER] == ZWJ: 176 output.pop(len(output) - 2) 177 178 if support_zwj and output and output[-1][LETTER] == ZWJ: 179 output.pop() 180 181 if self.configuration.getboolean('support_ligatures'): 182 # Clean text from Harakat to be able to find ligatures 183 text = HARAKAT_RE.sub('', text) 184 185 # Clean text from Tatweel to find ligatures if delete_tatweel 186 if delete_tatweel: 187 text = text.replace(TATWEEL, '') 188 189 for match in re.finditer(self._ligatures_re, text): 190 group_index = next(( 191 i for i, group in enumerate(match.groups()) if group 192 ), -1) 193 forms = self._get_ligature_forms_from_re_group_index( 194 group_index 195 ) 196 a, b = match.span() 197 a_form = output[a][FORM] 198 b_form = output[b - 1][FORM] 199 ligature_form = None 200 201 # +-----------+----------+---------+---------+----------+ 202 # | a \ b | ISOLATED | INITIAL | MEDIAL | FINAL | 203 # +-----------+----------+---------+---------+----------+ 204 # | ISOLATED | ISOLATED | INITIAL | INITIAL | ISOLATED | 205 # | INITIAL | ISOLATED | INITIAL | INITIAL | ISOLATED | 206 # | MEDIAL | FINAL | MEDIAL | MEDIAL | FINAL | 207 # | FINAL | FINAL | MEDIAL | MEDIAL | FINAL | 208 # +-----------+----------+---------+---------+----------+ 209 210 if a_form in (isolated_form, INITIAL): 211 if b_form in (isolated_form, FINAL): 212 ligature_form = ISOLATED 213 else: 214 ligature_form = INITIAL 215 else: 216 if b_form in (isolated_form, FINAL): 217 ligature_form = FINAL 218 else: 219 ligature_form = MEDIAL 220 if not forms[ligature_form]: 221 continue 222 output[a] = (forms[ligature_form], NOT_SUPPORTED) 223 output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a) 224 225 result = [] 226 if not delete_harakat and -1 in positions_harakat: 227 result.extend(positions_harakat[-1]) 228 for i, o in enumerate(output): 229 if o[LETTER]: 230 if o[FORM] == NOT_SUPPORTED or o[FORM] == UNSHAPED: 231 result.append(o[LETTER]) 232 else: 233 result.append(self.letters[o[LETTER]][o[FORM]]) 234 235 if not delete_harakat: 236 if i in positions_harakat: 237 result.extend(positions_harakat[i]) 238 239 return ''.join(result) 240 241 242default_reshaper = ArabicReshaper() 243reshape = default_reshaper.reshape 244