1# -*- coding: utf-8 -*-
2
3# This work is licensed under the MIT License.
4# To view a copy of this license, visit https://opensource.org/licenses/MIT
5
6# Written by Abdullah Diab (mpcabd)
7# Email: mpcabd@gmail.com
8# Website: http://mpcabd.xyz
9
10from __future__ import unicode_literals
11
12import re
13
14from itertools import repeat
15
16from .ligatures import LIGATURES
17from .reshaper_config import auto_config
18from .letters import (UNSHAPED, ISOLATED, TATWEEL, ZWJ, LETTERS_ARABIC,
19                      LETTERS_ARABIC_V2, LETTERS_KURDISH, FINAL,
20                      INITIAL, MEDIAL, connects_with_letters_before_and_after,
21                      connects_with_letter_before, connects_with_letter_after)
22
# Matches exactly one haraka (diacritic mark) code point.  The character
# class is a plain list of code-point ranges; it is used both to detect
# harakat letter by letter and to strip them from a whole string.
#
# NOTE: '\u08d4-\u08e1' is fully contained in the following '\u08d4-\u08ed'
# range, which also overlaps '\u08e3-\u08ff'; together the last three ranges
# cover \u08d4-\u08ff contiguously.  The overlap is redundant but harmless.
HARAKAT_RE = re.compile(
    '['
    '\u0610-\u061a' '\u064b-\u065f' '\u0670'
    '\u06d6-\u06dc' '\u06df-\u06e8' '\u06ea-\u06ed'
    '\u08d4-\u08e1' '\u08d4-\u08ed' '\u08e3-\u08ff'
    ']',
    flags=re.UNICODE | re.VERBOSE
)
38
39
class ArabicReshaper(object):
    """
    A class for Arabic reshaper, it allows for fine-tune configuration over the
    API.

    If no configuration is passed to the constructor, the class will check for
    the environment variable
    :envvar:`PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE`. If the variable is
    available, the class will load the file pointed to by the variable, and
    will read it as an ini file.
    If the variable doesn't exist, the class will load with the default
    configuration file :file:`default-config.ini`

    Check these links for information on the configuration files format:

    * Python 3: https://docs.python.org/3/library/configparser.html
    * Python 2: https://docs.python.org/2/library/configparser.html

    See the default configuration file :file:`default-config.ini` for details
    on how to configure your reshaper.
    """

    def __init__(self, configuration=None, configuration_file=None):
        """
        Resolve the configuration and pick the letters table.

        Both arguments are forwarded unchanged to ``auto_config``. The
        ``language`` option of the resulting configuration selects the
        letters table: ``'ArabicV2'`` and ``'Kurdish'`` have dedicated
        tables, any other value falls back to the plain Arabic table.
        """
        super(ArabicReshaper, self).__init__()

        self.configuration = auto_config(configuration, configuration_file)
        self.language = self.configuration.get('language')

        if self.language == 'ArabicV2':
            self.letters = LETTERS_ARABIC_V2
        elif self.language == 'Kurdish':
            self.letters = LETTERS_KURDISH
        else:
            self.letters = LETTERS_ARABIC

    def _compile_ligatures(self):
        """
        Build and store the ligatures regex and its group-index map.

        Every record of ``LIGATURES`` whose name is enabled in the
        configuration contributes one capturing group to the regex; the
        map ``_re_group_index_to_ligature_forms`` associates the zero-based
        position of that group with the record's forms.
        """
        patterns = []
        re_group_index_to_ligature_forms = {}
        index = 0
        FORMS = 1
        MATCH = 0
        for ligature_record in LIGATURES:
            ligature, replacement = ligature_record
            if not self.configuration.getboolean(ligature):
                continue
            re_group_index_to_ligature_forms[index] = replacement[FORMS]
            patterns.append('({})'.format(replacement[MATCH]))
            index += 1
        self._re_group_index_to_ligature_forms = (
            re_group_index_to_ligature_forms
        )
        self.__ligatures_re = re.compile('|'.join(patterns), re.UNICODE)

    @property
    def _ligatures_re(self):
        """
        Compiled regex matching any enabled ligature, built on first use.

        The attribute is read through its (name-mangled) attribute access
        rather than ``hasattr(self, '__ligatures_re')``: a string passed to
        ``hasattr`` is not mangled, so that check could never succeed and
        the regex used to be rebuilt on every access.
        """
        try:
            return self.__ligatures_re
        except AttributeError:
            pass
        self._compile_ligatures()
        return self.__ligatures_re

    def _get_ligature_forms_from_re_group_index(self, group_index):
        """
        Return the ligature forms registered for regex group ``group_index``.

        ``group_index`` is the zero-based position of the capturing group
        inside ``_ligatures_re``. The map is built on demand if it does not
        exist yet. Raises ``KeyError`` for an unknown index.
        """
        if not hasattr(self, '_re_group_index_to_ligature_forms'):
            self._compile_ligatures()
        return self._re_group_index_to_ligature_forms[group_index]

    def reshape(self, text):
        """
        Return ``text`` with its letters replaced by their contextual forms.

        An empty (or otherwise falsy) ``text`` yields ``''``. Behaviour is
        driven by the boolean configuration options ``delete_harakat``,
        ``delete_tatweel``, ``support_zwj``, ``shift_harakat_position``,
        ``use_unshaped_instead_of_isolated`` and ``support_ligatures``.
        """
        if not text:
            return ''

        # Each entry is a (letter, form) pair; indexes below name the fields.
        output = []

        LETTER = 0
        FORM = 1
        NOT_SUPPORTED = -1

        delete_harakat = self.configuration.getboolean('delete_harakat')
        delete_tatweel = self.configuration.getboolean('delete_tatweel')
        support_zwj = self.configuration.getboolean('support_zwj')
        shift_harakat_position = self.configuration.getboolean(
            'shift_harakat_position'
        )
        use_unshaped_instead_of_isolated = self.configuration.getboolean(
            'use_unshaped_instead_of_isolated'
        )

        # Maps an index of `output` to the harakat that follow that entry;
        # key -1 holds harakat seen before any entry was emitted.
        positions_harakat = {}

        isolated_form = (UNSHAPED
                         if use_unshaped_instead_of_isolated else ISOLATED)

        for letter in text:
            if HARAKAT_RE.match(letter):
                if not delete_harakat:
                    position = len(output) - 1
                    if shift_harakat_position:
                        position -= 1
                    harakat = positions_harakat.setdefault(position, [])
                    if shift_harakat_position:
                        harakat.insert(0, letter)
                    else:
                        harakat.append(letter)
            elif letter == TATWEEL and delete_tatweel:
                pass
            elif letter == ZWJ and not support_zwj:
                pass
            elif letter not in self.letters:
                output.append((letter, NOT_SUPPORTED))
            elif not output:  # first letter
                output.append((letter, isolated_form))
            else:
                previous_letter = output[-1]
                if previous_letter[FORM] == NOT_SUPPORTED:
                    output.append((letter, isolated_form))
                elif not connects_with_letter_before(letter, self.letters):
                    output.append((letter, isolated_form))
                elif not connects_with_letter_after(
                        previous_letter[LETTER], self.letters):
                    output.append((letter, isolated_form))
                elif (previous_letter[FORM] == FINAL and not
                      connects_with_letters_before_and_after(
                          previous_letter[LETTER], self.letters
                )):
                    output.append((letter, isolated_form))
                elif previous_letter[FORM] == isolated_form:
                    # The previous letter stood alone: it now starts a
                    # connected run that the current letter closes.
                    output[-1] = (
                        previous_letter[LETTER],
                        INITIAL
                    )
                    output.append((letter, FINAL))
                # Otherwise, we will change the previous letter to connect
                # to the current letter
                else:
                    output[-1] = (
                        previous_letter[LETTER],
                        MEDIAL
                    )
                    output.append((letter, FINAL))

            # Remove ZWJ if it's the second to last item as it won't be useful
            if support_zwj and len(output) > 1 and output[-2][LETTER] == ZWJ:
                output.pop(len(output) - 2)

        if support_zwj and output and output[-1][LETTER] == ZWJ:
            output.pop()

        if self.configuration.getboolean('support_ligatures'):
            # Clean text from Harakat to be able to find ligatures
            text = HARAKAT_RE.sub('', text)

            # Clean text from Tatweel to find ligatures if delete_tatweel
            if delete_tatweel:
                text = text.replace(TATWEEL, '')

            # NOTE(review): match offsets are used as indexes into `output`,
            # but `text` still contains any ZWJ while `output` does not
            # (skipped or popped above) -- offsets may drift for input
            # containing ZWJ; confirm and realign if needed.
            for match in re.finditer(self._ligatures_re, text):
                group_index = next((
                    i for i, group in enumerate(match.groups()) if group
                ), -1)
                if group_index == -1:
                    # No group captured anything (e.g. no ligature is
                    # enabled, so the pattern is empty and matches the
                    # empty string everywhere): nothing to replace.
                    continue
                forms = self._get_ligature_forms_from_re_group_index(
                    group_index
                )
                a, b = match.span()
                a_form = output[a][FORM]
                b_form = output[b - 1][FORM]
                ligature_form = None

                # +-----------+----------+---------+---------+----------+
                # | a   \   b | ISOLATED | INITIAL | MEDIAL  | FINAL    |
                # +-----------+----------+---------+---------+----------+
                # | ISOLATED  | ISOLATED | INITIAL | INITIAL | ISOLATED |
                # | INITIAL   | ISOLATED | INITIAL | INITIAL | ISOLATED |
                # | MEDIAL    | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
                # | FINAL     | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
                # +-----------+----------+---------+---------+----------+

                if a_form in (isolated_form, INITIAL):
                    if b_form in (isolated_form, FINAL):
                        ligature_form = ISOLATED
                    else:
                        ligature_form = INITIAL
                else:
                    if b_form in (isolated_form, FINAL):
                        ligature_form = FINAL
                    else:
                        ligature_form = MEDIAL
                replacement = forms[ligature_form]
                if not replacement:
                    # This ligature has no glyph for the required form.
                    continue
                # The ligature takes the first slot; the remaining slots are
                # blanked (not removed) so later offsets stay valid.
                output[a] = (replacement, NOT_SUPPORTED)
                output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a)

        result = []
        if not delete_harakat and -1 in positions_harakat:
            result.extend(positions_harakat[-1])
        for i, o in enumerate(output):
            if o[LETTER]:
                if o[FORM] == NOT_SUPPORTED or o[FORM] == UNSHAPED:
                    # Emitted verbatim: unsupported characters, ligature
                    # glyphs, and letters kept in their unshaped form.
                    result.append(o[LETTER])
                else:
                    result.append(self.letters[o[LETTER]][o[FORM]])

            if not delete_harakat:
                if i in positions_harakat:
                    result.extend(positions_harakat[i])

        return ''.join(result)
240
241
# Module-level convenience API: one shared reshaper built with no explicit
# configuration, and its bound ``reshape`` method exposed as a plain function.
default_reshaper = ArabicReshaper()
reshape = default_reshaper.reshape
244