1#!python3
2
3"""
4Text tools
5"""
6
7import re
8import textwrap
9from itertools import chain
10
11
# Pattern matching the separator between two sentences. Two alternatives:
#  - one or more sentence-ending punctuation marks (. ? ! …), optionally followed by a closing
#    quote or parenthesis, then one or more space characters and an optional closing quote,
#    provided that an uppercase letter follows (possibly after opening quotes, dashes or spaces);
#  - a colon or a semicolon followed by one or more space characters.
# NOTE(review): the space character classes contain several distinct whitespace characters
# which look alike in most editors; do not retype them.
_zEndOfSentence = re.compile(r'[.?!…]+[»”’)]?[   ]+[»”’]?(?=[«"“‘–—   ]*[A-ZÀÂÉÈÊÎÔÇ])|[:;][   ]+')
13
def getSentenceBoundaries (sText):
    """generator: yields (start, end) offsets of each sentence found in <sText>

    A sentence ends where a match of <_zEndOfSentence> ends, so the separator belongs to the
    sentence that precedes it. The last tuple always runs up to the end of <sText>, which
    means that at least one tuple is yielded, even for an empty text.
    """
    nPrev = 0
    for nNext in (xMatch.end() for xMatch in _zEndOfSentence.finditer(sText)):
        yield (nPrev, nNext)
        nPrev = nNext
    # whatever remains after the last separator (possibly nothing)
    yield (nPrev, len(sText))
21
22
def getSentence (sText):
    """generator: yields each sentence found in <sText>

    Sentences are the slices of <sText> delimited by getSentenceBoundaries().
    """
    yield from (sText[nFrom:nTo]  for nFrom, nTo in getSentenceBoundaries(sText))
27
28
def getParagraph (sText):
    """generator: yields the paragraphs of <sText>, one per line of text

    CRLF and lone CR are first converted to LF, then the text is cut at every LF.
    The line breaks are not part of the yielded paragraphs. Empty paragraphs are yielded too,
    so an empty text gives one empty paragraph.
    """
    sNormalized = sText.replace("\r\n", "\n").replace("\r", "\n")
    yield from sNormalized.split("\n")
39
40
def wrap (sText, nWidth=80):
    """generator: yields <sText> line by line, each line being at most <nWidth> characters long

    Trailing CR/LF characters are stripped first. A line is cut just after the last space found
    within its first <nWidth> characters (the space stays at the end of the line); when there is
    no such space, the line is cut after exactly <nWidth> characters.
    No character is dropped: joining the yielded lines gives back the stripped text.
    At least one line is always yielded (an empty one for an empty text).

    Raises ValueError if <nWidth> is lower than 1: no line could ever be cut, and the generator
    would yield empty strings forever.
    """
    if nWidth < 1:
        raise ValueError("invalid width {!r} (must be > 0)".format(nWidth))
    sText = sText.rstrip("\r\n")
    while len(sText) > nWidth:
        # offset just after the last space within the first <nWidth> characters, 0 if none
        nEnd = sText.rfind(" ", 0, nWidth) + 1
        if nEnd <= 0:
            # no space to break on: hard cut
            nEnd = nWidth
        yield sText[:nEnd]
        sText = sText[nEnd:]
    yield sText
56
57
def generateParagraph (sParagraph, aGrammErrs, aSpellErrs, nWidth=100):
    """Returns a tuple (text, errors) where <text> displays <sParagraph> with readable errors

    <sParagraph> is wrapped with wrap() to <nWidth> characters. Each wrapped line is followed by:
      - a line of markers ("^" under grammar errors, "°" under spelling errors), if any;
      - the description of the grammar errors starting on this line;
      - the description of the spelling errors starting on this line.
    <errors> is the list of all error dictionaries sorted by their "nStart" value.

    Each error dictionary must have "nStart" and "nEnd" keys (offsets within <sParagraph>).
    Side effect: an "iError" key (rank of the error, starting at 1) is set on each dictionary.
    If <sParagraph> is empty, ("", []) is returned.
    """
    if not sParagraph:
        return ("", [])

    def _getStart (dErr):
        return dErr["nStart"]

    lGramm = sorted(aGrammErrs, key=_getStart)
    lSpell = sorted(aSpellErrs, key=_getStart)
    lAllErrors = sorted(lGramm + lSpell, key=_getStart)
    for iRank, dErr in enumerate(lAllErrors, 1):
        dErr["iError"] = iRank

    lOutput = []
    nShift = 0      # offset of the current line within the paragraph
    for sLine in wrap(sParagraph, nWidth):
        lOutput.append(sLine + "\n")
        nLen = len(sLine)
        sMarks = ""
        # grammar errors starting on this line: carets are appended, never overlapped
        nGrammCount = 0
        for dErr in lGramm:
            nPos = dErr["nStart"] - nShift
            if nPos >= nLen:
                break
            nGrammCount += 1
            if nPos >= len(sMarks):
                sMarks += " " * (nPos - len(sMarks)) + "^" * (dErr["nEnd"] - dErr["nStart"])
        # spelling errors starting on this line: their marks overwrite what is already there
        nSpellCount = 0
        for dErr in lSpell:
            nPos = dErr['nStart'] - nShift
            if nPos >= nLen:
                break
            nSpellCount += 1
            nStop = dErr['nEnd'] - nShift
            sMarks = sMarks.ljust(nStop)
            sMarks = sMarks[:nPos] + "°" * (nStop - nPos) + sMarks[nStop:]
        if sMarks:
            lOutput.append(sMarks + "\n")
        # errors which have been displayed are removed from the pending lists
        if nGrammCount:
            lOutput.append(getReadableErrors(lGramm[:nGrammCount], nWidth))
            del lGramm[:nGrammCount]
        if nSpellCount:
            lOutput.append(getReadableErrors(lSpell[:nSpellCount], nWidth, True))
            del lSpell[:nSpellCount]
        # wrap() drops no character, so line lengths add up to the offset of the next line
        nShift += nLen
    return ("".join(lOutput), lAllErrors)
105
106
def getReadableErrors (lErrs, nWidth, bSpellingError=False):
    """Returns the errors of <lErrs> as a readable text wrapped to <nWidth> characters

    Each error is converted with getReadableError(). The first line of the result is wrapped
    with a 2-space indentation for continuation lines, the other lines with 4 spaces.
    When <bSpellingError> is True, errors without an "aSuggestions" key are skipped.
    The returned text ends with an empty line, unless no error has been written, in which case
    an empty string is returned.
    """
    lChunks = []
    for dErr in lErrs:
        if bSpellingError and "aSuggestions" not in dErr:
            continue
        lParts = getReadableError(dErr, bSpellingError).split("\n")
        sTitle = lParts[0]
        lChunks.append("\n".join(textwrap.wrap(sTitle, nWidth, subsequent_indent="  ")) + "\n")
        for sDetail in lParts[1:]:
            lChunks.append("\n".join(textwrap.wrap(sDetail, nWidth, subsequent_indent="    ")) + "\n")
    sErrors = "".join(lChunks)
    if sErrors:
        return sErrors + "\n"
    return sErrors
119
120
def getReadableError (dErr, bSpellingError=False):
    """Returns the error <dErr> (a dictionary) as a readable text

    Spelling error: one title line built from "iError", "nStart", "nEnd" and "sValue".
    Grammar error: one title line built from "iError", "nStart", "nEnd", "sLineId" and
    "sRuleId", then the value of "sMessage" (or a default text if there is none).
    In both cases, a line of suggestions and a line for the URL follow if "aSuggestions"
    and "URL" have a non-empty value.
    If a key required by the title line is missing, a "Non-compliant error" text is returned.
    """
    try:
        if bSpellingError:
            lParts = [ "* {iError} [{nStart}:{nEnd}]  # {sValue}:".format(**dErr) ]
        else:
            lParts = [
                "* {iError} [{nStart}:{nEnd}]  # {sLineId} / {sRuleId}:\n".format(**dErr),
                "  ",
                dErr.get("sMessage", "# error : message not found")
            ]
        aSugg = dErr.get("aSuggestions", None)
        if aSugg:
            lParts += [ "\n  > Suggestions : ", " | ".join(aSugg) ]
        sURL = dErr.get("URL", None)
        if sURL:
            lParts += [ "\n  > URL: ", sURL ]
        return "".join(lParts)
    except KeyError:
        return "* Non-compliant error: {}".format(dErr)
136
137
def createParagraphWithLines (lLine):
    """Returns a tuple (text, line data) built from <lLine>, a list of (line number, line text)

    <text> is the concatenation of all lines, without their trailing CR/LF characters.
    A space is added at the end of every line but the last one, unless the line already ends
    with one of the characters listed in the endswith() call below (spaces, hyphen, dashes).
    <line data> is a list of tuples (line_number_y, start_x, end_x), one per line, where
    start_x and end_x are the offsets of the line (added space included) within <text>.
    """
    lChunks = []
    lLineSet = []
    nPos = 0
    for nRank, (iLineNumber, sLine) in enumerate(lLine, 1):
        sChunk = sLine.rstrip("\r\n")
        # not the last line: make sure words of consecutive lines don’t get glued together
        if nRank < len(lLine) and not sChunk.endswith((" ", " ", "-", "–", "—")):
            sChunk = sChunk + " "
        nNextPos = nPos + len(sChunk)
        lLineSet.append((iLineNumber, nPos, nNextPos))
        lChunks.append(sChunk)
        nPos = nNextPos
    return "".join(lChunks), lLineSet
153
154
def convertToXY (aGrammErrs, aSpellErrs, lLineSet):
    """Converts the position of errors within a paragraph to (y, x) positions within lines
       (y is the line number, x is the offset within this line).
       <lLineSet> is a list of tuples (line_number_y, start_x, end_x) describing how the
       paragraph is divided into lines.

       For each error dictionary, "nEndY" and "nEndX" are set from the first line whose end_x
       is not lower than "nEnd"; "nStartY" and "nStartX" are set from the last line, up to
       that one, whose start_x is not greater than "nStart". If no line fits, the keys
       are not set. "nStart" and "nEnd" are removed from the dictionary in every case.
       The dictionaries are modified in place; both collections are returned as received."""
    for dErr in chain(aGrammErrs, aSpellErrs):
        # number of lines examined while looking for the end of the error
        nSeen = 0
        for nSeen, tLine in enumerate(lLineSet, 1):
            if tLine[2] >= dErr['nEnd']:
                dErr['nEndY'] = tLine[0]
                dErr['nEndX'] = dErr['nEnd'] - tLine[1]
                break
        if nSeen > 0:
            # the error starts on the line where it ends or on a previous one: search backwards
            for tLine in reversed(lLineSet[:nSeen]):
                if tLine[1] <= dErr['nStart']:
                    dErr['nStartY'] = tLine[0]
                    dErr['nStartX'] = dErr['nStart'] - tLine[1]
                    break
        for sKey in ('nStart', 'nEnd'):
            del dErr[sKey]
    return aGrammErrs, aSpellErrs
174