#!python3

"""
Text tools

Helpers to split text into sentences and paragraphs, to wrap it, and to
render grammar/spelling errors under the text they refer to.
"""

import re
import textwrap
from itertools import chain


# End of sentence: closing punctuation, an optional closing quote/bracket and
# spaces, provided that an uppercase letter follows (possibly after an opening
# quote or dash); or a colon/semicolon followed by spaces.
# The whitespace classes list the no-break space (U+00A0) and the narrow
# no-break space (U+202F) as explicit escapes, so that text such as
# '. «<NBSP>Word' is recognized and the pattern survives whitespace
# normalization of this file.
_zEndOfSentence = re.compile(r'[.?!…]+[»”’)]?[ \u00a0\u202f]+[»”’]?(?=[«"“‘–— \u00a0\u202f]*[A-ZÀÂÉÈÊÎÔÇ])|[:;][ \u00a0\u202f]+')


def getSentenceBoundaries(sText):
    """generator: returns start and end of sentences found in <sText>

    Yields (start, end) index pairs. The pairs are contiguous and cover the
    whole of <sText>; the last pair always ends at len(sText), so at least
    one pair is yielded, even for an empty string.
    """
    iStart = 0
    for m in _zEndOfSentence.finditer(sText):
        yield (iStart, m.end())
        iStart = m.end()
    yield (iStart, len(sText))


def getSentence(sText):
    """generator: returns sentences found in <sText>

    Each sentence keeps its trailing separator (punctuation and spaces), so
    joining the yielded strings gives back <sText>.
    """
    for iStart, iEnd in getSentenceBoundaries(sText):
        yield sText[iStart:iEnd]


def getParagraph(sText):
    """generator: returns paragraphs of text

    "\\r\\n" and "\\r" are treated like "\\n". Line breaks are not part of the
    yielded strings. A trailing line break yields a final empty paragraph,
    and an empty text yields one empty paragraph.
    """
    yield from sText.replace("\r\n", "\n").replace("\r", "\n").split("\n")


def wrap(sText, nWidth=80):
    """generator: returns text line by line

    Trailing "\\r" and "\\n" characters are stripped first. Lines are cut after
    the last space found within the first <nWidth> characters (the space stays
    at the end of the line), or at <nWidth> characters if there is no space.
    No other character is dropped, so the yielded lines concatenate back to
    the stripped text. At least one line is always yielded.

    Raises ValueError if <nWidth> is lower than 1 (a zero or negative width
    would otherwise loop forever). As this is a generator, the error is raised
    when the iteration starts.
    """
    if nWidth < 1:
        raise ValueError("invalid width {!r} (must be > 0)".format(nWidth))
    sText = sText.rstrip("\r\n")
    while len(sText) > nWidth:
        # rfind() returns -1 when no space is found, hence 0 after the +1
        nEnd = sText.rfind(" ", 0, nWidth) + 1
        if nEnd <= 0:
            nEnd = nWidth
        yield sText[:nEnd]
        sText = sText[nEnd:]
    yield sText


def generateParagraph(sParagraph, aGrammErrs, aSpellErrs, nWidth=100):
    """Returns a text with readable errors

    <sParagraph> is wrapped at <nWidth> characters. Under each line, a marker
    line underlines grammar errors with "^" and spelling errors with "°",
    followed by the readable description of the errors starting on that line.

    <aGrammErrs> and <aSpellErrs> are iterables of dictionaries which must
    have the keys "nStart" and "nEnd" (positions in <sParagraph>).
    Each dictionary receives a new key "iError" (1-based rank by start
    position, all errors merged).

    Returns a tuple (text, list of all errors sorted by start position).
    Returns ("", []) if <sParagraph> is empty.
    """
    if not sParagraph:
        return ("", [])
    lGrammErrs = sorted(aGrammErrs, key=lambda d: d["nStart"])
    lSpellErrs = sorted(aSpellErrs, key=lambda d: d["nStart"])
    lErrors = sorted(lGrammErrs + lSpellErrs, key=lambda d: d["nStart"])
    for n, dErr in enumerate(lErrors, 1):
        dErr["iError"] = n
    lParts = []
    nOffset = 0     # position in <sParagraph> of the beginning of the current line
    # the local wrap() is used because it drops no character inside the text:
    # line lengths can be summed to convert paragraph positions to line positions
    for sLine in wrap(sParagraph, nWidth):
        lParts.append(sLine + "\n")
        nLineLen = len(sLine)
        sErrLine = ""
        # grammar errors starting on this line (lists are sorted: stop at the first one beyond)
        nGrammErr = 0
        for dErr in lGrammErrs:
            nStart = dErr["nStart"] - nOffset
            if nStart >= nLineLen:
                break
            nGrammErr += 1
            # an error starting inside an already underlined span is not drawn again
            if nStart >= len(sErrLine):
                sErrLine += " " * (nStart - len(sErrLine)) + "^" * (dErr["nEnd"] - dErr["nStart"])
        # spelling errors starting on this line: their marks overwrite what is already drawn
        nSpellErr = 0
        for dErr in lSpellErrs:
            nStart = dErr["nStart"] - nOffset
            if nStart >= nLineLen:
                break
            nSpellErr += 1
            nEnd = dErr["nEnd"] - nOffset
            if nEnd > len(sErrLine):
                sErrLine += " " * (nEnd - len(sErrLine))
            sErrLine = sErrLine[:nStart] + "°" * (nEnd - nStart) + sErrLine[nEnd:]
        if sErrLine:
            lParts.append(sErrLine + "\n")
        # errors handled on this line are removed, so that the next line starts with its own errors
        if nGrammErr:
            lParts.append(getReadableErrors(lGrammErrs[:nGrammErr], nWidth))
            del lGrammErrs[:nGrammErr]
        if nSpellErr:
            lParts.append(getReadableErrors(lSpellErrs[:nSpellErr], nWidth, True))
            del lSpellErrs[:nSpellErr]
        nOffset += nLineLen
    return ("".join(lParts), lErrors)


def getReadableErrors(lErrs, nWidth, bSpellingError=False):
    """Returns lErrs errors as readable errors

    Each line of each error description is wrapped at <nWidth> characters.
    Spelling errors without the key "aSuggestions" are skipped.
    A non-empty result ends with an empty line; the result is an empty string
    if no error is described.
    """
    lParts = []
    for dErr in lErrs:
        if bSpellingError and "aSuggestions" not in dErr:
            continue
        for sMsgLine in getReadableError(dErr, bSpellingError).split("\n"):
            lParts.append("\n".join(textwrap.wrap(sMsgLine, nWidth, subsequent_indent=" ")) + "\n")
    if lParts:
        lParts.append("\n")
    return "".join(lParts)


def getReadableError(dErr, bSpellingError=False):
    """Returns an error dErr as a readable error

    Header keys: "iError", "nStart", "nEnd", plus "sValue" for a spelling
    error, or "sLineId" and "sRuleId" for a grammar error. If one of them is
    missing, a "Non-compliant error" description is returned instead.
    Optional keys: "sMessage" (grammar errors only), "aSuggestions", "URL".
    """
    try:
        if bSpellingError:
            sText = "* {iError} [{nStart}:{nEnd}] # {sValue}:".format(**dErr)
        else:
            sText = "* {iError} [{nStart}:{nEnd}] # {sLineId} / {sRuleId}:\n".format(**dErr)
    except KeyError:
        return "* Non-compliant error: {}".format(dErr)
    if not bSpellingError:
        sText += " " + dErr.get("sMessage", "# error : message not found")
    if dErr.get("aSuggestions", None):
        sText += "\n > Suggestions : " + " | ".join(dErr["aSuggestions"])
    if dErr.get("URL", None):
        sText += "\n > URL: " + dErr["URL"]
    return sText


def createParagraphWithLines(lLine):
    """Returns a text as merged lines and a set of data about lines (line_number_y, start_x, end_x)

    <lLine> is a sequence of (line number, line) pairs. Trailing "\\r" and "\\n"
    are removed from each line. A space is appended to every line but the last
    one, unless the line already ends with a space, a no-break space, a hyphen
    or a dash. start_x and end_x are the positions of the line (with the
    appended space, if any) in the merged text.
    """
    lParts = []
    lLineSet = []
    nLastLine = len(lLine)
    n = 0
    for nLine, (iLineNumber, sLine) in enumerate(lLine, 1):
        sLine = sLine.rstrip("\r\n")
        # the no-break space is written as an escape to remain distinguishable from the plain space
        if nLine < nLastLine and not sLine.endswith((" ", "\u00a0", "-", "–", "—")):
            sLine += " "
        lLineSet.append((iLineNumber, n, n + len(sLine)))
        n += len(sLine)
        lParts.append(sLine)
    return "".join(lParts), lLineSet


def convertToXY(aGrammErrs, aSpellErrs, lLineSet):
    """Converts errors position as an y and x position in a text (y is line number, x is the offset in this line).

    lLineSet is a list of tuples (line_number_y, start_x, end_x) describing how the paragraph is divided.

    Error dictionaries are modified in place: "nEndY"/"nEndX" and
    "nStartY"/"nStartX" are added when a matching line is found, then "nStart"
    and "nEnd" are always removed. If "nEnd" is beyond the end of the last
    line, "nEndY" and "nEndX" are not set.

    Returns the tuple (aGrammErrs, aSpellErrs), the same objects as received.
    """
    for dErr in chain(aGrammErrs, aSpellErrs):
        # <i> remains 0 if <lLineSet> is empty, else it is the 1-based rank of the
        # line where the error ends (or of the last line if no line matches)
        i = 0
        for i, elem in enumerate(lLineSet, 1):
            if dErr['nEnd'] <= elem[2]:
                dErr['nEndY'] = elem[0]
                dErr['nEndX'] = dErr['nEnd'] - elem[1]
                break
        if i:
            # the error starts on the ending line or on a previous one: search backwards
            for elem in reversed(lLineSet[:i]):
                if dErr['nStart'] >= elem[1]:
                    dErr['nStartY'] = elem[0]
                    dErr['nStartX'] = dErr['nStart'] - elem[1]
                    break
        del dErr['nStart']
        del dErr['nEnd']
    return aGrammErrs, aSpellErrs