1 // Scintilla source code edit control
2 /** @file LexPython.cxx
3 ** Lexer for Python.
4 **/
5 // Copyright 1998-2002 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7
8 #include <cstdlib>
9 #include <cassert>
10 #include <cstring>
11
12 #include <string>
13 #include <vector>
14 #include <map>
15 #include <algorithm>
16
17 #include "ILexer.h"
18 #include "Scintilla.h"
19 #include "SciLexer.h"
20
21 #include "StringCopy.h"
22 #include "WordList.h"
23 #include "LexAccessor.h"
24 #include "Accessor.h"
25 #include "StyleContext.h"
26 #include "CharacterSet.h"
27 #include "CharacterCategory.h"
28 #include "LexerModule.h"
29 #include "OptionSet.h"
30 #include "SubStyles.h"
31 #include "DefaultLexer.h"
32
33 using namespace Scintilla;
34
35 namespace {
36 // Use an unnamed namespace to protect the functions and classes from name conflicts
37
38 /* Notes on f-strings: f-strings are strings prefixed with f (e.g. f'') that may
39 have arbitrary expressions in {}. The tokens in the expressions are lexed as if
40 they were outside of any string. Expressions may contain { and } characters as
41 long as there is a closing } for every {, may be 2+ lines in a triple quoted
42 string, and may have a formatting specifier following a ! or :, but both !
43 and : are valid inside of a bracketed expression and != is a valid
44 expression token even outside of a bracketed expression.
45
46 When in an f-string expression, the lexer keeps track of the state value of
47 the f-string and the nesting count for the expression (# of [, (, { seen - # of
48 }, ), ] seen). f-strings may be nested (e.g. f'{ a + f"{1+2}"') so a stack of
49 states and nesting counts is kept. If a f-string expression continues beyond
50 the end of a line, this stack is saved in a std::map that maps a line number to
51 the stack at the end of that line. std::vector is used for the stack.
52
53 The PEP for f-strings is at https://www.python.org/dev/peps/pep-0498/
54 */
// One entry of the f-string expression stack: records the string state to
// return to and how deeply bracketed the current expression is.
struct SingleFStringExpState {
	// Lexer state (an SCE_P_F* value) of the f-string that contains the expression.
	int state;
	// Count of (, [, { seen minus ), ], } seen inside the expression so far.
	int nestingCount;
};
59
/* kwCDef, kwCTypeName only used for Cython */
// Records which keyword was seen most recently so the identifier that
// follows it can be styled accordingly.
enum kwType { kwOther, kwClass, kwDef, kwImport, kwCDef, kwCTypeName, kwCPDef };

// Bit flags selecting which string prefix letters (u, b, f) are recognised.
enum literalsAllowed { litNone = 0, litU = 1, litB = 2, litF = 4 };

// Indicator number passed to IndicatorFill when marking indentation.
constexpr int indicatorWhitespace = 1;
66
IsPyComment(Accessor & styler,Sci_Position pos,Sci_Position len)67 bool IsPyComment(Accessor &styler, Sci_Position pos, Sci_Position len) {
68 return len > 0 && styler[pos] == '#';
69 }
70
IsPyStringTypeChar(int ch,literalsAllowed allowed)71 bool IsPyStringTypeChar(int ch, literalsAllowed allowed) noexcept {
72 return
73 ((allowed & litB) && (ch == 'b' || ch == 'B')) ||
74 ((allowed & litU) && (ch == 'u' || ch == 'U')) ||
75 ((allowed & litF) && (ch == 'f' || ch == 'F'));
76 }
77
IsPyStringStart(int ch,int chNext,int chNext2,literalsAllowed allowed)78 bool IsPyStringStart(int ch, int chNext, int chNext2, literalsAllowed allowed) noexcept {
79 if (ch == '\'' || ch == '"')
80 return true;
81 if (IsPyStringTypeChar(ch, allowed)) {
82 if (chNext == '"' || chNext == '\'')
83 return true;
84 if ((chNext == 'r' || chNext == 'R') && (chNext2 == '"' || chNext2 == '\''))
85 return true;
86 }
87 if ((ch == 'r' || ch == 'R') && (chNext == '"' || chNext == '\''))
88 return true;
89
90 return false;
91 }
92
IsPyFStringState(int st)93 bool IsPyFStringState(int st) noexcept {
94 return ((st == SCE_P_FCHARACTER) || (st == SCE_P_FSTRING) ||
95 (st == SCE_P_FTRIPLE) || (st == SCE_P_FTRIPLEDOUBLE));
96 }
97
IsPySingleQuoteStringState(int st)98 bool IsPySingleQuoteStringState(int st) noexcept {
99 return ((st == SCE_P_CHARACTER) || (st == SCE_P_STRING) ||
100 (st == SCE_P_FCHARACTER) || (st == SCE_P_FSTRING));
101 }
102
IsPyTripleQuoteStringState(int st)103 bool IsPyTripleQuoteStringState(int st) noexcept {
104 return ((st == SCE_P_TRIPLE) || (st == SCE_P_TRIPLEDOUBLE) ||
105 (st == SCE_P_FTRIPLE) || (st == SCE_P_FTRIPLEDOUBLE));
106 }
107
GetPyStringQuoteChar(int st)108 char GetPyStringQuoteChar(int st) noexcept {
109 if ((st == SCE_P_CHARACTER) || (st == SCE_P_FCHARACTER) ||
110 (st == SCE_P_TRIPLE) || (st == SCE_P_FTRIPLE))
111 return '\'';
112 if ((st == SCE_P_STRING) || (st == SCE_P_FSTRING) ||
113 (st == SCE_P_TRIPLEDOUBLE) || (st == SCE_P_FTRIPLEDOUBLE))
114 return '"';
115
116 return '\0';
117 }
118
PushStateToStack(int state,std::vector<SingleFStringExpState> & stack,SingleFStringExpState * & currentFStringExp)119 void PushStateToStack(int state, std::vector<SingleFStringExpState> &stack, SingleFStringExpState *¤tFStringExp) {
120 SingleFStringExpState single = {state, 0};
121 stack.push_back(single);
122
123 currentFStringExp = &stack.back();
124 }
125
PopFromStateStack(std::vector<SingleFStringExpState> & stack,SingleFStringExpState * & currentFStringExp)126 int PopFromStateStack(std::vector<SingleFStringExpState> &stack, SingleFStringExpState *¤tFStringExp) noexcept {
127 int state = 0;
128
129 if (!stack.empty()) {
130 state = stack.back().state;
131 stack.pop_back();
132 }
133
134 if (stack.empty()) {
135 currentFStringExp = nullptr;
136 } else {
137 currentFStringExp = &stack.back();
138 }
139
140 return state;
141 }
142
143 /* Return the state to use for the string starting at i; *nextIndex will be set to the first index following the quote(s) */
GetPyStringState(Accessor & styler,Sci_Position i,Sci_PositionU * nextIndex,literalsAllowed allowed)144 int GetPyStringState(Accessor &styler, Sci_Position i, Sci_PositionU *nextIndex, literalsAllowed allowed) {
145 char ch = styler.SafeGetCharAt(i);
146 char chNext = styler.SafeGetCharAt(i + 1);
147 const int firstIsF = (ch == 'f' || ch == 'F');
148
149 // Advance beyond r, u, or ur prefix (or r, b, or br in Python 2.7+ and r, f, or fr in Python 3.6+), but bail if there are any unexpected chars
150 if (ch == 'r' || ch == 'R') {
151 i++;
152 ch = styler.SafeGetCharAt(i);
153 chNext = styler.SafeGetCharAt(i + 1);
154 } else if (IsPyStringTypeChar(ch, allowed)) {
155 if (chNext == 'r' || chNext == 'R')
156 i += 2;
157 else
158 i += 1;
159 ch = styler.SafeGetCharAt(i);
160 chNext = styler.SafeGetCharAt(i + 1);
161 }
162
163 if (ch != '"' && ch != '\'') {
164 *nextIndex = i + 1;
165 return SCE_P_DEFAULT;
166 }
167
168 if (ch == chNext && ch == styler.SafeGetCharAt(i + 2)) {
169 *nextIndex = i + 3;
170
171 if (ch == '"')
172 return (firstIsF ? SCE_P_FTRIPLEDOUBLE : SCE_P_TRIPLEDOUBLE);
173 else
174 return (firstIsF ? SCE_P_FTRIPLE : SCE_P_TRIPLE);
175 } else {
176 *nextIndex = i + 1;
177
178 if (ch == '"')
179 return (firstIsF ? SCE_P_FSTRING : SCE_P_STRING);
180 else
181 return (firstIsF ? SCE_P_FCHARACTER : SCE_P_CHARACTER);
182 }
183 }
184
IsAWordChar(int ch,bool unicodeIdentifiers)185 inline bool IsAWordChar(int ch, bool unicodeIdentifiers) {
186 if (IsASCII(ch))
187 return (IsAlphaNumeric(ch) || ch == '.' || ch == '_');
188
189 if (!unicodeIdentifiers)
190 return false;
191
192 // Python uses the XID_Continue set from Unicode data
193 return IsXidContinue(ch);
194 }
195
IsAWordStart(int ch,bool unicodeIdentifiers)196 inline bool IsAWordStart(int ch, bool unicodeIdentifiers) {
197 if (IsASCII(ch))
198 return (IsUpperOrLowerCase(ch) || ch == '_');
199
200 if (!unicodeIdentifiers)
201 return false;
202
203 // Python uses the XID_Start set from Unicode data
204 return IsXidStart(ch);
205 }
206
IsFirstNonWhitespace(Sci_Position pos,Accessor & styler)207 bool IsFirstNonWhitespace(Sci_Position pos, Accessor &styler) {
208 const Sci_Position line = styler.GetLine(pos);
209 const Sci_Position start_pos = styler.LineStart(line);
210 for (Sci_Position i = start_pos; i < pos; i++) {
211 const char ch = styler[i];
212 if (!(ch == ' ' || ch == '\t'))
213 return false;
214 }
215 return true;
216 }
217
218 // Options used for LexerPython
219 struct OptionsPython {
220 int whingeLevel;
221 bool base2or8Literals;
222 bool stringsU;
223 bool stringsB;
224 bool stringsF;
225 bool stringsOverNewline;
226 bool keywords2NoSubIdentifiers;
227 bool fold;
228 bool foldQuotes;
229 bool foldCompact;
230 bool unicodeIdentifiers;
231
OptionsPython__anon2838a20b0111::OptionsPython232 OptionsPython() {
233 whingeLevel = 0;
234 base2or8Literals = true;
235 stringsU = true;
236 stringsB = true;
237 stringsF = true;
238 stringsOverNewline = false;
239 keywords2NoSubIdentifiers = false;
240 fold = false;
241 foldQuotes = false;
242 foldCompact = false;
243 unicodeIdentifiers = true;
244 }
245
AllowedLiterals__anon2838a20b0111::OptionsPython246 literalsAllowed AllowedLiterals() const noexcept {
247 literalsAllowed allowedLiterals = stringsU ? litU : litNone;
248 if (stringsB)
249 allowedLiterals = static_cast<literalsAllowed>(allowedLiterals | litB);
250 if (stringsF)
251 allowedLiterals = static_cast<literalsAllowed>(allowedLiterals | litF);
252 return allowedLiterals;
253 }
254 };
255
// Descriptions of the word lists, in index order (0 and 1 as handled by
// WordListSet); the nullptr entry terminates the array.
const char *const pythonWordListDesc[] = {
	"Keywords",
	"Highlighted identifiers",
	nullptr
};
261
// Binds each property name to the OptionsPython member it sets, with the
// description text for that property, and registers the word list
// descriptions.
struct OptionSetPython : public OptionSet<OptionsPython> {
	OptionSetPython() {
		DefineProperty("tab.timmy.whinge.level", &OptionsPython::whingeLevel,
			"For Python code, checks whether indenting is consistent. "
			"The default, 0 turns off indentation checking, "
			"1 checks whether each line is potentially inconsistent with the previous line, "
			"2 checks whether any space characters occur before a tab character in the indentation, "
			"3 checks whether any spaces are in the indentation, and "
			"4 checks for any tab characters in the indentation. "
			"1 is a good level to use.");

		DefineProperty("lexer.python.literals.binary", &OptionsPython::base2or8Literals,
			"Set to 0 to not recognise Python 3 binary and octal literals: 0b1011 0o712.");

		DefineProperty("lexer.python.strings.u", &OptionsPython::stringsU,
			"Set to 0 to not recognise Python Unicode literals u\"x\" as used before Python 3.");

		DefineProperty("lexer.python.strings.b", &OptionsPython::stringsB,
			"Set to 0 to not recognise Python 3 bytes literals b\"x\".");

		DefineProperty("lexer.python.strings.f", &OptionsPython::stringsF,
			"Set to 0 to not recognise Python 3.6 f-string literals f\"var={var}\".");

		DefineProperty("lexer.python.strings.over.newline", &OptionsPython::stringsOverNewline,
			"Set to 1 to allow strings to span newline characters.");

		DefineProperty("lexer.python.keywords2.no.sub.identifiers", &OptionsPython::keywords2NoSubIdentifiers,
			"When enabled, it will not style keywords2 items that are used as a sub-identifier. "
			"Example: when set, will not highlight \"foo.open\" when \"open\" is a keywords2 item.");

		// The generic fold properties are registered without a description
		DefineProperty("fold", &OptionsPython::fold);

		DefineProperty("fold.quotes.python", &OptionsPython::foldQuotes,
			"This option enables folding multi-line quoted strings when using the Python lexer.");

		DefineProperty("fold.compact", &OptionsPython::foldCompact);

		DefineProperty("lexer.python.unicode.identifiers", &OptionsPython::unicodeIdentifiers,
			"Set to 0 to not recognise Python 3 Unicode identifiers.");

		DefineWordListSets(pythonWordListDesc);
	}
};
305
// Zero-terminated list of the base styles that may have sub-styles; only
// identifiers here.
const char styleSubable[] = { SCE_P_IDENTIFIER, 0 };

// One row per style: style number, symbolic name, space separated tags and a
// human readable description.
LexicalClass lexicalClasses[] = {
	// Lexer Python SCLEX_PYTHON SCE_P_:
	0, "SCE_P_DEFAULT", "default", "White space",
	1, "SCE_P_COMMENTLINE", "comment line", "Comment",
	2, "SCE_P_NUMBER", "literal numeric", "Number",
	3, "SCE_P_STRING", "literal string", "String",
	4, "SCE_P_CHARACTER", "literal string", "Single quoted string",
	5, "SCE_P_WORD", "keyword", "Keyword",
	6, "SCE_P_TRIPLE", "literal string", "Triple quotes",
	7, "SCE_P_TRIPLEDOUBLE", "literal string", "Triple double quotes",
	8, "SCE_P_CLASSNAME", "identifier", "Class name definition",
	9, "SCE_P_DEFNAME", "identifier", "Function or method name definition",
	10, "SCE_P_OPERATOR", "operator", "Operators",
	11, "SCE_P_IDENTIFIER", "identifier", "Identifiers",
	12, "SCE_P_COMMENTBLOCK", "comment", "Comment-blocks",
	13, "SCE_P_STRINGEOL", "error literal string", "End of line where string is not closed",
	14, "SCE_P_WORD2", "identifier", "Highlighted identifiers",
	15, "SCE_P_DECORATOR", "preprocessor", "Decorators",
	16, "SCE_P_FSTRING", "literal string interpolated", "F-String",
	17, "SCE_P_FCHARACTER", "literal string interpolated", "Single quoted f-string",
	18, "SCE_P_FTRIPLE", "literal string interpolated", "Triple quoted f-string",
	19, "SCE_P_FTRIPLEDOUBLE", "literal string interpolated", "Triple double quoted f-string",
};
331
332 }
333
334 class LexerPython : public DefaultLexer {
335 WordList keywords;
336 WordList keywords2;
337 OptionsPython options;
338 OptionSetPython osPython;
339 enum { ssIdentifier };
340 SubStyles subStyles;
341 std::map<Sci_Position, std::vector<SingleFStringExpState> > ftripleStateAtEol;
342 public:
LexerPython()343 explicit LexerPython() :
344 DefaultLexer("python", SCLEX_PYTHON, lexicalClasses, ELEMENTS(lexicalClasses)),
345 subStyles(styleSubable, 0x80, 0x40, 0) {
346 }
~LexerPython()347 ~LexerPython() override {
348 }
Release()349 void SCI_METHOD Release() override {
350 delete this;
351 }
Version() const352 int SCI_METHOD Version() const override {
353 return lvRelease5;
354 }
PropertyNames()355 const char *SCI_METHOD PropertyNames() override {
356 return osPython.PropertyNames();
357 }
PropertyType(const char * name)358 int SCI_METHOD PropertyType(const char *name) override {
359 return osPython.PropertyType(name);
360 }
DescribeProperty(const char * name)361 const char *SCI_METHOD DescribeProperty(const char *name) override {
362 return osPython.DescribeProperty(name);
363 }
364 Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override;
PropertyGet(const char * key)365 const char * SCI_METHOD PropertyGet(const char *key) override {
366 return osPython.PropertyGet(key);
367 }
DescribeWordListSets()368 const char *SCI_METHOD DescribeWordListSets() override {
369 return osPython.DescribeWordListSets();
370 }
371 Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override;
372 void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
373 void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
374
PrivateCall(int,void *)375 void *SCI_METHOD PrivateCall(int, void *) override {
376 return nullptr;
377 }
378
LineEndTypesSupported()379 int SCI_METHOD LineEndTypesSupported() override {
380 return SC_LINE_END_TYPE_UNICODE;
381 }
382
AllocateSubStyles(int styleBase,int numberStyles)383 int SCI_METHOD AllocateSubStyles(int styleBase, int numberStyles) override {
384 return subStyles.Allocate(styleBase, numberStyles);
385 }
SubStylesStart(int styleBase)386 int SCI_METHOD SubStylesStart(int styleBase) override {
387 return subStyles.Start(styleBase);
388 }
SubStylesLength(int styleBase)389 int SCI_METHOD SubStylesLength(int styleBase) override {
390 return subStyles.Length(styleBase);
391 }
StyleFromSubStyle(int subStyle)392 int SCI_METHOD StyleFromSubStyle(int subStyle) override {
393 const int styleBase = subStyles.BaseStyle(subStyle);
394 return styleBase;
395 }
PrimaryStyleFromStyle(int style)396 int SCI_METHOD PrimaryStyleFromStyle(int style) override {
397 return style;
398 }
FreeSubStyles()399 void SCI_METHOD FreeSubStyles() override {
400 subStyles.Free();
401 }
SetIdentifiers(int style,const char * identifiers)402 void SCI_METHOD SetIdentifiers(int style, const char *identifiers) override {
403 subStyles.SetIdentifiers(style, identifiers);
404 }
DistanceToSecondaryStyles()405 int SCI_METHOD DistanceToSecondaryStyles() override {
406 return 0;
407 }
GetSubStyleBases()408 const char *SCI_METHOD GetSubStyleBases() override {
409 return styleSubable;
410 }
411
LexerFactoryPython()412 static ILexer5 *LexerFactoryPython() {
413 return new LexerPython();
414 }
415
416 private:
417 void ProcessLineEnd(StyleContext &sc, std::vector<SingleFStringExpState> &fstringStateStack, SingleFStringExpState *¤tFStringExp, bool &inContinuedString);
418 };
419
PropertySet(const char * key,const char * val)420 Sci_Position SCI_METHOD LexerPython::PropertySet(const char *key, const char *val) {
421 if (osPython.PropertySet(&options, key, val)) {
422 return 0;
423 }
424 return -1;
425 }
426
WordListSet(int n,const char * wl)427 Sci_Position SCI_METHOD LexerPython::WordListSet(int n, const char *wl) {
428 WordList *wordListN = nullptr;
429 switch (n) {
430 case 0:
431 wordListN = &keywords;
432 break;
433 case 1:
434 wordListN = &keywords2;
435 break;
436 }
437 Sci_Position firstModification = -1;
438 if (wordListN) {
439 WordList wlNew;
440 wlNew.Set(wl);
441 if (*wordListN != wlNew) {
442 wordListN->Set(wl);
443 firstModification = 0;
444 }
445 }
446 return firstModification;
447 }
448
ProcessLineEnd(StyleContext & sc,std::vector<SingleFStringExpState> & fstringStateStack,SingleFStringExpState * & currentFStringExp,bool & inContinuedString)449 void LexerPython::ProcessLineEnd(StyleContext &sc, std::vector<SingleFStringExpState> &fstringStateStack, SingleFStringExpState *¤tFStringExp, bool &inContinuedString) {
450 long deepestSingleStateIndex = -1;
451 unsigned long i;
452
453 // Find the deepest single quote state because that string will end; no \ continuation in f-string
454 for (i = 0; i < fstringStateStack.size(); i++) {
455 if (IsPySingleQuoteStringState(fstringStateStack[i].state)) {
456 deepestSingleStateIndex = i;
457 break;
458 }
459 }
460
461 if (deepestSingleStateIndex != -1) {
462 sc.SetState(fstringStateStack[deepestSingleStateIndex].state);
463 while (fstringStateStack.size() > static_cast<unsigned long>(deepestSingleStateIndex)) {
464 PopFromStateStack(fstringStateStack, currentFStringExp);
465 }
466 }
467 if (!fstringStateStack.empty()) {
468 std::pair<Sci_Position, std::vector<SingleFStringExpState> > val;
469 val.first = sc.currentLine;
470 val.second = fstringStateStack;
471
472 ftripleStateAtEol.insert(val);
473 }
474
475 if ((sc.state == SCE_P_DEFAULT)
476 || IsPyTripleQuoteStringState(sc.state)) {
477 // Perform colourisation of white space and triple quoted strings at end of each line to allow
478 // tab marking to work inside white space and triple quoted strings
479 sc.SetState(sc.state);
480 }
481 if (IsPySingleQuoteStringState(sc.state)) {
482 if (inContinuedString || options.stringsOverNewline) {
483 inContinuedString = false;
484 } else {
485 sc.ChangeState(SCE_P_STRINGEOL);
486 sc.ForwardSetState(SCE_P_DEFAULT);
487 }
488 }
489 }
490
// Styles the document range [startPos, startPos + length).
//  - startPos / length: range requested by the caller; the start is moved
//    back by at least one line (further over lines ending in a string style).
//  - initStyle: style in effect before startPos; recomputed after backtracking.
//  - pAccess: document being styled.
// Each loop iteration first checks whether the current state ends at the
// current character, then handles f-string expression bookkeeping, and
// finally checks whether a new state starts.
void SCI_METHOD LexerPython::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) {
	Accessor styler(pAccess, nullptr);

	// Track whether in f-string expression; vector is used for a stack to
	// handle nested f-strings such as f"""{f'''{f"{f'{1}'}"}'''}"""
	std::vector<SingleFStringExpState> fstringStateStack;
	SingleFStringExpState *currentFStringExp = nullptr;

	const Sci_Position endPos = startPos + length;

	// Backtrack to previous line in case need to fix its tab whinging
	Sci_Position lineCurrent = styler.GetLine(startPos);
	if (startPos > 0) {
		if (lineCurrent > 0) {
			lineCurrent--;
			// Look for backslash-continued lines
			while (lineCurrent > 0) {
				// Style of the last character of the preceding line
				const Sci_Position eolPos = styler.LineStart(lineCurrent) - 1;
				const int eolStyle = styler.StyleAt(eolPos);
				if (eolStyle == SCE_P_STRING
					|| eolStyle == SCE_P_CHARACTER
					|| eolStyle == SCE_P_STRINGEOL) {
					lineCurrent -= 1;
				} else {
					break;
				}
			}
			startPos = styler.LineStart(lineCurrent);
		}
		initStyle = startPos == 0 ? SCE_P_DEFAULT : styler.StyleAt(startPos - 1);
	}

	const literalsAllowed allowedLiterals = options.AllowedLiterals();

	// Keep only the low five bits of the style value
	initStyle = initStyle & 31;
	if (initStyle == SCE_P_STRINGEOL) {
		initStyle = SCE_P_DEFAULT;
	}

	// Set up fstate stack from last line and remove any subsequent ftriple at eol states
	std::map<Sci_Position, std::vector<SingleFStringExpState> >::iterator it;
	it = ftripleStateAtEol.find(lineCurrent - 1);
	if (it != ftripleStateAtEol.end() && !it->second.empty()) {
		fstringStateStack = it->second;
		currentFStringExp = &fstringStateStack.back();
	}
	it = ftripleStateAtEol.lower_bound(lineCurrent);
	if (it != ftripleStateAtEol.end()) {
		ftripleStateAtEol.erase(it, ftripleStateAtEol.end());
	}

	kwType kwLast = kwOther;
	int spaceFlags = 0;
	styler.IndentAmount(lineCurrent, &spaceFlags, IsPyComment);
	// True while lexing a number that began with a 0x, 0o or 0b prefix
	bool base_n_number = false;

	const WordClassifier &classifierIdentifiers = subStyles.Classifier(SCE_P_IDENTIFIER);

	StyleContext sc(startPos, endPos - startPos, initStyle, styler);

	bool indentGood = true;
	Sci_Position startIndicator = sc.currentPos;
	bool inContinuedString = false;

	for (; sc.More(); sc.Forward()) {

		if (sc.atLineStart) {
			// Evaluate this line's indentation against the configured whinge level
			styler.IndentAmount(lineCurrent, &spaceFlags, IsPyComment);
			indentGood = true;
			if (options.whingeLevel == 1) {
				indentGood = (spaceFlags & wsInconsistent) == 0;
			} else if (options.whingeLevel == 2) {
				indentGood = (spaceFlags & wsSpaceTab) == 0;
			} else if (options.whingeLevel == 3) {
				indentGood = (spaceFlags & wsSpace) == 0;
			} else if (options.whingeLevel == 4) {
				indentGood = (spaceFlags & wsTab) == 0;
			}
			if (!indentGood) {
				styler.IndicatorFill(startIndicator, sc.currentPos, indicatorWhitespace, 0);
				startIndicator = sc.currentPos;
			}
		}

		if (sc.atLineEnd) {
			ProcessLineEnd(sc, fstringStateStack, currentFStringExp, inContinuedString);
			lineCurrent++;
			if (!sc.More())
				break;
		}

		// Set when a state exit below moves forward and may have reached the line end
		bool needEOLCheck = false;


		if (sc.state == SCE_P_OPERATOR) {
			kwLast = kwOther;
			sc.SetState(SCE_P_DEFAULT);
		} else if (sc.state == SCE_P_NUMBER) {
			// A number ends at the first non-word character, except a sign
			// directly after e/E in a number without a base prefix
			if (!IsAWordChar(sc.ch, false) &&
				!(!base_n_number && ((sc.ch == '+' || sc.ch == '-') && (sc.chPrev == 'e' || sc.chPrev == 'E')))) {
				sc.SetState(SCE_P_DEFAULT);
			}
		} else if (sc.state == SCE_P_IDENTIFIER) {
			if ((sc.ch == '.') || (!IsAWordChar(sc.ch, options.unicodeIdentifiers))) {
				// The identifier is complete: classify it
				char s[100];
				sc.GetCurrent(s, sizeof(s));
				int style = SCE_P_IDENTIFIER;
				if ((kwLast == kwImport) && (strcmp(s, "as") == 0)) {
					style = SCE_P_WORD;
				} else if (keywords.InList(s)) {
					style = SCE_P_WORD;
				} else if (kwLast == kwClass) {
					style = SCE_P_CLASSNAME;
				} else if (kwLast == kwDef) {
					style = SCE_P_DEFNAME;
				} else if (kwLast == kwCDef || kwLast == kwCPDef) {
					// Skip white space after the name: '(' means a function
					// name, ':' a class name, anything else leaves it an identifier
					Sci_Position pos = sc.currentPos;
					unsigned char ch = styler.SafeGetCharAt(pos, '\0');
					while (ch != '\0') {
						if (ch == '(') {
							style = SCE_P_DEFNAME;
							break;
						} else if (ch == ':') {
							style = SCE_P_CLASSNAME;
							break;
						} else if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') {
							pos++;
							ch = styler.SafeGetCharAt(pos, '\0');
						} else {
							break;
						}
					}
				} else if (keywords2.InList(s)) {
					if (options.keywords2NoSubIdentifiers) {
						// We don't want to highlight keywords2
						// that are used as a sub-identifier,
						// i.e. not open in "foo.open".
						const Sci_Position pos = styler.GetStartSegment() - 1;
						if (pos < 0 || (styler.SafeGetCharAt(pos, '\0') != '.'))
							style = SCE_P_WORD2;
					} else {
						style = SCE_P_WORD2;
					}
				} else {
					int subStyle = classifierIdentifiers.ValueFor(s);
					if (subStyle >= 0) {
						style = subStyle;
					}
				}
				sc.ChangeState(style);
				sc.SetState(SCE_P_DEFAULT);
				// Remember keywords that affect the styling of the next identifier
				if (style == SCE_P_WORD) {
					if (0 == strcmp(s, "class"))
						kwLast = kwClass;
					else if (0 == strcmp(s, "def"))
						kwLast = kwDef;
					else if (0 == strcmp(s, "import"))
						kwLast = kwImport;
					else if (0 == strcmp(s, "cdef"))
						kwLast = kwCDef;
					else if (0 == strcmp(s, "cpdef"))
						kwLast = kwCPDef;
					else if (0 == strcmp(s, "cimport"))
						kwLast = kwImport;
					else if (kwLast != kwCDef && kwLast != kwCPDef)
						kwLast = kwOther;
				} else if (kwLast != kwCDef && kwLast != kwCPDef) {
					kwLast = kwOther;
				}
			}
		} else if ((sc.state == SCE_P_COMMENTLINE) || (sc.state == SCE_P_COMMENTBLOCK)) {
			if (sc.ch == '\r' || sc.ch == '\n') {
				sc.SetState(SCE_P_DEFAULT);
			}
		} else if (sc.state == SCE_P_DECORATOR) {
			if (!IsAWordStart(sc.ch, options.unicodeIdentifiers)) {
				sc.SetState(SCE_P_DEFAULT);
			}
		} else if (IsPySingleQuoteStringState(sc.state)) {
			if (sc.ch == '\\') {
				if ((sc.chNext == '\r') && (sc.GetRelative(2) == '\n')) {
					sc.Forward();
				}
				if (sc.chNext == '\n' || sc.chNext == '\r') {
					inContinuedString = true;
				} else {
					// Don't roll over the newline.
					sc.Forward();
				}
			} else if (sc.ch == GetPyStringQuoteChar(sc.state)) {
				sc.ForwardSetState(SCE_P_DEFAULT);
				needEOLCheck = true;
			}
		} else if ((sc.state == SCE_P_TRIPLE) || (sc.state == SCE_P_FTRIPLE)) {
			if (sc.ch == '\\') {
				// Skip the escaped character
				sc.Forward();
			} else if (sc.Match(R"(''')")) {
				sc.Forward();
				sc.Forward();
				sc.ForwardSetState(SCE_P_DEFAULT);
				needEOLCheck = true;
			}
		} else if ((sc.state == SCE_P_TRIPLEDOUBLE) || (sc.state == SCE_P_FTRIPLEDOUBLE)) {
			if (sc.ch == '\\') {
				// Skip the escaped character
				sc.Forward();
			} else if (sc.Match(R"(""")")) {
				sc.Forward();
				sc.Forward();
				sc.ForwardSetState(SCE_P_DEFAULT);
				needEOLCheck = true;
			}
		}

		// Note if used and not if else because string states also match
		// some of the above clauses
		if (IsPyFStringState(sc.state) && sc.ch == '{') {
			if (sc.chNext == '{') {
				// Doubled brace: stay in the string and skip the first brace
				sc.Forward();
			} else {
				// Start of an expression: remember the string state and lex the expression as code
				PushStateToStack(sc.state, fstringStateStack, currentFStringExp);
				sc.ForwardSetState(SCE_P_DEFAULT);
			}
			needEOLCheck = true;
		}

		// If in an f-string expression, check for the ending quote(s)
		// and end f-string to handle syntactically incorrect cases like
		// f'{' and f"""{"""
		if (!fstringStateStack.empty() && (sc.ch == '\'' || sc.ch == '"')) {
			long matching_stack_i = -1;
			// Search from the outermost entry for a string that this quote would close
			for (unsigned long stack_i = 0; stack_i < fstringStateStack.size() && matching_stack_i == -1; stack_i++) {
				const int stack_state = fstringStateStack[stack_i].state;
				const char quote = GetPyStringQuoteChar(stack_state);
				if (sc.ch == quote) {
					if (IsPySingleQuoteStringState(stack_state)) {
						matching_stack_i = stack_i;
					} else if (quote == '"' ? sc.Match(R"(""")") : sc.Match("'''")) {
						matching_stack_i = stack_i;
					}
				}
			}

			if (matching_stack_i != -1) {
				sc.SetState(fstringStateStack[matching_stack_i].state);
				if (IsPyTripleQuoteStringState(fstringStateStack[matching_stack_i].state)) {
					sc.Forward();
					sc.Forward();
				}
				sc.ForwardSetState(SCE_P_DEFAULT);
				needEOLCheck = true;

				// Drop the closed string and everything nested inside it
				while (fstringStateStack.size() > static_cast<unsigned long>(matching_stack_i)) {
					PopFromStateStack(fstringStateStack, currentFStringExp);
				}
			}
		}
		// End of code to find the end of a state

		if (!indentGood && !IsASpaceOrTab(sc.ch)) {
			styler.IndicatorFill(startIndicator, sc.currentPos, indicatorWhitespace, 1);
			startIndicator = sc.currentPos;
			indentGood = true;
		}

		// One cdef or cpdef line, clear kwLast only at end of line
		if ((kwLast == kwCDef || kwLast == kwCPDef) && sc.atLineEnd) {
			kwLast = kwOther;
		}

		// State exit code may have moved on to end of line
		if (needEOLCheck && sc.atLineEnd) {
			ProcessLineEnd(sc, fstringStateStack, currentFStringExp, inContinuedString);
			lineCurrent++;
			styler.IndentAmount(lineCurrent, &spaceFlags, IsPyComment);
			if (!sc.More())
				break;
		}

		// If in f-string expression, check for }, :, ! to resume f-string state or update nesting count
		if (currentFStringExp && !IsPySingleQuoteStringState(sc.state) && !IsPyTripleQuoteStringState(sc.state)) {
			if (currentFStringExp->nestingCount == 0 && (sc.ch == '}' || sc.ch == ':' || (sc.ch == '!' && sc.chNext != '='))) {
				sc.SetState(PopFromStateStack(fstringStateStack, currentFStringExp));
			} else {
				if (sc.ch == '{' || sc.ch == '[' || sc.ch == '(') {
					currentFStringExp->nestingCount++;
				} else if (sc.ch == '}' || sc.ch == ']' || sc.ch == ')') {
					currentFStringExp->nestingCount--;
				}
			}
		}

		// Check for a new state starting character
		if (sc.state == SCE_P_DEFAULT) {
			if (IsADigit(sc.ch) || (sc.ch == '.' && IsADigit(sc.chNext))) {
				if (sc.ch == '0' && (sc.chNext == 'x' || sc.chNext == 'X')) {
					base_n_number = true;
					sc.SetState(SCE_P_NUMBER);
				} else if (sc.ch == '0' &&
					(sc.chNext == 'o' || sc.chNext == 'O' || sc.chNext == 'b' || sc.chNext == 'B')) {
					if (options.base2or8Literals) {
						base_n_number = true;
						sc.SetState(SCE_P_NUMBER);
					} else {
						// Literals disabled: the 0 is a number and the rest is an identifier
						sc.SetState(SCE_P_NUMBER);
						sc.ForwardSetState(SCE_P_IDENTIFIER);
					}
				} else {
					base_n_number = false;
					sc.SetState(SCE_P_NUMBER);
				}
			} else if (isoperator(sc.ch) || sc.ch == '`') {
				sc.SetState(SCE_P_OPERATOR);
			} else if (sc.ch == '#') {
				sc.SetState(sc.chNext == '#' ? SCE_P_COMMENTBLOCK : SCE_P_COMMENTLINE);
			} else if (sc.ch == '@') {
				// '@' is a decorator only when first on its line, otherwise an operator
				if (IsFirstNonWhitespace(sc.currentPos, styler))
					sc.SetState(SCE_P_DECORATOR);
				else
					sc.SetState(SCE_P_OPERATOR);
			} else if (IsPyStringStart(sc.ch, sc.chNext, sc.GetRelative(2), allowedLiterals)) {
				Sci_PositionU nextIndex = 0;
				sc.SetState(GetPyStringState(styler, sc.currentPos, &nextIndex, allowedLiterals));
				// Step over the prefix and opening quote(s), stopping on the last of them
				while (nextIndex > (sc.currentPos + 1) && sc.More()) {
					sc.Forward();
				}
			} else if (IsAWordStart(sc.ch, options.unicodeIdentifiers)) {
				sc.SetState(SCE_P_IDENTIFIER);
			}
		}
	}
	styler.IndicatorFill(startIndicator, sc.currentPos, indicatorWhitespace, 0);
	sc.Complete();
}
824
IsCommentLine(Sci_Position line,Accessor & styler)825 static bool IsCommentLine(Sci_Position line, Accessor &styler) {
826 const Sci_Position pos = styler.LineStart(line);
827 const Sci_Position eol_pos = styler.LineStart(line + 1) - 1;
828 for (Sci_Position i = pos; i < eol_pos; i++) {
829 const char ch = styler[i];
830 if (ch == '#')
831 return true;
832 else if (ch != ' ' && ch != '\t')
833 return false;
834 }
835 return false;
836 }
837
IsQuoteLine(Sci_Position line,const Accessor & styler)838 static bool IsQuoteLine(Sci_Position line, const Accessor &styler) {
839 const int style = styler.StyleAt(styler.LineStart(line)) & 31;
840 return IsPyTripleQuoteStringState(style);
841 }
842
843
Fold(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)844 void SCI_METHOD LexerPython::Fold(Sci_PositionU startPos, Sci_Position length, int /*initStyle - unused*/, IDocument *pAccess) {
845 if (!options.fold)
846 return;
847
848 Accessor styler(pAccess, nullptr);
849
850 const Sci_Position maxPos = startPos + length;
851 const Sci_Position maxLines = (maxPos == styler.Length()) ? styler.GetLine(maxPos) : styler.GetLine(maxPos - 1); // Requested last line
852 const Sci_Position docLines = styler.GetLine(styler.Length()); // Available last line
853
854 // Backtrack to previous non-blank line so we can determine indent level
855 // for any white space lines (needed esp. within triple quoted strings)
856 // and so we can fix any preceding fold level (which is why we go back
857 // at least one line in all cases)
858 int spaceFlags = 0;
859 Sci_Position lineCurrent = styler.GetLine(startPos);
860 int indentCurrent = styler.IndentAmount(lineCurrent, &spaceFlags, nullptr);
861 while (lineCurrent > 0) {
862 lineCurrent--;
863 indentCurrent = styler.IndentAmount(lineCurrent, &spaceFlags, nullptr);
864 if (!(indentCurrent & SC_FOLDLEVELWHITEFLAG) &&
865 (!IsCommentLine(lineCurrent, styler)) &&
866 (!IsQuoteLine(lineCurrent, styler)))
867 break;
868 }
869 int indentCurrentLevel = indentCurrent & SC_FOLDLEVELNUMBERMASK;
870
871 // Set up initial loop state
872 startPos = styler.LineStart(lineCurrent);
873 int prev_state = SCE_P_DEFAULT & 31;
874 if (lineCurrent >= 1)
875 prev_state = styler.StyleAt(startPos - 1) & 31;
876 int prevQuote = options.foldQuotes && IsPyTripleQuoteStringState(prev_state);
877
878 // Process all characters to end of requested range or end of any triple quote
879 //that hangs over the end of the range. Cap processing in all cases
880 // to end of document (in case of unclosed quote at end).
881 while ((lineCurrent <= docLines) && ((lineCurrent <= maxLines) || prevQuote)) {
882
883 // Gather info
884 int lev = indentCurrent;
885 Sci_Position lineNext = lineCurrent + 1;
886 int indentNext = indentCurrent;
887 int quote = false;
888 if (lineNext <= docLines) {
889 // Information about next line is only available if not at end of document
890 indentNext = styler.IndentAmount(lineNext, &spaceFlags, nullptr);
891 const Sci_Position lookAtPos = (styler.LineStart(lineNext) == styler.Length()) ? styler.Length() - 1 : styler.LineStart(lineNext);
892 const int style = styler.StyleAt(lookAtPos) & 31;
893 quote = options.foldQuotes && IsPyTripleQuoteStringState(style);
894 }
895 const int quote_start = (quote && !prevQuote);
896 const int quote_continue = (quote && prevQuote);
897 if (!quote || !prevQuote)
898 indentCurrentLevel = indentCurrent & SC_FOLDLEVELNUMBERMASK;
899 if (quote)
900 indentNext = indentCurrentLevel;
901 if (indentNext & SC_FOLDLEVELWHITEFLAG)
902 indentNext = SC_FOLDLEVELWHITEFLAG | indentCurrentLevel;
903
904 if (quote_start) {
905 // Place fold point at start of triple quoted string
906 lev |= SC_FOLDLEVELHEADERFLAG;
907 } else if (quote_continue || prevQuote) {
908 // Add level to rest of lines in the string
909 lev = lev + 1;
910 }
911
912 // Skip past any blank lines for next indent level info; we skip also
913 // comments (all comments, not just those starting in column 0)
914 // which effectively folds them into surrounding code rather
915 // than screwing up folding. If comments end file, use the min
916 // comment indent as the level after
917
918 int minCommentLevel = indentCurrentLevel;
919 while (!quote &&
920 (lineNext < docLines) &&
921 ((indentNext & SC_FOLDLEVELWHITEFLAG) ||
922 (lineNext <= docLines && IsCommentLine(lineNext, styler)))) {
923
924 if (IsCommentLine(lineNext, styler) && indentNext < minCommentLevel) {
925 minCommentLevel = indentNext;
926 }
927
928 lineNext++;
929 indentNext = styler.IndentAmount(lineNext, &spaceFlags, nullptr);
930 }
931
932 const int levelAfterComments = ((lineNext < docLines) ? indentNext & SC_FOLDLEVELNUMBERMASK : minCommentLevel);
933 const int levelBeforeComments = std::max(indentCurrentLevel, levelAfterComments);
934
935 // Now set all the indent levels on the lines we skipped
936 // Do this from end to start. Once we encounter one line
937 // which is indented more than the line after the end of
938 // the comment-block, use the level of the block before
939
940 Sci_Position skipLine = lineNext;
941 int skipLevel = levelAfterComments;
942
943 while (--skipLine > lineCurrent) {
944 const int skipLineIndent = styler.IndentAmount(skipLine, &spaceFlags, nullptr);
945
946 if (options.foldCompact) {
947 if ((skipLineIndent & SC_FOLDLEVELNUMBERMASK) > levelAfterComments)
948 skipLevel = levelBeforeComments;
949
950 const int whiteFlag = skipLineIndent & SC_FOLDLEVELWHITEFLAG;
951
952 styler.SetLevel(skipLine, skipLevel | whiteFlag);
953 } else {
954 if ((skipLineIndent & SC_FOLDLEVELNUMBERMASK) > levelAfterComments &&
955 !(skipLineIndent & SC_FOLDLEVELWHITEFLAG) &&
956 !IsCommentLine(skipLine, styler))
957 skipLevel = levelBeforeComments;
958
959 styler.SetLevel(skipLine, skipLevel);
960 }
961 }
962
963 // Set fold header on non-quote line
964 if (!quote && !(indentCurrent & SC_FOLDLEVELWHITEFLAG)) {
965 if ((indentCurrent & SC_FOLDLEVELNUMBERMASK) < (indentNext & SC_FOLDLEVELNUMBERMASK))
966 lev |= SC_FOLDLEVELHEADERFLAG;
967 }
968
969 // Keep track of triple quote state of previous line
970 prevQuote = quote;
971
972 // Set fold level for this line and move to next line
973 styler.SetLevel(lineCurrent, options.foldCompact ? lev : lev & ~SC_FOLDLEVELWHITEFLAG);
974 indentCurrent = indentNext;
975 lineCurrent = lineNext;
976 }
977
978 // NOTE: Cannot set level of last line here because indentCurrent doesn't have
979 // header flag set; the loop above is crafted to take care of this case!
980 //styler.SetLevel(lineCurrent, indentCurrent);
981 }
982
983 LexerModule lmPython(SCLEX_PYTHON, LexerPython::LexerFactoryPython, "python",
984 pythonWordListDesc);
985