1 // Scintilla source code edit control
2 /** @file LexPython.cxx
3  ** Lexer for Python.
4  **/
5 // Copyright 1998-2002 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 
8 #include <stdlib.h>
9 #include <string.h>
10 #include <stdio.h>
11 #include <stdarg.h>
12 #include <assert.h>
13 #include <ctype.h>
14 
15 #include <string>
16 #include <vector>
17 #include <map>
18 #include <algorithm>
19 
20 #include "ILexer.h"
21 #include "Scintilla.h"
22 #include "SciLexer.h"
23 
24 #include "StringCopy.h"
25 #include "WordList.h"
26 #include "LexAccessor.h"
27 #include "Accessor.h"
28 #include "StyleContext.h"
29 #include "CharacterSet.h"
30 #include "CharacterCategory.h"
31 #include "LexerModule.h"
32 #include "OptionSet.h"
33 #include "SubStyles.h"
34 #include "DefaultLexer.h"
35 
36 using namespace Scintilla;
37 
38 namespace {
39 // Use an unnamed namespace to protect the functions and classes from name conflicts
40 
41 /* Notes on f-strings: f-strings are strings prefixed with f (e.g. f'') that may
42    have arbitrary expressions in {}.  The tokens in the expressions are lexed as if
43    they were outside of any string.  Expressions may contain { and } characters as
44    long as there is a closing } for every {, may be 2+ lines in a triple quoted
45    string, and may have a formatting specifier following a ! or :, but both !
46    and : are valid inside of a bracketed expression and != is a valid
47    expression token even outside of a bracketed expression.
48 
49    When in an f-string expression, the lexer keeps track of the state value of
50    the f-string and the nesting count for the expression (# of [, (, { seen - # of
51    }, ), ] seen).  f-strings may be nested (e.g. f'{ a + f"{1+2}"') so a stack of
52    states and nesting counts is kept.  If a f-string expression continues beyond
53    the end of a line, this stack is saved in a std::map that maps a line number to
54    the stack at the end of that line.  std::vector is used for the stack.
55 
56    The PEP for f-strings is at https://www.python.org/dev/peps/pep-0498/
57 */
/** One entry of the f-string expression stack: the f-string style to resume
 ** when the {} expression ends, and the bracket depth inside that expression. */
struct SingleFStringExpState {
	int state;          // style value of the f-string that contains the expression
	int nestingCount;   // count of (, [, { seen minus count of ), ], } seen
};
62 
/* Most recently seen keyword, used to classify the identifier that follows.
   kwCDef, kwCTypeName only used for Cython; kwCPDef is set for "cpdef". */
enum kwType {
	kwOther,
	kwClass,
	kwDef,
	kwImport,
	kwCDef,
	kwCTypeName,
	kwCPDef
};

/* Bit flags naming the string prefix letters (u, b, f) that are recognised. */
enum literalsAllowed {
	litNone = 0,
	litU = 1 << 0,
	litB = 1 << 1,
	litF = 1 << 2
};

/* Indicator number passed to IndicatorFill when marking indentation. */
const int indicatorWhitespace = 1;
69 
IsPyComment(Accessor & styler,Sci_Position pos,Sci_Position len)70 bool IsPyComment(Accessor &styler, Sci_Position pos, Sci_Position len) {
71 	return len > 0 && styler[pos] == '#';
72 }
73 
IsPyStringTypeChar(int ch,literalsAllowed allowed)74 bool IsPyStringTypeChar(int ch, literalsAllowed allowed) {
75 	return
76 		((allowed & litB) && (ch == 'b' || ch == 'B')) ||
77 		((allowed & litU) && (ch == 'u' || ch == 'U')) ||
78 		((allowed & litF) && (ch == 'f' || ch == 'F'));
79 }
80 
IsPyStringStart(int ch,int chNext,int chNext2,literalsAllowed allowed)81 bool IsPyStringStart(int ch, int chNext, int chNext2, literalsAllowed allowed) {
82 	if (ch == '\'' || ch == '"')
83 		return true;
84 	if (IsPyStringTypeChar(ch, allowed)) {
85 		if (chNext == '"' || chNext == '\'')
86 			return true;
87 		if ((chNext == 'r' || chNext == 'R') && (chNext2 == '"' || chNext2 == '\''))
88 			return true;
89 	}
90 	if ((ch == 'r' || ch == 'R') && (chNext == '"' || chNext == '\''))
91 		return true;
92 
93 	return false;
94 }
95 
IsPyFStringState(int st)96 bool IsPyFStringState(int st) {
97 	return ((st == SCE_P_FCHARACTER) || (st == SCE_P_FSTRING) ||
98 		(st == SCE_P_FTRIPLE) || (st == SCE_P_FTRIPLEDOUBLE));
99 }
100 
IsPySingleQuoteStringState(int st)101 bool IsPySingleQuoteStringState(int st) {
102 	return ((st == SCE_P_CHARACTER) || (st == SCE_P_STRING) ||
103 		(st == SCE_P_FCHARACTER) || (st == SCE_P_FSTRING));
104 }
105 
IsPyTripleQuoteStringState(int st)106 bool IsPyTripleQuoteStringState(int st) {
107 	return ((st == SCE_P_TRIPLE) || (st == SCE_P_TRIPLEDOUBLE) ||
108 		(st == SCE_P_FTRIPLE) || (st == SCE_P_FTRIPLEDOUBLE));
109 }
110 
GetPyStringQuoteChar(int st)111 char GetPyStringQuoteChar(int st) {
112 	if ((st == SCE_P_CHARACTER) || (st == SCE_P_FCHARACTER) ||
113 			(st == SCE_P_TRIPLE) || (st == SCE_P_FTRIPLE))
114 		return '\'';
115 	if ((st == SCE_P_STRING) || (st == SCE_P_FSTRING) ||
116 			(st == SCE_P_TRIPLEDOUBLE) || (st == SCE_P_FTRIPLEDOUBLE))
117 		return '"';
118 
119 	return '\0';
120 }
121 
PushStateToStack(int state,std::vector<SingleFStringExpState> & stack,SingleFStringExpState * & currentFStringExp)122 void PushStateToStack(int state, std::vector<SingleFStringExpState> &stack, SingleFStringExpState *&currentFStringExp) {
123 	SingleFStringExpState single = {state, 0};
124 	stack.push_back(single);
125 
126 	currentFStringExp = &stack.back();
127 }
128 
PopFromStateStack(std::vector<SingleFStringExpState> & stack,SingleFStringExpState * & currentFStringExp)129 int PopFromStateStack(std::vector<SingleFStringExpState> &stack, SingleFStringExpState *&currentFStringExp) {
130 	int state = 0;
131 
132 	if (!stack.empty()) {
133 		state = stack.back().state;
134 		stack.pop_back();
135 	}
136 
137 	if (stack.empty()) {
138 		currentFStringExp = NULL;
139 	} else {
140 		currentFStringExp = &stack.back();
141 	}
142 
143 	return state;
144 }
145 
146 /* Return the state to use for the string starting at i; *nextIndex will be set to the first index following the quote(s) */
GetPyStringState(Accessor & styler,Sci_Position i,Sci_PositionU * nextIndex,literalsAllowed allowed)147 int GetPyStringState(Accessor &styler, Sci_Position i, Sci_PositionU *nextIndex, literalsAllowed allowed) {
148 	char ch = styler.SafeGetCharAt(i);
149 	char chNext = styler.SafeGetCharAt(i + 1);
150 	const int firstIsF = (ch == 'f' || ch == 'F');
151 
152 	// Advance beyond r, u, or ur prefix (or r, b, or br in Python 2.7+ and r, f, or fr in Python 3.6+), but bail if there are any unexpected chars
153 	if (ch == 'r' || ch == 'R') {
154 		i++;
155 		ch = styler.SafeGetCharAt(i);
156 		chNext = styler.SafeGetCharAt(i + 1);
157 	} else if (IsPyStringTypeChar(ch, allowed)) {
158 		if (chNext == 'r' || chNext == 'R')
159 			i += 2;
160 		else
161 			i += 1;
162 		ch = styler.SafeGetCharAt(i);
163 		chNext = styler.SafeGetCharAt(i + 1);
164 	}
165 
166 	if (ch != '"' && ch != '\'') {
167 		*nextIndex = i + 1;
168 		return SCE_P_DEFAULT;
169 	}
170 
171 	if (ch == chNext && ch == styler.SafeGetCharAt(i + 2)) {
172 		*nextIndex = i + 3;
173 
174 		if (ch == '"')
175 			return (firstIsF ? SCE_P_FTRIPLEDOUBLE : SCE_P_TRIPLEDOUBLE);
176 		else
177 			return (firstIsF ? SCE_P_FTRIPLE : SCE_P_TRIPLE);
178 	} else {
179 		*nextIndex = i + 1;
180 
181 		if (ch == '"')
182 			return (firstIsF ? SCE_P_FSTRING : SCE_P_STRING);
183 		else
184 			return (firstIsF ? SCE_P_FCHARACTER : SCE_P_CHARACTER);
185 	}
186 }
187 
IsAWordChar(int ch,bool unicodeIdentifiers)188 inline bool IsAWordChar(int ch, bool unicodeIdentifiers) {
189 	if (ch < 0x80)
190 		return (isalnum(ch) || ch == '.' || ch == '_');
191 
192 	if (!unicodeIdentifiers)
193 		return false;
194 
195 	// Python uses the XID_Continue set from unicode data
196 	return IsXidContinue(ch);
197 }
198 
IsAWordStart(int ch,bool unicodeIdentifiers)199 inline bool IsAWordStart(int ch, bool unicodeIdentifiers) {
200 	if (ch < 0x80)
201 		return (isalpha(ch) || ch == '_');
202 
203 	if (!unicodeIdentifiers)
204 		return false;
205 
206 	// Python uses the XID_Start set from unicode data
207 	return IsXidStart(ch);
208 }
209 
IsFirstNonWhitespace(Sci_Position pos,Accessor & styler)210 static bool IsFirstNonWhitespace(Sci_Position pos, Accessor &styler) {
211 	Sci_Position line = styler.GetLine(pos);
212 	Sci_Position start_pos = styler.LineStart(line);
213 	for (Sci_Position i = start_pos; i < pos; i++) {
214 		const char ch = styler[i];
215 		if (!(ch == ' ' || ch == '\t'))
216 			return false;
217 	}
218 	return true;
219 }
220 
221 // Options used for LexerPython
222 struct OptionsPython {
223 	int whingeLevel;
224 	bool base2or8Literals;
225 	bool stringsU;
226 	bool stringsB;
227 	bool stringsF;
228 	bool stringsOverNewline;
229 	bool keywords2NoSubIdentifiers;
230 	bool fold;
231 	bool foldQuotes;
232 	bool foldCompact;
233 	bool unicodeIdentifiers;
234 
OptionsPython__anond1800df60111::OptionsPython235 	OptionsPython() {
236 		whingeLevel = 0;
237 		base2or8Literals = true;
238 		stringsU = true;
239 		stringsB = true;
240 		stringsF = true;
241 		stringsOverNewline = false;
242 		keywords2NoSubIdentifiers = false;
243 		fold = false;
244 		foldQuotes = false;
245 		foldCompact = false;
246 		unicodeIdentifiers = true;
247 	}
248 
AllowedLiterals__anond1800df60111::OptionsPython249 	literalsAllowed AllowedLiterals() const {
250 		literalsAllowed allowedLiterals = stringsU ? litU : litNone;
251 		if (stringsB)
252 			allowedLiterals = static_cast<literalsAllowed>(allowedLiterals | litB);
253 		if (stringsF)
254 			allowedLiterals = static_cast<literalsAllowed>(allowedLiterals | litF);
255 		return allowedLiterals;
256 	}
257 };
258 
// Descriptions of the word lists, in the index order used by WordListSet; null terminated
static const char *const pythonWordListDesc[] = {
	"Keywords",                 // list 0
	"Highlighted identifiers",  // list 1
	nullptr
};
264 
265 struct OptionSetPython : public OptionSet<OptionsPython> {
OptionSetPython__anond1800df60111::OptionSetPython266 	OptionSetPython() {
267 		DefineProperty("tab.timmy.whinge.level", &OptionsPython::whingeLevel,
268 			       "For Python code, checks whether indenting is consistent. "
269 			       "The default, 0 turns off indentation checking, "
270 			       "1 checks whether each line is potentially inconsistent with the previous line, "
271 			       "2 checks whether any space characters occur before a tab character in the indentation, "
272 			       "3 checks whether any spaces are in the indentation, and "
273 			       "4 checks for any tab characters in the indentation. "
274 			       "1 is a good level to use.");
275 
276 		DefineProperty("lexer.python.literals.binary", &OptionsPython::base2or8Literals,
277 			       "Set to 0 to not recognise Python 3 binary and octal literals: 0b1011 0o712.");
278 
279 		DefineProperty("lexer.python.strings.u", &OptionsPython::stringsU,
280 			       "Set to 0 to not recognise Python Unicode literals u\"x\" as used before Python 3.");
281 
282 		DefineProperty("lexer.python.strings.b", &OptionsPython::stringsB,
283 			       "Set to 0 to not recognise Python 3 bytes literals b\"x\".");
284 
285 		DefineProperty("lexer.python.strings.f", &OptionsPython::stringsF,
286 			       "Set to 0 to not recognise Python 3.6 f-string literals f\"var={var}\".");
287 
288 		DefineProperty("lexer.python.strings.over.newline", &OptionsPython::stringsOverNewline,
289 			       "Set to 1 to allow strings to span newline characters.");
290 
291 		DefineProperty("lexer.python.keywords2.no.sub.identifiers", &OptionsPython::keywords2NoSubIdentifiers,
292 			       "When enabled, it will not style keywords2 items that are used as a sub-identifier. "
293 			       "Example: when set, will not highlight \"foo.open\" when \"open\" is a keywords2 item.");
294 
295 		DefineProperty("fold", &OptionsPython::fold);
296 
297 		DefineProperty("fold.quotes.python", &OptionsPython::foldQuotes,
298 			       "This option enables folding multi-line quoted strings when using the Python lexer.");
299 
300 		DefineProperty("fold.compact", &OptionsPython::foldCompact);
301 
302 		DefineProperty("lexer.python.unicode.identifiers", &OptionsPython::unicodeIdentifiers,
303 			       "Set to 0 to not recognise Python 3 unicode identifiers.");
304 
305 		DefineWordListSets(pythonWordListDesc);
306 	}
307 };
308 
309 const char styleSubable[] = { SCE_P_IDENTIFIER, 0 };
310 
311 LexicalClass lexicalClasses[] = {
312 	// Lexer Python SCLEX_PYTHON SCE_P_:
313 	0, "SCE_P_DEFAULT", "default", "White space",
314 	1, "SCE_P_COMMENTLINE", "comment line", "Comment",
315 	2, "SCE_P_NUMBER", "literal numeric", "Number",
316 	3, "SCE_P_STRING", "literal string", "String",
317 	4, "SCE_P_CHARACTER", "literal string", "Single quoted string",
318 	5, "SCE_P_WORD", "keyword", "Keyword",
319 	6, "SCE_P_TRIPLE", "literal string", "Triple quotes",
320 	7, "SCE_P_TRIPLEDOUBLE", "literal string", "Triple double quotes",
321 	8, "SCE_P_CLASSNAME", "identifier", "Class name definition",
322 	9, "SCE_P_DEFNAME", "identifier", "Function or method name definition",
323 	10, "SCE_P_OPERATOR", "operator", "Operators",
324 	11, "SCE_P_IDENTIFIER", "identifier", "Identifiers",
325 	12, "SCE_P_COMMENTBLOCK", "comment", "Comment-blocks",
326 	13, "SCE_P_STRINGEOL", "error literal string", "End of line where string is not closed",
327 	14, "SCE_P_WORD2", "identifier", "Highlighted identifiers",
328 	15, "SCE_P_DECORATOR", "preprocessor", "Decorators",
329 	16, "SCE_P_FSTRING", "literal string interpolated", "F-String",
330 	17, "SCE_P_FCHARACTER", "literal string interpolated", "Single quoted f-string",
331 	18, "SCE_P_FTRIPLE", "literal string interpolated", "Triple quoted f-string",
332 	19, "SCE_P_FTRIPLEDOUBLE", "literal string interpolated", "Triple double quoted f-string",
333 };
334 
335 }
336 
337 class LexerPython : public DefaultLexer {
338 	WordList keywords;
339 	WordList keywords2;
340 	OptionsPython options;
341 	OptionSetPython osPython;
342 	enum { ssIdentifier };
343 	SubStyles subStyles;
344 	std::map<Sci_Position, std::vector<SingleFStringExpState> > ftripleStateAtEol;
345 public:
LexerPython()346 	explicit LexerPython() :
347 		DefaultLexer("python", SCLEX_PYTHON, lexicalClasses, ELEMENTS(lexicalClasses)),
348 		subStyles(styleSubable, 0x80, 0x40, 0) {
349 	}
~LexerPython()350 	~LexerPython() override {
351 	}
Release()352 	void SCI_METHOD Release() override {
353 		delete this;
354 	}
Version() const355 	int SCI_METHOD Version() const override {
356 		return lvRelease5;
357 	}
PropertyNames()358 	const char *SCI_METHOD PropertyNames() override {
359 		return osPython.PropertyNames();
360 	}
PropertyType(const char * name)361 	int SCI_METHOD PropertyType(const char *name) override {
362 		return osPython.PropertyType(name);
363 	}
DescribeProperty(const char * name)364 	const char *SCI_METHOD DescribeProperty(const char *name) override {
365 		return osPython.DescribeProperty(name);
366 	}
367 	Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override;
PropertyGet(const char * key)368 	const char * SCI_METHOD PropertyGet(const char *key) override {
369 		return osPython.PropertyGet(key);
370 	}
DescribeWordListSets()371 	const char *SCI_METHOD DescribeWordListSets() override {
372 		return osPython.DescribeWordListSets();
373 	}
374 	Sci_Position SCI_METHOD WordListSet(int n, const char *wl) override;
375 	void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
376 	void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
377 
PrivateCall(int,void *)378 	void *SCI_METHOD PrivateCall(int, void *) override {
379 		return 0;
380 	}
381 
LineEndTypesSupported()382 	int SCI_METHOD LineEndTypesSupported() override {
383 		return SC_LINE_END_TYPE_UNICODE;
384 	}
385 
AllocateSubStyles(int styleBase,int numberStyles)386 	int SCI_METHOD AllocateSubStyles(int styleBase, int numberStyles) override {
387 		return subStyles.Allocate(styleBase, numberStyles);
388 	}
SubStylesStart(int styleBase)389 	int SCI_METHOD SubStylesStart(int styleBase) override {
390 		return subStyles.Start(styleBase);
391 	}
SubStylesLength(int styleBase)392 	int SCI_METHOD SubStylesLength(int styleBase) override {
393 		return subStyles.Length(styleBase);
394 	}
StyleFromSubStyle(int subStyle)395 	int SCI_METHOD StyleFromSubStyle(int subStyle) override {
396 		const int styleBase = subStyles.BaseStyle(subStyle);
397 		return styleBase;
398 	}
PrimaryStyleFromStyle(int style)399 	int SCI_METHOD PrimaryStyleFromStyle(int style) override {
400 		return style;
401 	}
FreeSubStyles()402 	void SCI_METHOD FreeSubStyles() override {
403 		subStyles.Free();
404 	}
SetIdentifiers(int style,const char * identifiers)405 	void SCI_METHOD SetIdentifiers(int style, const char *identifiers) override {
406 		subStyles.SetIdentifiers(style, identifiers);
407 	}
DistanceToSecondaryStyles()408 	int SCI_METHOD DistanceToSecondaryStyles() override {
409 		return 0;
410 	}
GetSubStyleBases()411 	const char *SCI_METHOD GetSubStyleBases() override {
412 		return styleSubable;
413 	}
414 
LexerFactoryPython()415 	static ILexer5 *LexerFactoryPython() {
416 		return new LexerPython();
417 	}
418 
419 private:
420 	void ProcessLineEnd(StyleContext &sc, std::vector<SingleFStringExpState> &fstringStateStack, SingleFStringExpState *&currentFStringExp, bool &inContinuedString);
421 };
422 
PropertySet(const char * key,const char * val)423 Sci_Position SCI_METHOD LexerPython::PropertySet(const char *key, const char *val) {
424 	if (osPython.PropertySet(&options, key, val)) {
425 		return 0;
426 	}
427 	return -1;
428 }
429 
WordListSet(int n,const char * wl)430 Sci_Position SCI_METHOD LexerPython::WordListSet(int n, const char *wl) {
431 	WordList *wordListN = 0;
432 	switch (n) {
433 	case 0:
434 		wordListN = &keywords;
435 		break;
436 	case 1:
437 		wordListN = &keywords2;
438 		break;
439 	}
440 	Sci_Position firstModification = -1;
441 	if (wordListN) {
442 		WordList wlNew;
443 		wlNew.Set(wl);
444 		if (*wordListN != wlNew) {
445 			wordListN->Set(wl);
446 			firstModification = 0;
447 		}
448 	}
449 	return firstModification;
450 }
451 
ProcessLineEnd(StyleContext & sc,std::vector<SingleFStringExpState> & fstringStateStack,SingleFStringExpState * & currentFStringExp,bool & inContinuedString)452 void LexerPython::ProcessLineEnd(StyleContext &sc, std::vector<SingleFStringExpState> &fstringStateStack, SingleFStringExpState *&currentFStringExp, bool &inContinuedString) {
453 	long deepestSingleStateIndex = -1;
454 	unsigned long i;
455 
456 	// Find the deepest single quote state because that string will end; no \ continuation in f-string
457 	for (i = 0; i < fstringStateStack.size(); i++) {
458 		if (IsPySingleQuoteStringState(fstringStateStack[i].state)) {
459 			deepestSingleStateIndex = i;
460 			break;
461 		}
462 	}
463 
464 	if (deepestSingleStateIndex != -1) {
465 		sc.SetState(fstringStateStack[deepestSingleStateIndex].state);
466 		while (fstringStateStack.size() > static_cast<unsigned long>(deepestSingleStateIndex)) {
467 			PopFromStateStack(fstringStateStack, currentFStringExp);
468 		}
469 	}
470 	if (!fstringStateStack.empty()) {
471 		std::pair<Sci_Position, std::vector<SingleFStringExpState> > val;
472 		val.first = sc.currentLine;
473 		val.second = fstringStateStack;
474 
475 		ftripleStateAtEol.insert(val);
476 	}
477 
478 	if ((sc.state == SCE_P_DEFAULT)
479 			|| IsPyTripleQuoteStringState(sc.state)) {
480 		// Perform colourisation of white space and triple quoted strings at end of each line to allow
481 		// tab marking to work inside white space and triple quoted strings
482 		sc.SetState(sc.state);
483 	}
484 	if (IsPySingleQuoteStringState(sc.state)) {
485 		if (inContinuedString || options.stringsOverNewline) {
486 			inContinuedString = false;
487 		} else {
488 			sc.ChangeState(SCE_P_STRINGEOL);
489 			sc.ForwardSetState(SCE_P_DEFAULT);
490 		}
491 	}
492 }
493 
// Style the document range that starts at startPos and spans length characters.
// The start is first moved back to the beginning of the previous line (and further
// back while the preceding line end is styled as a single quoted string or as an
// unterminated string), the f-string expression stack saved for the line before
// the new start is restored, and saved stacks from the new start line on are erased.
// Each character is then handled in three stages: decide whether the current state
// ends here, track f-string expressions (start, forced end at a closing quote,
// normal end and nesting), and finally, when in SCE_P_DEFAULT, decide whether a
// new state starts here.
Lex(Sci_PositionU startPos,Sci_Position length,int initStyle,IDocument * pAccess)494 void SCI_METHOD LexerPython::Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) {
495 	Accessor styler(pAccess, NULL);
496 
497 	// Track whether in f-string expression; vector is used for a stack to
498 	// handle nested f-strings such as f"""{f'''{f"{f'{1}'}"}'''}"""
499 	std::vector<SingleFStringExpState> fstringStateStack;
500 	SingleFStringExpState *currentFStringExp = NULL;
501 
502 	const Sci_Position endPos = startPos + length;
503 
504 	// Backtrack to previous line in case need to fix its tab whinging
505 	Sci_Position lineCurrent = styler.GetLine(startPos);
506 	if (startPos > 0) {
507 		if (lineCurrent > 0) {
508 			lineCurrent--;
509 			// Look for backslash-continued lines
510 			while (lineCurrent > 0) {
511 				Sci_Position eolPos = styler.LineStart(lineCurrent) - 1;
512 				const int eolStyle = styler.StyleAt(eolPos);
513 				if (eolStyle == SCE_P_STRING
514 						|| eolStyle == SCE_P_CHARACTER
515 						|| eolStyle == SCE_P_STRINGEOL) {
516 					lineCurrent -= 1;
517 				} else {
518 					break;
519 				}
520 			}
521 			startPos = styler.LineStart(lineCurrent);
522 		}
		// The initial style is taken from the character just before the new start
523 		initStyle = startPos == 0 ? SCE_P_DEFAULT : styler.StyleAt(startPos - 1);
524 	}
525 
526 	const literalsAllowed allowedLiterals = options.AllowedLiterals();
527 
	// Keep only the low five bits of the style value
528 	initStyle = initStyle & 31;
	// A preceding line end flagged as an unterminated string starts this line in default
529 	if (initStyle == SCE_P_STRINGEOL) {
530 		initStyle = SCE_P_DEFAULT;
531 	}
532 
533 	// Set up fstate stack from last line and remove any subsequent ftriple at eol states
534 	std::map<Sci_Position, std::vector<SingleFStringExpState> >::iterator it;
535 	it = ftripleStateAtEol.find(lineCurrent - 1);
536 	if (it != ftripleStateAtEol.end() && !it->second.empty()) {
537 		fstringStateStack = it->second;
538 		currentFStringExp = &fstringStateStack.back();
539 	}
540 	it = ftripleStateAtEol.lower_bound(lineCurrent);
541 	if (it != ftripleStateAtEol.end()) {
542 		ftripleStateAtEol.erase(it, ftripleStateAtEol.end());
543 	}
544 
545 	kwType kwLast = kwOther;
546 	int spaceFlags = 0;
	// Compute the indentation flags of the first line to be lexed
547 	styler.IndentAmount(lineCurrent, &spaceFlags, IsPyComment);
	// Set when the number being lexed started with a 0x, 0o or 0b prefix
548 	bool base_n_number = false;
549 
550 	const WordClassifier &classifierIdentifiers = subStyles.Classifier(SCE_P_IDENTIFIER);
551 
552 	StyleContext sc(startPos, endPos - startPos, initStyle, styler);
553 
554 	bool indentGood = true;
555 	Sci_Position startIndicator = sc.currentPos;
556 	bool inContinuedString = false;
557 
558 	for (; sc.More(); sc.Forward()) {
559 
		// At the start of each line, apply the indentation check selected by whingeLevel
560 		if (sc.atLineStart) {
561 			styler.IndentAmount(lineCurrent, &spaceFlags, IsPyComment);
562 			indentGood = true;
563 			if (options.whingeLevel == 1) {
564 				indentGood = (spaceFlags & wsInconsistent) == 0;
565 			} else if (options.whingeLevel == 2) {
566 				indentGood = (spaceFlags & wsSpaceTab) == 0;
567 			} else if (options.whingeLevel == 3) {
568 				indentGood = (spaceFlags & wsSpace) == 0;
569 			} else if (options.whingeLevel == 4) {
570 				indentGood = (spaceFlags & wsTab) == 0;
571 			}
572 			if (!indentGood) {
				// Fill the text up to this line with indicator value 0 and start a new
				// indicator range here; it is filled with value 1 further below once the
				// first character that is not a space or tab is reached
573 				styler.IndicatorFill(startIndicator, sc.currentPos, indicatorWhitespace, 0);
574 				startIndicator = sc.currentPos;
575 			}
576 		}
577 
578 		if (sc.atLineEnd) {
579 			ProcessLineEnd(sc, fstringStateStack, currentFStringExp, inContinuedString);
580 			lineCurrent++;
581 			if (!sc.More())
582 				break;
583 		}
584 
		// Set by any state exit below that moves forward, since that may land on a line end
585 		bool needEOLCheck = false;
586 
587 
		// Stage 1: check whether the current state ends at this character
588 		if (sc.state == SCE_P_OPERATOR) {
589 			kwLast = kwOther;
590 			sc.SetState(SCE_P_DEFAULT);
591 		} else if (sc.state == SCE_P_NUMBER) {
			// The number ends at the first non-word character, except that a sign directly
			// after e or E continues a number that did not start with a base prefix
592 			if (!IsAWordChar(sc.ch, false) &&
593 					!(!base_n_number && ((sc.ch == '+' || sc.ch == '-') && (sc.chPrev == 'e' || sc.chPrev == 'E')))) {
594 				sc.SetState(SCE_P_DEFAULT);
595 			}
596 		} else if (sc.state == SCE_P_IDENTIFIER) {
			// An identifier ends at '.' or at any non-word character; the collected text is
			// then classified
597 			if ((sc.ch == '.') || (!IsAWordChar(sc.ch, options.unicodeIdentifiers))) {
598 				char s[100];
599 				sc.GetCurrent(s, sizeof(s));
600 				int style = SCE_P_IDENTIFIER;
601 				if ((kwLast == kwImport) && (strcmp(s, "as") == 0)) {
602 					style = SCE_P_WORD;
603 				} else if (keywords.InList(s)) {
604 					style = SCE_P_WORD;
605 				} else if (kwLast == kwClass) {
606 					style = SCE_P_CLASSNAME;
607 				} else if (kwLast == kwDef) {
608 					style = SCE_P_DEFNAME;
609 				} else if (kwLast == kwCDef || kwLast == kwCPDef) {
					// After cdef / cpdef, look past white space following the name:
					// '(' makes it a function name, ':' a class name, anything else
					// leaves it a plain identifier
610 					Sci_Position pos = sc.currentPos;
611 					unsigned char ch = styler.SafeGetCharAt(pos, '\0');
612 					while (ch != '\0') {
613 						if (ch == '(') {
614 							style = SCE_P_DEFNAME;
615 							break;
616 						} else if (ch == ':') {
617 							style = SCE_P_CLASSNAME;
618 							break;
619 						} else if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') {
620 							pos++;
621 							ch = styler.SafeGetCharAt(pos, '\0');
622 						} else {
623 							break;
624 						}
625 					}
626 				} else if (keywords2.InList(s)) {
627 					if (options.keywords2NoSubIdentifiers) {
628 						// We don't want to highlight keywords2
629 						// that are used as a sub-identifier,
630 						// i.e. not open in "foo.open".
631 						Sci_Position pos = styler.GetStartSegment() - 1;
632 						if (pos < 0 || (styler.SafeGetCharAt(pos, '\0') != '.'))
633 							style = SCE_P_WORD2;
634 					} else {
635 						style = SCE_P_WORD2;
636 					}
637 				} else {
					// Not in either word list: use a sub-style if the classifier has one
638 					int subStyle = classifierIdentifiers.ValueFor(s);
639 					if (subStyle >= 0) {
640 						style = subStyle;
641 					}
642 				}
643 				sc.ChangeState(style);
644 				sc.SetState(SCE_P_DEFAULT);
				// Remember the keyword so that the identifier after it can be classified;
				// a cdef / cpdef context is kept until it is cleared at the line end
645 				if (style == SCE_P_WORD) {
646 					if (0 == strcmp(s, "class"))
647 						kwLast = kwClass;
648 					else if (0 == strcmp(s, "def"))
649 						kwLast = kwDef;
650 					else if (0 == strcmp(s, "import"))
651 						kwLast = kwImport;
652 					else if (0 == strcmp(s, "cdef"))
653 						kwLast = kwCDef;
654 					else if (0 == strcmp(s, "cpdef"))
655 						kwLast = kwCPDef;
656 					else if (0 == strcmp(s, "cimport"))
657 						kwLast = kwImport;
658 					else if (kwLast != kwCDef && kwLast != kwCPDef)
659 						kwLast = kwOther;
660 				} else if (kwLast != kwCDef && kwLast != kwCPDef) {
661 					kwLast = kwOther;
662 				}
663 			}
664 		} else if ((sc.state == SCE_P_COMMENTLINE) || (sc.state == SCE_P_COMMENTBLOCK)) {
665 			if (sc.ch == '\r' || sc.ch == '\n') {
666 				sc.SetState(SCE_P_DEFAULT);
667 			}
668 		} else if (sc.state == SCE_P_DECORATOR) {
669 			if (!IsAWordStart(sc.ch, options.unicodeIdentifiers)) {
670 				sc.SetState(SCE_P_DEFAULT);
671 			}
672 		} else if (IsPySingleQuoteStringState(sc.state)) {
673 			if (sc.ch == '\\') {
				// A backslash directly before a line end marks the string as continued on
				// the next line; before any other character it skips that character
674 				if ((sc.chNext == '\r') && (sc.GetRelative(2) == '\n')) {
675 					sc.Forward();
676 				}
677 				if (sc.chNext == '\n' || sc.chNext == '\r') {
678 					inContinuedString = true;
679 				} else {
680 					// Don't roll over the newline.
681 					sc.Forward();
682 				}
683 			} else if (sc.ch == GetPyStringQuoteChar(sc.state)) {
684 				sc.ForwardSetState(SCE_P_DEFAULT);
685 				needEOLCheck = true;
686 			}
687 		} else if ((sc.state == SCE_P_TRIPLE) || (sc.state == SCE_P_FTRIPLE)) {
688 			if (sc.ch == '\\') {
689 				sc.Forward();
690 			} else if (sc.Match(R"(''')")) {
691 				sc.Forward();
692 				sc.Forward();
693 				sc.ForwardSetState(SCE_P_DEFAULT);
694 				needEOLCheck = true;
695 			}
696 		} else if ((sc.state == SCE_P_TRIPLEDOUBLE) || (sc.state == SCE_P_FTRIPLEDOUBLE)) {
697 			if (sc.ch == '\\') {
698 				sc.Forward();
699 			} else if (sc.Match(R"(""")")) {
700 				sc.Forward();
701 				sc.Forward();
702 				sc.ForwardSetState(SCE_P_DEFAULT);
703 				needEOLCheck = true;
704 			}
705 		}
706 
		// Stage 2: f-string expression tracking
707 		// Note if used and not if else because string states also match
708 		// some of the above clauses
709 		if (IsPyFStringState(sc.state) && sc.ch == '{') {
			// {{ is skipped as a pair and stays in the string; a single { starts an
			// expression that is lexed from the default state
710 			if (sc.chNext == '{') {
711 				sc.Forward();
712 			} else {
713 				PushStateToStack(sc.state, fstringStateStack, currentFStringExp);
714 				sc.ForwardSetState(SCE_P_DEFAULT);
715 			}
716 			needEOLCheck = true;
717 		}
718 
719 		// If in an f-string expression, check for the ending quote(s)
720 		// and end f-string to handle syntactically incorrect cases like
721 		// f'{' and f"""{"""
722 		if (!fstringStateStack.empty() && (sc.ch == '\'' || sc.ch == '"')) {
			// Search from the bottom of the stack for the first f-string closed by this quote
723 			long matching_stack_i = -1;
724 			for (unsigned long stack_i = 0; stack_i < fstringStateStack.size() && matching_stack_i == -1; stack_i++) {
725 				const int stack_state = fstringStateStack[stack_i].state;
726 				const char quote = GetPyStringQuoteChar(stack_state);
727 				if (sc.ch == quote) {
728 					if (IsPySingleQuoteStringState(stack_state)) {
729 						matching_stack_i = stack_i;
730 					} else if (quote == '"' ? sc.Match(R"(""")") : sc.Match("'''")) {
731 						matching_stack_i = stack_i;
732 					}
733 				}
734 			}
735 
736 			if (matching_stack_i != -1) {
				// Style the closing quote(s) as that f-string, then drop its entry and
				// every entry nested inside it
737 				sc.SetState(fstringStateStack[matching_stack_i].state);
738 				if (IsPyTripleQuoteStringState(fstringStateStack[matching_stack_i].state)) {
739 					sc.Forward();
740 					sc.Forward();
741 				}
742 				sc.ForwardSetState(SCE_P_DEFAULT);
743 				needEOLCheck = true;
744 
745 				while (fstringStateStack.size() > static_cast<unsigned long>(matching_stack_i)) {
746 					PopFromStateStack(fstringStateStack, currentFStringExp);
747 				}
748 			}
749 		}
750 		// End of code to find the end of a state
751 
		// First character that is not a space or tab on a badly indented line:
		// fill the indentation with indicator value 1
752 		if (!indentGood && !IsASpaceOrTab(sc.ch)) {
753 			styler.IndicatorFill(startIndicator, sc.currentPos, indicatorWhitespace, 1);
754 			startIndicator = sc.currentPos;
755 			indentGood = true;
756 		}
757 
758 		// One cdef or cpdef line, clear kwLast only at end of line
759 		if ((kwLast == kwCDef || kwLast == kwCPDef) && sc.atLineEnd) {
760 			kwLast = kwOther;
761 		}
762 
763 		// State exit code may have moved on to end of line
764 		if (needEOLCheck && sc.atLineEnd) {
765 			ProcessLineEnd(sc, fstringStateStack, currentFStringExp, inContinuedString);
766 			lineCurrent++;
767 			styler.IndentAmount(lineCurrent, &spaceFlags, IsPyComment);
768 			if (!sc.More())
769 				break;
770 		}
771 
772 		// If in f-string expression, check for }, :, ! to resume f-string state or update nesting count
773 		if (currentFStringExp != NULL && !IsPySingleQuoteStringState(sc.state) && !IsPyTripleQuoteStringState(sc.state)) {
			// With no open brackets, } or : or ! (but not !=) ends the expression and
			// resumes the f-string style popped from the stack
774 			if (currentFStringExp->nestingCount == 0 && (sc.ch == '}' || sc.ch == ':' || (sc.ch == '!' && sc.chNext != '='))) {
775 				sc.SetState(PopFromStateStack(fstringStateStack, currentFStringExp));
776 			} else {
777 				if (sc.ch == '{' || sc.ch == '[' || sc.ch == '(') {
778 					currentFStringExp->nestingCount++;
779 				} else if (sc.ch == '}' || sc.ch == ']' || sc.ch == ')') {
780 					currentFStringExp->nestingCount--;
781 				}
782 			}
783 		}
784 
		// Stage 3: start a new state
785 		// Check for a new state starting character
786 		if (sc.state == SCE_P_DEFAULT) {
787 			if (IsADigit(sc.ch) || (sc.ch == '.' && IsADigit(sc.chNext))) {
788 				if (sc.ch == '0' && (sc.chNext == 'x' || sc.chNext == 'X')) {
789 					base_n_number = true;
790 					sc.SetState(SCE_P_NUMBER);
791 				} else if (sc.ch == '0' &&
792 						(sc.chNext == 'o' || sc.chNext == 'O' || sc.chNext == 'b' || sc.chNext == 'B')) {
793 					if (options.base2or8Literals) {
794 						base_n_number = true;
795 						sc.SetState(SCE_P_NUMBER);
796 					} else {
						// Binary and octal literals disabled: only the 0 is a number and
						// the text after it is lexed as an identifier
797 						sc.SetState(SCE_P_NUMBER);
798 						sc.ForwardSetState(SCE_P_IDENTIFIER);
799 					}
800 				} else {
801 					base_n_number = false;
802 					sc.SetState(SCE_P_NUMBER);
803 				}
804 			} else if ((IsASCII(sc.ch) && isoperator(static_cast<char>(sc.ch))) || sc.ch == '`') {
805 				sc.SetState(SCE_P_OPERATOR);
806 			} else if (sc.ch == '#') {
				// ## starts a comment block, a single # a comment line
807 				sc.SetState(sc.chNext == '#' ? SCE_P_COMMENTBLOCK : SCE_P_COMMENTLINE);
808 			} else if (sc.ch == '@') {
				// @ is a decorator only when it is the first non-white-space character of the line
809 				if (IsFirstNonWhitespace(sc.currentPos, styler))
810 					sc.SetState(SCE_P_DECORATOR);
811 				else
812 					sc.SetState(SCE_P_OPERATOR);
813 			} else if (IsPyStringStart(sc.ch, sc.chNext, sc.GetRelative(2), allowedLiterals)) {
814 				Sci_PositionU nextIndex = 0;
815 				sc.SetState(GetPyStringState(styler, sc.currentPos, &nextIndex, allowedLiterals));
				// Advance to the last character of the prefix and opening quote(s);
				// the Forward of the enclosing loop then steps past it
816 				while (nextIndex > (sc.currentPos + 1) && sc.More()) {
817 					sc.Forward();
818 				}
819 			} else if (IsAWordStart(sc.ch, options.unicodeIdentifiers)) {
820 				sc.SetState(SCE_P_IDENTIFIER);
821 			}
822 		}
823 	}
	// Fill the remaining indicator range with value 0 and flush the pending styling
824 	styler.IndicatorFill(startIndicator, sc.currentPos, indicatorWhitespace, 0);
825 	sc.Complete();
826 }
827 
IsCommentLine(Sci_Position line,Accessor & styler)828 static bool IsCommentLine(Sci_Position line, Accessor &styler) {
829 	Sci_Position pos = styler.LineStart(line);
830 	const Sci_Position eol_pos = styler.LineStart(line + 1) - 1;
831 	for (Sci_Position i = pos; i < eol_pos; i++) {
832 		const char ch = styler[i];
833 		if (ch == '#')
834 			return true;
835 		else if (ch != ' ' && ch != '\t')
836 			return false;
837 	}
838 	return false;
839 }
840 
IsQuoteLine(Sci_Position line,const Accessor & styler)841 static bool IsQuoteLine(Sci_Position line, const Accessor &styler) {
842 	const int style = styler.StyleAt(styler.LineStart(line)) & 31;
843 	return IsPyTripleQuoteStringState(style);
844 }
845 
846 
Fold(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)847 void SCI_METHOD LexerPython::Fold(Sci_PositionU startPos, Sci_Position length, int /*initStyle - unused*/, IDocument *pAccess) {
848 	if (!options.fold)
849 		return;
850 
851 	Accessor styler(pAccess, NULL);
852 
853 	const Sci_Position maxPos = startPos + length;
854 	const Sci_Position maxLines = (maxPos == styler.Length()) ? styler.GetLine(maxPos) : styler.GetLine(maxPos - 1);	// Requested last line
855 	const Sci_Position docLines = styler.GetLine(styler.Length());	// Available last line
856 
857 	// Backtrack to previous non-blank line so we can determine indent level
858 	// for any white space lines (needed esp. within triple quoted strings)
859 	// and so we can fix any preceding fold level (which is why we go back
860 	// at least one line in all cases)
861 	int spaceFlags = 0;
862 	Sci_Position lineCurrent = styler.GetLine(startPos);
863 	int indentCurrent = styler.IndentAmount(lineCurrent, &spaceFlags, NULL);
864 	while (lineCurrent > 0) {
865 		lineCurrent--;
866 		indentCurrent = styler.IndentAmount(lineCurrent, &spaceFlags, NULL);
867 		if (!(indentCurrent & SC_FOLDLEVELWHITEFLAG) &&
868 				(!IsCommentLine(lineCurrent, styler)) &&
869 				(!IsQuoteLine(lineCurrent, styler)))
870 			break;
871 	}
872 	int indentCurrentLevel = indentCurrent & SC_FOLDLEVELNUMBERMASK;
873 
874 	// Set up initial loop state
875 	startPos = styler.LineStart(lineCurrent);
876 	int prev_state = SCE_P_DEFAULT & 31;
877 	if (lineCurrent >= 1)
878 		prev_state = styler.StyleAt(startPos - 1) & 31;
879 	int prevQuote = options.foldQuotes && IsPyTripleQuoteStringState(prev_state);
880 
881 	// Process all characters to end of requested range or end of any triple quote
882 	//that hangs over the end of the range.  Cap processing in all cases
883 	// to end of document (in case of unclosed quote at end).
884 	while ((lineCurrent <= docLines) && ((lineCurrent <= maxLines) || prevQuote)) {
885 
886 		// Gather info
887 		int lev = indentCurrent;
888 		Sci_Position lineNext = lineCurrent + 1;
889 		int indentNext = indentCurrent;
890 		int quote = false;
891 		if (lineNext <= docLines) {
892 			// Information about next line is only available if not at end of document
893 			indentNext = styler.IndentAmount(lineNext, &spaceFlags, NULL);
894 			Sci_Position lookAtPos = (styler.LineStart(lineNext) == styler.Length()) ? styler.Length() - 1 : styler.LineStart(lineNext);
895 			const int style = styler.StyleAt(lookAtPos) & 31;
896 			quote = options.foldQuotes && IsPyTripleQuoteStringState(style);
897 		}
898 		const int quote_start = (quote && !prevQuote);
899 		const int quote_continue = (quote && prevQuote);
900 		if (!quote || !prevQuote)
901 			indentCurrentLevel = indentCurrent & SC_FOLDLEVELNUMBERMASK;
902 		if (quote)
903 			indentNext = indentCurrentLevel;
904 		if (indentNext & SC_FOLDLEVELWHITEFLAG)
905 			indentNext = SC_FOLDLEVELWHITEFLAG | indentCurrentLevel;
906 
907 		if (quote_start) {
908 			// Place fold point at start of triple quoted string
909 			lev |= SC_FOLDLEVELHEADERFLAG;
910 		} else if (quote_continue || prevQuote) {
911 			// Add level to rest of lines in the string
912 			lev = lev + 1;
913 		}
914 
915 		// Skip past any blank lines for next indent level info; we skip also
916 		// comments (all comments, not just those starting in column 0)
917 		// which effectively folds them into surrounding code rather
918 		// than screwing up folding.  If comments end file, use the min
919 		// comment indent as the level after
920 
921 		int minCommentLevel = indentCurrentLevel;
922 		while (!quote &&
923 				(lineNext < docLines) &&
924 				((indentNext & SC_FOLDLEVELWHITEFLAG) ||
925 				 (lineNext <= docLines && IsCommentLine(lineNext, styler)))) {
926 
927 			if (IsCommentLine(lineNext, styler) && indentNext < minCommentLevel) {
928 				minCommentLevel = indentNext;
929 			}
930 
931 			lineNext++;
932 			indentNext = styler.IndentAmount(lineNext, &spaceFlags, NULL);
933 		}
934 
935 		const int levelAfterComments = ((lineNext < docLines) ? indentNext & SC_FOLDLEVELNUMBERMASK : minCommentLevel);
936 		const int levelBeforeComments = std::max(indentCurrentLevel, levelAfterComments);
937 
938 		// Now set all the indent levels on the lines we skipped
939 		// Do this from end to start.  Once we encounter one line
940 		// which is indented more than the line after the end of
941 		// the comment-block, use the level of the block before
942 
943 		Sci_Position skipLine = lineNext;
944 		int skipLevel = levelAfterComments;
945 
946 		while (--skipLine > lineCurrent) {
947 			const int skipLineIndent = styler.IndentAmount(skipLine, &spaceFlags, NULL);
948 
949 			if (options.foldCompact) {
950 				if ((skipLineIndent & SC_FOLDLEVELNUMBERMASK) > levelAfterComments)
951 					skipLevel = levelBeforeComments;
952 
953 				int whiteFlag = skipLineIndent & SC_FOLDLEVELWHITEFLAG;
954 
955 				styler.SetLevel(skipLine, skipLevel | whiteFlag);
956 			} else {
957 				if ((skipLineIndent & SC_FOLDLEVELNUMBERMASK) > levelAfterComments &&
958 						!(skipLineIndent & SC_FOLDLEVELWHITEFLAG) &&
959 						!IsCommentLine(skipLine, styler))
960 					skipLevel = levelBeforeComments;
961 
962 				styler.SetLevel(skipLine, skipLevel);
963 			}
964 		}
965 
966 		// Set fold header on non-quote line
967 		if (!quote && !(indentCurrent & SC_FOLDLEVELWHITEFLAG)) {
968 			if ((indentCurrent & SC_FOLDLEVELNUMBERMASK) < (indentNext & SC_FOLDLEVELNUMBERMASK))
969 				lev |= SC_FOLDLEVELHEADERFLAG;
970 		}
971 
972 		// Keep track of triple quote state of previous line
973 		prevQuote = quote;
974 
975 		// Set fold level for this line and move to next line
976 		styler.SetLevel(lineCurrent, options.foldCompact ? lev : lev & ~SC_FOLDLEVELWHITEFLAG);
977 		indentCurrent = indentNext;
978 		lineCurrent = lineNext;
979 	}
980 
981 	// NOTE: Cannot set level of last line here because indentCurrent doesn't have
982 	// header flag set; the loop above is crafted to take care of this case!
983 	//styler.SetLevel(lineCurrent, indentCurrent);
984 }
985 
986 LexerModule lmPython(SCLEX_PYTHON, LexerPython::LexerFactoryPython, "python",
987 		     pythonWordListDesc);
988