1 // Scintilla source code edit control
2 /** @file Document.cxx
3  ** Text document that handles notifications, DBCS, styling, words and end of line.
4  **/
5 // Copyright 1998-2011 by Neil Hodgson <neilh@scintilla.org>
6 // The License.txt file describes the conditions under which this software may be distributed.
7 
8 #include <cstddef>
9 #include <cstdlib>
10 #include <cassert>
11 #include <cstring>
12 #include <cstdio>
13 #include <cmath>
14 
15 #include <stdexcept>
16 #include <string>
17 #include <string_view>
18 #include <vector>
19 #include <forward_list>
20 #include <algorithm>
21 #include <memory>
22 #include <chrono>
23 
24 #ifndef NO_CXX11_REGEX
25 #include <regex>
26 #endif
27 
28 #include "Platform.h"
29 
30 #include "ILoader.h"
31 #include "ILexer.h"
32 #include "Scintilla.h"
33 
34 #include "CharacterSet.h"
35 #include "CharacterCategory.h"
36 #include "Position.h"
37 #include "SplitVector.h"
38 #include "Partitioning.h"
39 #include "RunStyles.h"
40 #include "CellBuffer.h"
41 #include "PerLine.h"
42 #include "CharClassify.h"
43 #include "Decoration.h"
44 #include "CaseFolder.h"
45 #include "Document.h"
46 #include "RESearch.h"
47 #include "UniConversion.h"
48 #include "ElapsedPeriod.h"
49 
50 using namespace Scintilla;
51 
Colourise(Sci::Position start,Sci::Position end)52 void LexInterface::Colourise(Sci::Position start, Sci::Position end) {
53 	if (pdoc && instance && !performingStyle) {
54 		// Protect against reentrance, which may occur, for example, when
55 		// fold points are discovered while performing styling and the folding
56 		// code looks for child lines which may trigger styling.
57 		performingStyle = true;
58 
59 		const Sci::Position lengthDoc = pdoc->Length();
60 		if (end == -1)
61 			end = lengthDoc;
62 		const Sci::Position len = end - start;
63 
64 		PLATFORM_ASSERT(len >= 0);
65 		PLATFORM_ASSERT(start + len <= lengthDoc);
66 
67 		int styleStart = 0;
68 		if (start > 0)
69 			styleStart = pdoc->StyleAt(start - 1);
70 
71 		if (len > 0) {
72 			instance->Lex(start, len, styleStart, pdoc);
73 			instance->Fold(start, len, styleStart, pdoc);
74 		}
75 
76 		performingStyle = false;
77 	}
78 }
79 
LineEndTypesSupported()80 int LexInterface::LineEndTypesSupported() {
81 	if (instance) {
82 		return instance->LineEndTypesSupported();
83 	}
84 	return 0;
85 }
86 
ActionDuration(double duration_,double minDuration_,double maxDuration_)87 ActionDuration::ActionDuration(double duration_, double minDuration_, double maxDuration_) noexcept :
88 	duration(duration_), minDuration(minDuration_), maxDuration(maxDuration_) {
89 }
90 
AddSample(size_t numberActions,double durationOfActions)91 void ActionDuration::AddSample(size_t numberActions, double durationOfActions) noexcept {
92 	// Only adjust for multiple actions to avoid instability
93 	if (numberActions < 8)
94 		return;
95 
96 	// Alpha value for exponential smoothing.
97 	// Most recent value contributes 25% to smoothed value.
98 	const double alpha = 0.25;
99 
100 	const double durationOne = durationOfActions / numberActions;
101 	duration = std::clamp(alpha * durationOne + (1.0 - alpha) * duration,
102 		minDuration, maxDuration);
103 }
104 
Duration() const105 double ActionDuration::Duration() const noexcept {
106 	return duration;
107 }
108 
Document(int options)109 Document::Document(int options) :
110 	cb((options & SC_DOCUMENTOPTION_STYLES_NONE) == 0, (options & SC_DOCUMENTOPTION_TEXT_LARGE) != 0),
111 	durationStyleOneLine(0.00001, 0.000001, 0.0001) {
112 	refCount = 0;
113 #ifdef _WIN32
114 	eolMode = SC_EOL_CRLF;
115 #else
116 	eolMode = SC_EOL_LF;
117 #endif
118 	dbcsCodePage = SC_CP_UTF8;
119 	lineEndBitSet = SC_LINE_END_TYPE_DEFAULT;
120 	endStyled = 0;
121 	styleClock = 0;
122 	enteredModification = 0;
123 	enteredStyling = 0;
124 	enteredReadOnlyCount = 0;
125 	insertionSet = false;
126 	tabInChars = 8;
127 	indentInChars = 0;
128 	actualIndentInChars = 8;
129 	useTabs = true;
130 	tabIndents = true;
131 	backspaceUnindents = false;
132 
133 	matchesValid = false;
134 
135 	perLineData[ldMarkers] = std::make_unique<LineMarkers>();
136 	perLineData[ldLevels] = std::make_unique<LineLevels>();
137 	perLineData[ldState] = std::make_unique<LineState>();
138 	perLineData[ldMargin] = std::make_unique<LineAnnotation>();
139 	perLineData[ldAnnotation] = std::make_unique<LineAnnotation>();
140 
141 	decorations = DecorationListCreate(IsLarge());
142 
143 	cb.SetPerLine(this);
144 	cb.SetUTF8Substance(SC_CP_UTF8 == dbcsCodePage);
145 }
146 
~Document()147 Document::~Document() {
148 	for (const WatcherWithUserData &watcher : watchers) {
149 		watcher.watcher->NotifyDeleted(this, watcher.userData);
150 	}
151 }
152 
153 // Increase reference count and return its previous value.
AddRef()154 int Document::AddRef() {
155 	return refCount++;
156 }
157 
158 // Decrease reference count and return its previous value.
159 // Delete the document if reference count reaches zero.
Release()160 int SCI_METHOD Document::Release() {
161 	const int curRefCount = --refCount;
162 	if (curRefCount == 0)
163 		delete this;
164 	return curRefCount;
165 }
166 
Init()167 void Document::Init() {
168 	for (const std::unique_ptr<PerLine> &pl : perLineData) {
169 		if (pl)
170 			pl->Init();
171 	}
172 }
173 
InsertLine(Sci::Line line)174 void Document::InsertLine(Sci::Line line) {
175 	for (const std::unique_ptr<PerLine> &pl : perLineData) {
176 		if (pl)
177 			pl->InsertLine(line);
178 	}
179 }
180 
RemoveLine(Sci::Line line)181 void Document::RemoveLine(Sci::Line line) {
182 	for (const std::unique_ptr<PerLine> &pl : perLineData) {
183 		if (pl)
184 			pl->RemoveLine(line);
185 	}
186 }
187 
Markers() const188 LineMarkers *Document::Markers() const noexcept {
189 	return static_cast<LineMarkers *>(perLineData[ldMarkers].get());
190 }
191 
Levels() const192 LineLevels *Document::Levels() const noexcept {
193 	return static_cast<LineLevels *>(perLineData[ldLevels].get());
194 }
195 
States() const196 LineState *Document::States() const noexcept {
197 	return static_cast<LineState *>(perLineData[ldState].get());
198 }
199 
Margins() const200 LineAnnotation *Document::Margins() const noexcept {
201 	return static_cast<LineAnnotation *>(perLineData[ldMargin].get());
202 }
203 
Annotations() const204 LineAnnotation *Document::Annotations() const noexcept {
205 	return static_cast<LineAnnotation *>(perLineData[ldAnnotation].get());
206 }
207 
LineEndTypesSupported() const208 int Document::LineEndTypesSupported() const {
209 	if ((SC_CP_UTF8 == dbcsCodePage) && pli)
210 		return pli->LineEndTypesSupported();
211 	else
212 		return 0;
213 }
214 
SetDBCSCodePage(int dbcsCodePage_)215 bool Document::SetDBCSCodePage(int dbcsCodePage_) {
216 	if (dbcsCodePage != dbcsCodePage_) {
217 		dbcsCodePage = dbcsCodePage_;
218 		SetCaseFolder(nullptr);
219 		cb.SetLineEndTypes(lineEndBitSet & LineEndTypesSupported());
220 		cb.SetUTF8Substance(SC_CP_UTF8 == dbcsCodePage);
221 		ModifiedAt(0);	// Need to restyle whole document
222 		return true;
223 	} else {
224 		return false;
225 	}
226 }
227 
SetLineEndTypesAllowed(int lineEndBitSet_)228 bool Document::SetLineEndTypesAllowed(int lineEndBitSet_) {
229 	if (lineEndBitSet != lineEndBitSet_) {
230 		lineEndBitSet = lineEndBitSet_;
231 		const int lineEndBitSetActive = lineEndBitSet & LineEndTypesSupported();
232 		if (lineEndBitSetActive != cb.GetLineEndTypes()) {
233 			ModifiedAt(0);
234 			cb.SetLineEndTypes(lineEndBitSetActive);
235 			return true;
236 		} else {
237 			return false;
238 		}
239 	} else {
240 		return false;
241 	}
242 }
243 
SetSavePoint()244 void Document::SetSavePoint() {
245 	cb.SetSavePoint();
246 	NotifySavePoint(true);
247 }
248 
TentativeUndo()249 void Document::TentativeUndo() {
250 	if (!TentativeActive())
251 		return;
252 	CheckReadOnly();
253 	if (enteredModification == 0) {
254 		enteredModification++;
255 		if (!cb.IsReadOnly()) {
256 			const bool startSavePoint = cb.IsSavePoint();
257 			bool multiLine = false;
258 			const int steps = cb.TentativeSteps();
259 			//Platform::DebugPrintf("Steps=%d\n", steps);
260 			for (int step = 0; step < steps; step++) {
261 				const Sci::Line prevLinesTotal = LinesTotal();
262 				const Action &action = cb.GetUndoStep();
263 				if (action.at == removeAction) {
264 					NotifyModified(DocModification(
265 									SC_MOD_BEFOREINSERT | SC_PERFORMED_UNDO, action));
266 				} else if (action.at == containerAction) {
267 					DocModification dm(SC_MOD_CONTAINER | SC_PERFORMED_UNDO);
268 					dm.token = action.position;
269 					NotifyModified(dm);
270 				} else {
271 					NotifyModified(DocModification(
272 									SC_MOD_BEFOREDELETE | SC_PERFORMED_UNDO, action));
273 				}
274 				cb.PerformUndoStep();
275 				if (action.at != containerAction) {
276 					ModifiedAt(action.position);
277 				}
278 
279 				int modFlags = SC_PERFORMED_UNDO;
280 				// With undo, an insertion action becomes a deletion notification
281 				if (action.at == removeAction) {
282 					modFlags |= SC_MOD_INSERTTEXT;
283 				} else if (action.at == insertAction) {
284 					modFlags |= SC_MOD_DELETETEXT;
285 				}
286 				if (steps > 1)
287 					modFlags |= SC_MULTISTEPUNDOREDO;
288 				const Sci::Line linesAdded = LinesTotal() - prevLinesTotal;
289 				if (linesAdded != 0)
290 					multiLine = true;
291 				if (step == steps - 1) {
292 					modFlags |= SC_LASTSTEPINUNDOREDO;
293 					if (multiLine)
294 						modFlags |= SC_MULTILINEUNDOREDO;
295 				}
296 				NotifyModified(DocModification(modFlags, action.position, action.lenData,
297 											   linesAdded, action.data.get()));
298 			}
299 
300 			const bool endSavePoint = cb.IsSavePoint();
301 			if (startSavePoint != endSavePoint)
302 				NotifySavePoint(endSavePoint);
303 
304 			cb.TentativeCommit();
305 		}
306 		enteredModification--;
307 	}
308 }
309 
GetMark(Sci::Line line) const310 int Document::GetMark(Sci::Line line) const noexcept {
311 	return Markers()->MarkValue(line);
312 }
313 
MarkerNext(Sci::Line lineStart,int mask) const314 Sci::Line Document::MarkerNext(Sci::Line lineStart, int mask) const {
315 	return Markers()->MarkerNext(lineStart, mask);
316 }
317 
AddMark(Sci::Line line,int markerNum)318 int Document::AddMark(Sci::Line line, int markerNum) {
319 	if (line >= 0 && line <= LinesTotal()) {
320 		const int prev = Markers()->AddMark(line, markerNum, LinesTotal());
321 		const DocModification mh(SC_MOD_CHANGEMARKER, LineStart(line), 0, 0, nullptr, line);
322 		NotifyModified(mh);
323 		return prev;
324 	} else {
325 		return -1;
326 	}
327 }
328 
AddMarkSet(Sci::Line line,int valueSet)329 void Document::AddMarkSet(Sci::Line line, int valueSet) {
330 	if (line < 0 || line > LinesTotal()) {
331 		return;
332 	}
333 	unsigned int m = valueSet;
334 	for (int i = 0; m; i++, m >>= 1) {
335 		if (m & 1)
336 			Markers()->AddMark(line, i, LinesTotal());
337 	}
338 	const DocModification mh(SC_MOD_CHANGEMARKER, LineStart(line), 0, 0, nullptr, line);
339 	NotifyModified(mh);
340 }
341 
DeleteMark(Sci::Line line,int markerNum)342 void Document::DeleteMark(Sci::Line line, int markerNum) {
343 	Markers()->DeleteMark(line, markerNum, false);
344 	const DocModification mh(SC_MOD_CHANGEMARKER, LineStart(line), 0, 0, nullptr, line);
345 	NotifyModified(mh);
346 }
347 
DeleteMarkFromHandle(int markerHandle)348 void Document::DeleteMarkFromHandle(int markerHandle) {
349 	Markers()->DeleteMarkFromHandle(markerHandle);
350 	DocModification mh(SC_MOD_CHANGEMARKER);
351 	mh.line = -1;
352 	NotifyModified(mh);
353 }
354 
DeleteAllMarks(int markerNum)355 void Document::DeleteAllMarks(int markerNum) {
356 	bool someChanges = false;
357 	for (Sci::Line line = 0; line < LinesTotal(); line++) {
358 		if (Markers()->DeleteMark(line, markerNum, true))
359 			someChanges = true;
360 	}
361 	if (someChanges) {
362 		DocModification mh(SC_MOD_CHANGEMARKER);
363 		mh.line = -1;
364 		NotifyModified(mh);
365 	}
366 }
367 
LineFromHandle(int markerHandle) const368 Sci::Line Document::LineFromHandle(int markerHandle) const {
369 	return Markers()->LineFromHandle(markerHandle);
370 }
371 
LineStart(Sci_Position line) const372 Sci_Position SCI_METHOD Document::LineStart(Sci_Position line) const {
373 	return cb.LineStart(static_cast<Sci::Line>(line));
374 }
375 
IsLineStartPosition(Sci::Position position) const376 bool Document::IsLineStartPosition(Sci::Position position) const {
377 	return LineStart(LineFromPosition(position)) == position;
378 }
379 
LineEnd(Sci_Position line) const380 Sci_Position SCI_METHOD Document::LineEnd(Sci_Position line) const {
381 	if (line >= LinesTotal() - 1) {
382 		return LineStart(line + 1);
383 	} else {
384 		Sci::Position position = LineStart(line + 1);
385 		if (SC_LINE_END_TYPE_UNICODE == cb.GetLineEndTypes()) {
386 			const unsigned char bytes[] = {
387 				cb.UCharAt(position-3),
388 				cb.UCharAt(position-2),
389 				cb.UCharAt(position-1),
390 			};
391 			if (UTF8IsSeparator(bytes)) {
392 				return position - UTF8SeparatorLength;
393 			}
394 			if (UTF8IsNEL(bytes+1)) {
395 				return position - UTF8NELLength;
396 			}
397 		}
398 		position--; // Back over CR or LF
399 		// When line terminator is CR+LF, may need to go back one more
400 		if ((position > LineStart(line)) && (cb.CharAt(position - 1) == '\r')) {
401 			position--;
402 		}
403 		return position;
404 	}
405 }
406 
SetErrorStatus(int status)407 void SCI_METHOD Document::SetErrorStatus(int status) {
408 	// Tell the watchers an error has occurred.
409 	for (const WatcherWithUserData &watcher : watchers) {
410 		watcher.watcher->NotifyErrorOccurred(this, watcher.userData, status);
411 	}
412 }
413 
LineFromPosition(Sci_Position pos) const414 Sci_Position SCI_METHOD Document::LineFromPosition(Sci_Position pos) const {
415 	return cb.LineFromPosition(pos);
416 }
417 
SciLineFromPosition(Sci::Position pos) const418 Sci::Line Document::SciLineFromPosition(Sci::Position pos) const noexcept {
419 	// Avoids casting in callers for this very common function
420 	return cb.LineFromPosition(pos);
421 }
422 
LineEndPosition(Sci::Position position) const423 Sci::Position Document::LineEndPosition(Sci::Position position) const {
424 	return LineEnd(LineFromPosition(position));
425 }
426 
IsLineEndPosition(Sci::Position position) const427 bool Document::IsLineEndPosition(Sci::Position position) const {
428 	return LineEnd(LineFromPosition(position)) == position;
429 }
430 
IsPositionInLineEnd(Sci::Position position) const431 bool Document::IsPositionInLineEnd(Sci::Position position) const {
432 	return position >= LineEnd(LineFromPosition(position));
433 }
434 
VCHomePosition(Sci::Position position) const435 Sci::Position Document::VCHomePosition(Sci::Position position) const {
436 	const Sci::Line line = SciLineFromPosition(position);
437 	const Sci::Position startPosition = LineStart(line);
438 	const Sci::Position endLine = LineEnd(line);
439 	Sci::Position startText = startPosition;
440 	while (startText < endLine && (cb.CharAt(startText) == ' ' || cb.CharAt(startText) == '\t'))
441 		startText++;
442 	if (position == startText)
443 		return startPosition;
444 	else
445 		return startText;
446 }
447 
IndexLineStart(Sci::Line line,int lineCharacterIndex) const448 Sci::Position Document::IndexLineStart(Sci::Line line, int lineCharacterIndex) const noexcept {
449 	return cb.IndexLineStart(line, lineCharacterIndex);
450 }
451 
LineFromPositionIndex(Sci::Position pos,int lineCharacterIndex) const452 Sci::Line Document::LineFromPositionIndex(Sci::Position pos, int lineCharacterIndex) const noexcept {
453 	return cb.LineFromPositionIndex(pos, lineCharacterIndex);
454 }
455 
SetLevel(Sci_Position line,int level)456 int SCI_METHOD Document::SetLevel(Sci_Position line, int level) {
457 	const int prev = Levels()->SetLevel(static_cast<Sci::Line>(line), level, LinesTotal());
458 	if (prev != level) {
459 		DocModification mh(SC_MOD_CHANGEFOLD | SC_MOD_CHANGEMARKER,
460 		                   LineStart(line), 0, 0, nullptr, static_cast<Sci::Line>(line));
461 		mh.foldLevelNow = level;
462 		mh.foldLevelPrev = prev;
463 		NotifyModified(mh);
464 	}
465 	return prev;
466 }
467 
GetLevel(Sci_Position line) const468 int SCI_METHOD Document::GetLevel(Sci_Position line) const {
469 	return Levels()->GetLevel(static_cast<Sci::Line>(line));
470 }
471 
ClearLevels()472 void Document::ClearLevels() {
473 	Levels()->ClearLevels();
474 }
475 
IsSubordinate(int levelStart,int levelTry)476 static bool IsSubordinate(int levelStart, int levelTry) noexcept {
477 	if (levelTry & SC_FOLDLEVELWHITEFLAG)
478 		return true;
479 	else
480 		return LevelNumber(levelStart) < LevelNumber(levelTry);
481 }
482 
GetLastChild(Sci::Line lineParent,int level,Sci::Line lastLine)483 Sci::Line Document::GetLastChild(Sci::Line lineParent, int level, Sci::Line lastLine) {
484 	if (level == -1)
485 		level = LevelNumber(GetLevel(lineParent));
486 	const Sci::Line maxLine = LinesTotal();
487 	const Sci::Line lookLastLine = (lastLine != -1) ? std::min(LinesTotal() - 1, lastLine) : -1;
488 	Sci::Line lineMaxSubord = lineParent;
489 	while (lineMaxSubord < maxLine - 1) {
490 		EnsureStyledTo(LineStart(lineMaxSubord + 2));
491 		if (!IsSubordinate(level, GetLevel(lineMaxSubord + 1)))
492 			break;
493 		if ((lookLastLine != -1) && (lineMaxSubord >= lookLastLine) && !(GetLevel(lineMaxSubord) & SC_FOLDLEVELWHITEFLAG))
494 			break;
495 		lineMaxSubord++;
496 	}
497 	if (lineMaxSubord > lineParent) {
498 		if (level > LevelNumber(GetLevel(lineMaxSubord + 1))) {
499 			// Have chewed up some whitespace that belongs to a parent so seek back
500 			if (GetLevel(lineMaxSubord) & SC_FOLDLEVELWHITEFLAG) {
501 				lineMaxSubord--;
502 			}
503 		}
504 	}
505 	return lineMaxSubord;
506 }
507 
GetFoldParent(Sci::Line line) const508 Sci::Line Document::GetFoldParent(Sci::Line line) const {
509 	const int level = LevelNumber(GetLevel(line));
510 	Sci::Line lineLook = line - 1;
511 	while ((lineLook > 0) && (
512 	            (!(GetLevel(lineLook) & SC_FOLDLEVELHEADERFLAG)) ||
513 	            (LevelNumber(GetLevel(lineLook)) >= level))
514 	      ) {
515 		lineLook--;
516 	}
517 	if ((GetLevel(lineLook) & SC_FOLDLEVELHEADERFLAG) &&
518 	        (LevelNumber(GetLevel(lineLook)) < level)) {
519 		return lineLook;
520 	} else {
521 		return -1;
522 	}
523 }
524 
GetHighlightDelimiters(HighlightDelimiter & highlightDelimiter,Sci::Line line,Sci::Line lastLine)525 void Document::GetHighlightDelimiters(HighlightDelimiter &highlightDelimiter, Sci::Line line, Sci::Line lastLine) {
526 	const int level = GetLevel(line);
527 	const Sci::Line lookLastLine = std::max(line, lastLine) + 1;
528 
529 	Sci::Line lookLine = line;
530 	int lookLineLevel = level;
531 	int lookLineLevelNum = LevelNumber(lookLineLevel);
532 	while ((lookLine > 0) && ((lookLineLevel & SC_FOLDLEVELWHITEFLAG) ||
533 		((lookLineLevel & SC_FOLDLEVELHEADERFLAG) && (lookLineLevelNum >= LevelNumber(GetLevel(lookLine + 1)))))) {
534 		lookLineLevel = GetLevel(--lookLine);
535 		lookLineLevelNum = LevelNumber(lookLineLevel);
536 	}
537 
538 	Sci::Line beginFoldBlock = (lookLineLevel & SC_FOLDLEVELHEADERFLAG) ? lookLine : GetFoldParent(lookLine);
539 	if (beginFoldBlock == -1) {
540 		highlightDelimiter.Clear();
541 		return;
542 	}
543 
544 	Sci::Line endFoldBlock = GetLastChild(beginFoldBlock, -1, lookLastLine);
545 	Sci::Line firstChangeableLineBefore = -1;
546 	if (endFoldBlock < line) {
547 		lookLine = beginFoldBlock - 1;
548 		lookLineLevel = GetLevel(lookLine);
549 		lookLineLevelNum = LevelNumber(lookLineLevel);
550 		while ((lookLine >= 0) && (lookLineLevelNum >= SC_FOLDLEVELBASE)) {
551 			if (lookLineLevel & SC_FOLDLEVELHEADERFLAG) {
552 				if (GetLastChild(lookLine, -1, lookLastLine) == line) {
553 					beginFoldBlock = lookLine;
554 					endFoldBlock = line;
555 					firstChangeableLineBefore = line - 1;
556 				}
557 			}
558 			if ((lookLine > 0) && (lookLineLevelNum == SC_FOLDLEVELBASE) && (LevelNumber(GetLevel(lookLine - 1)) > lookLineLevelNum))
559 				break;
560 			lookLineLevel = GetLevel(--lookLine);
561 			lookLineLevelNum = LevelNumber(lookLineLevel);
562 		}
563 	}
564 	if (firstChangeableLineBefore == -1) {
565 		for (lookLine = line - 1, lookLineLevel = GetLevel(lookLine), lookLineLevelNum = LevelNumber(lookLineLevel);
566 			lookLine >= beginFoldBlock;
567 			lookLineLevel = GetLevel(--lookLine), lookLineLevelNum = LevelNumber(lookLineLevel)) {
568 			if ((lookLineLevel & SC_FOLDLEVELWHITEFLAG) || (lookLineLevelNum > LevelNumber(level))) {
569 				firstChangeableLineBefore = lookLine;
570 				break;
571 			}
572 		}
573 	}
574 	if (firstChangeableLineBefore == -1)
575 		firstChangeableLineBefore = beginFoldBlock - 1;
576 
577 	Sci::Line firstChangeableLineAfter = -1;
578 	for (lookLine = line + 1, lookLineLevel = GetLevel(lookLine), lookLineLevelNum = LevelNumber(lookLineLevel);
579 		lookLine <= endFoldBlock;
580 		lookLineLevel = GetLevel(++lookLine), lookLineLevelNum = LevelNumber(lookLineLevel)) {
581 		if ((lookLineLevel & SC_FOLDLEVELHEADERFLAG) && (lookLineLevelNum < LevelNumber(GetLevel(lookLine + 1)))) {
582 			firstChangeableLineAfter = lookLine;
583 			break;
584 		}
585 	}
586 	if (firstChangeableLineAfter == -1)
587 		firstChangeableLineAfter = endFoldBlock + 1;
588 
589 	highlightDelimiter.beginFoldBlock = beginFoldBlock;
590 	highlightDelimiter.endFoldBlock = endFoldBlock;
591 	highlightDelimiter.firstChangeableLineBefore = firstChangeableLineBefore;
592 	highlightDelimiter.firstChangeableLineAfter = firstChangeableLineAfter;
593 }
594 
ClampPositionIntoDocument(Sci::Position pos) const595 Sci::Position Document::ClampPositionIntoDocument(Sci::Position pos) const noexcept {
596 	return std::clamp<Sci::Position>(pos, 0, LengthNoExcept());
597 }
598 
IsCrLf(Sci::Position pos) const599 bool Document::IsCrLf(Sci::Position pos) const noexcept {
600 	if (pos < 0)
601 		return false;
602 	if (pos >= (LengthNoExcept() - 1))
603 		return false;
604 	return (cb.CharAt(pos) == '\r') && (cb.CharAt(pos + 1) == '\n');
605 }
606 
LenChar(Sci::Position pos) const607 int Document::LenChar(Sci::Position pos) const noexcept {
608 	if (pos < 0 || pos >= LengthNoExcept()) {
609 		// Returning 1 instead of 0 to defend against hanging with a loop that goes (or starts) out of bounds.
610 		return 1;
611 	} else if (IsCrLf(pos)) {
612 		return 2;
613 	}
614 
615 	const unsigned char leadByte = cb.UCharAt(pos);
616 	if (!dbcsCodePage || UTF8IsAscii(leadByte)) {
617 		// Common case: ASCII character
618 		return 1;
619 	}
620 	if (SC_CP_UTF8 == dbcsCodePage) {
621 		const int widthCharBytes = UTF8BytesOfLead[leadByte];
622 		unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
623 		for (int b = 1; b < widthCharBytes; b++) {
624 			charBytes[b] = cb.UCharAt(pos + b);
625 		}
626 		const int utf8status = UTF8Classify(charBytes, widthCharBytes);
627 		if (utf8status & UTF8MaskInvalid) {
628 			// Treat as invalid and use up just one byte
629 			return 1;
630 		} else {
631 			return utf8status & UTF8MaskWidth;
632 		}
633 	} else {
634 		if (IsDBCSLeadByteNoExcept(leadByte) && ((pos + 1) < LengthNoExcept())) {
635 			return 2;
636 		} else {
637 			return 1;
638 		}
639 	}
640 }
641 
InGoodUTF8(Sci::Position pos,Sci::Position & start,Sci::Position & end) const642 bool Document::InGoodUTF8(Sci::Position pos, Sci::Position &start, Sci::Position &end) const noexcept {
643 	Sci::Position trail = pos;
644 	while ((trail>0) && (pos-trail < UTF8MaxBytes) && UTF8IsTrailByte(cb.UCharAt(trail-1)))
645 		trail--;
646 	start = (trail > 0) ? trail-1 : trail;
647 
648 	const unsigned char leadByte = cb.UCharAt(start);
649 	const int widthCharBytes = UTF8BytesOfLead[leadByte];
650 	if (widthCharBytes == 1) {
651 		return false;
652 	} else {
653 		const int trailBytes = widthCharBytes - 1;
654 		const Sci::Position len = pos - start;
655 		if (len > trailBytes)
656 			// pos too far from lead
657 			return false;
658 		unsigned char charBytes[UTF8MaxBytes] = {leadByte,0,0,0};
659 		for (Sci::Position b=1; b<widthCharBytes && ((start+b) < cb.Length()); b++)
660 			charBytes[b] = cb.CharAt(start+b);
661 		const int utf8status = UTF8Classify(charBytes, widthCharBytes);
662 		if (utf8status & UTF8MaskInvalid)
663 			return false;
664 		end = start + widthCharBytes;
665 		return true;
666 	}
667 }
668 
669 // Normalise a position so that it is not halfway through a two byte character.
670 // This can occur in two situations -
671 // When lines are terminated with \r\n pairs which should be treated as one character.
672 // When displaying DBCS text such as Japanese.
673 // If moving, move the position in the indicated direction.
MovePositionOutsideChar(Sci::Position pos,Sci::Position moveDir,bool checkLineEnd) const674 Sci::Position Document::MovePositionOutsideChar(Sci::Position pos, Sci::Position moveDir, bool checkLineEnd) const noexcept {
675 	//Platform::DebugPrintf("NoCRLF %d %d\n", pos, moveDir);
676 	// If out of range, just return minimum/maximum value.
677 	if (pos <= 0)
678 		return 0;
679 	if (pos >= LengthNoExcept())
680 		return LengthNoExcept();
681 
682 	// PLATFORM_ASSERT(pos > 0 && pos < LengthNoExcept());
683 	if (checkLineEnd && IsCrLf(pos - 1)) {
684 		if (moveDir > 0)
685 			return pos + 1;
686 		else
687 			return pos - 1;
688 	}
689 
690 	if (dbcsCodePage) {
691 		if (SC_CP_UTF8 == dbcsCodePage) {
692 			const unsigned char ch = cb.UCharAt(pos);
693 			// If ch is not a trail byte then pos is valid intercharacter position
694 			if (UTF8IsTrailByte(ch)) {
695 				Sci::Position startUTF = pos;
696 				Sci::Position endUTF = pos;
697 				if (InGoodUTF8(pos, startUTF, endUTF)) {
698 					// ch is a trail byte within a UTF-8 character
699 					if (moveDir > 0)
700 						pos = endUTF;
701 					else
702 						pos = startUTF;
703 				}
704 				// Else invalid UTF-8 so return position of isolated trail byte
705 			}
706 		} else {
707 			// Anchor DBCS calculations at start of line because start of line can
708 			// not be a DBCS trail byte.
709 			const Sci::Position posStartLine = cb.LineStart(cb.LineFromPosition(pos));
710 			if (pos == posStartLine)
711 				return pos;
712 
713 			// Step back until a non-lead-byte is found.
714 			Sci::Position posCheck = pos;
715 			while ((posCheck > posStartLine) && IsDBCSLeadByteNoExcept(cb.CharAt(posCheck-1)))
716 				posCheck--;
717 
718 			// Check from known start of character.
719 			while (posCheck < pos) {
720 				const int mbsize = IsDBCSLeadByteNoExcept(cb.CharAt(posCheck)) ? 2 : 1;
721 				if (posCheck + mbsize == pos) {
722 					return pos;
723 				} else if (posCheck + mbsize > pos) {
724 					if (moveDir > 0) {
725 						return posCheck + mbsize;
726 					} else {
727 						return posCheck;
728 					}
729 				}
730 				posCheck += mbsize;
731 			}
732 		}
733 	}
734 
735 	return pos;
736 }
737 
738 // NextPosition moves between valid positions - it can not handle a position in the middle of a
739 // multi-byte character. It is used to iterate through text more efficiently than MovePositionOutsideChar.
740 // A \r\n pair is treated as two characters.
NextPosition(Sci::Position pos,int moveDir) const741 Sci::Position Document::NextPosition(Sci::Position pos, int moveDir) const noexcept {
742 	// If out of range, just return minimum/maximum value.
743 	const int increment = (moveDir > 0) ? 1 : -1;
744 	if (pos + increment <= 0)
745 		return 0;
746 	if (pos + increment >= cb.Length())
747 		return cb.Length();
748 
749 	if (dbcsCodePage) {
750 		if (SC_CP_UTF8 == dbcsCodePage) {
751 			if (increment == 1) {
752 				// Simple forward movement case so can avoid some checks
753 				const unsigned char leadByte = cb.UCharAt(pos);
754 				if (UTF8IsAscii(leadByte)) {
755 					// Single byte character or invalid
756 					pos++;
757 				} else {
758 					const int widthCharBytes = UTF8BytesOfLead[leadByte];
759 					unsigned char charBytes[UTF8MaxBytes] = {leadByte,0,0,0};
760 					for (int b=1; b<widthCharBytes; b++)
761 						charBytes[b] = cb.CharAt(pos+b);
762 					const int utf8status = UTF8Classify(charBytes, widthCharBytes);
763 					if (utf8status & UTF8MaskInvalid)
764 						pos++;
765 					else
766 						pos += utf8status & UTF8MaskWidth;
767 				}
768 			} else {
769 				// Examine byte before position
770 				pos--;
771 				const unsigned char ch = cb.UCharAt(pos);
772 				// If ch is not a trail byte then pos is valid intercharacter position
773 				if (UTF8IsTrailByte(ch)) {
774 					// If ch is a trail byte in a valid UTF-8 character then return start of character
775 					Sci::Position startUTF = pos;
776 					Sci::Position endUTF = pos;
777 					if (InGoodUTF8(pos, startUTF, endUTF)) {
778 						pos = startUTF;
779 					}
780 					// Else invalid UTF-8 so return position of isolated trail byte
781 				}
782 			}
783 		} else {
784 			if (moveDir > 0) {
785 				const int mbsize = IsDBCSLeadByteNoExcept(cb.CharAt(pos)) ? 2 : 1;
786 				pos += mbsize;
787 				if (pos > cb.Length())
788 					pos = cb.Length();
789 			} else {
790 				// Anchor DBCS calculations at start of line because start of line can
791 				// not be a DBCS trail byte.
792 				const Sci::Position posStartLine = cb.LineStart(cb.LineFromPosition(pos));
793 				// See http://msdn.microsoft.com/en-us/library/cc194792%28v=MSDN.10%29.aspx
794 				// http://msdn.microsoft.com/en-us/library/cc194790.aspx
795 				if ((pos - 1) <= posStartLine) {
796 					return pos - 1;
797 				} else if (IsDBCSLeadByteNoExcept(cb.CharAt(pos - 1))) {
798 					// Must actually be trail byte
799 					return pos - 2;
800 				} else {
801 					// Otherwise, step back until a non-lead-byte is found.
802 					Sci::Position posTemp = pos - 1;
803 					while (posStartLine <= --posTemp && IsDBCSLeadByteNoExcept(cb.CharAt(posTemp)))
804 						;
805 					// Now posTemp+1 must point to the beginning of a character,
806 					// so figure out whether we went back an even or an odd
807 					// number of bytes and go back 1 or 2 bytes, respectively.
808 					return (pos - 1 - ((pos - posTemp) & 1));
809 				}
810 			}
811 		}
812 	} else {
813 		pos += increment;
814 	}
815 
816 	return pos;
817 }
818 
NextCharacter(Sci::Position & pos,int moveDir) const819 bool Document::NextCharacter(Sci::Position &pos, int moveDir) const noexcept {
820 	// Returns true if pos changed
821 	Sci::Position posNext = NextPosition(pos, moveDir);
822 	if (posNext == pos) {
823 		return false;
824 	} else {
825 		pos = posNext;
826 		return true;
827 	}
828 }
829 
CharacterAfter(Sci::Position position) const830 Document::CharacterExtracted Document::CharacterAfter(Sci::Position position) const noexcept {
831 	if (position >= LengthNoExcept()) {
832 		return CharacterExtracted(unicodeReplacementChar, 0);
833 	}
834 	const unsigned char leadByte = cb.UCharAt(position);
835 	if (!dbcsCodePage || UTF8IsAscii(leadByte)) {
836 		// Common case: ASCII character
837 		return CharacterExtracted(leadByte, 1);
838 	}
839 	if (SC_CP_UTF8 == dbcsCodePage) {
840 		const int widthCharBytes = UTF8BytesOfLead[leadByte];
841 		unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
842 		for (int b = 1; b<widthCharBytes; b++)
843 			charBytes[b] = cb.UCharAt(position + b);
844 		const int utf8status = UTF8Classify(charBytes, widthCharBytes);
845 		if (utf8status & UTF8MaskInvalid) {
846 			// Treat as invalid and use up just one byte
847 			return CharacterExtracted(unicodeReplacementChar, 1);
848 		} else {
849 			return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth);
850 		}
851 	} else {
852 		if (IsDBCSLeadByteNoExcept(leadByte) && ((position + 1) < LengthNoExcept())) {
853 			return CharacterExtracted::DBCS(leadByte, cb.UCharAt(position + 1));
854 		} else {
855 			return CharacterExtracted(leadByte, 1);
856 		}
857 	}
858 }
859 
CharacterBefore(Sci::Position position) const860 Document::CharacterExtracted Document::CharacterBefore(Sci::Position position) const noexcept {
861 	if (position <= 0) {
862 		return CharacterExtracted(unicodeReplacementChar, 0);
863 	}
864 	const unsigned char previousByte = cb.UCharAt(position - 1);
865 	if (0 == dbcsCodePage) {
866 		return CharacterExtracted(previousByte, 1);
867 	}
868 	if (SC_CP_UTF8 == dbcsCodePage) {
869 		if (UTF8IsAscii(previousByte)) {
870 			return CharacterExtracted(previousByte, 1);
871 		}
872 		position--;
873 		// If previousByte is not a trail byte then its invalid
874 		if (UTF8IsTrailByte(previousByte)) {
875 			// If previousByte is a trail byte in a valid UTF-8 character then find start of character
876 			Sci::Position startUTF = position;
877 			Sci::Position endUTF = position;
878 			if (InGoodUTF8(position, startUTF, endUTF)) {
879 				const int widthCharBytes = static_cast<int>(endUTF - startUTF);
880 				unsigned char charBytes[UTF8MaxBytes] = { 0, 0, 0, 0 };
881 				for (int b = 0; b<widthCharBytes; b++)
882 					charBytes[b] = cb.UCharAt(startUTF + b);
883 				const int utf8status = UTF8Classify(charBytes, widthCharBytes);
884 				if (utf8status & UTF8MaskInvalid) {
885 					// Treat as invalid and use up just one byte
886 					return CharacterExtracted(unicodeReplacementChar, 1);
887 				} else {
888 					return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth);
889 				}
890 			}
891 			// Else invalid UTF-8 so return position of isolated trail byte
892 		}
893 		return CharacterExtracted(unicodeReplacementChar, 1);
894 	} else {
895 		// Moving backwards in DBCS is complex so use NextPosition
896 		const Sci::Position posStartCharacter = NextPosition(position, -1);
897 		return CharacterAfter(posStartCharacter);
898 	}
899 }
900 
901 // Return -1  on out-of-bounds
GetRelativePosition(Sci_Position positionStart,Sci_Position characterOffset) const902 Sci_Position SCI_METHOD Document::GetRelativePosition(Sci_Position positionStart, Sci_Position characterOffset) const {
903 	Sci::Position pos = positionStart;
904 	if (dbcsCodePage) {
905 		const int increment = (characterOffset > 0) ? 1 : -1;
906 		while (characterOffset != 0) {
907 			const Sci::Position posNext = NextPosition(pos, increment);
908 			if (posNext == pos)
909 				return INVALID_POSITION;
910 			pos = posNext;
911 			characterOffset -= increment;
912 		}
913 	} else {
914 		pos = positionStart + characterOffset;
915 		if ((pos < 0) || (pos > Length()))
916 			return INVALID_POSITION;
917 	}
918 	return pos;
919 }
920 
GetRelativePositionUTF16(Sci::Position positionStart,Sci::Position characterOffset) const921 Sci::Position Document::GetRelativePositionUTF16(Sci::Position positionStart, Sci::Position characterOffset) const noexcept {
922 	Sci::Position pos = positionStart;
923 	if (dbcsCodePage) {
924 		const int increment = (characterOffset > 0) ? 1 : -1;
925 		while (characterOffset != 0) {
926 			const Sci::Position posNext = NextPosition(pos, increment);
927 			if (posNext == pos)
928 				return INVALID_POSITION;
929 			if (std::abs(pos-posNext) > 3)	// 4 byte character = 2*UTF16.
930 				characterOffset -= increment;
931 			pos = posNext;
932 			characterOffset -= increment;
933 		}
934 	} else {
935 		pos = positionStart + characterOffset;
936 		if ((pos < 0) || (pos > LengthNoExcept()))
937 			return INVALID_POSITION;
938 	}
939 	return pos;
940 }
941 
GetCharacterAndWidth(Sci_Position position,Sci_Position * pWidth) const942 int SCI_METHOD Document::GetCharacterAndWidth(Sci_Position position, Sci_Position *pWidth) const {
943 	int character;
944 	int bytesInCharacter = 1;
945 	const unsigned char leadByte = cb.UCharAt(position);
946 	if (dbcsCodePage) {
947 		if (SC_CP_UTF8 == dbcsCodePage) {
948 			if (UTF8IsAscii(leadByte)) {
949 				// Single byte character or invalid
950 				character =  leadByte;
951 			} else {
952 				const int widthCharBytes = UTF8BytesOfLead[leadByte];
953 				unsigned char charBytes[UTF8MaxBytes] = {leadByte,0,0,0};
954 				for (int b=1; b<widthCharBytes; b++)
955 					charBytes[b] = cb.UCharAt(position+b);
956 				const int utf8status = UTF8Classify(charBytes, widthCharBytes);
957 				if (utf8status & UTF8MaskInvalid) {
958 					// Report as singleton surrogate values which are invalid Unicode
959 					character =  0xDC80 + leadByte;
960 				} else {
961 					bytesInCharacter = utf8status & UTF8MaskWidth;
962 					character = UnicodeFromUTF8(charBytes);
963 				}
964 			}
965 		} else {
966 			if (IsDBCSLeadByteNoExcept(leadByte)) {
967 				bytesInCharacter = 2;
968 				character = (leadByte << 8) | cb.UCharAt(position+1);
969 			} else {
970 				character = leadByte;
971 			}
972 		}
973 	} else {
974 		character = leadByte;
975 	}
976 	if (pWidth) {
977 		*pWidth = bytesInCharacter;
978 	}
979 	return character;
980 }
981 
// Return the current value of dbcsCodePage.
// Elsewhere in this file 0 is treated as a single byte encoding and
// SC_CP_UTF8 as UTF-8, with other values handled as DBCS code pages.
int SCI_METHOD Document::CodePage() const {
	return dbcsCodePage;
}
985 
// Report whether ch is a DBCS lead byte in the current code page.
// Simply forwards to IsDBCSLeadByteNoExcept.
bool SCI_METHOD Document::IsDBCSLeadByte(char ch) const {
	// Used by lexers so must match IDocument method exactly
	return IsDBCSLeadByteNoExcept(ch);
}
990 
IsDBCSLeadByteNoExcept(char ch) const991 bool Document::IsDBCSLeadByteNoExcept(char ch) const noexcept {
992 	// Used inside core Scintilla
993 	// Byte ranges found in Wikipedia articles with relevant search strings in each case
994 	const unsigned char uch = ch;
995 	switch (dbcsCodePage) {
996 		case 932:
997 			// Shift_jis
998 			return ((uch >= 0x81) && (uch <= 0x9F)) ||
999 				((uch >= 0xE0) && (uch <= 0xFC));
1000 				// Lead bytes F0 to FC may be a Microsoft addition.
1001 		case 936:
1002 			// GBK
1003 			return (uch >= 0x81) && (uch <= 0xFE);
1004 		case 949:
1005 			// Korean Wansung KS C-5601-1987
1006 			return (uch >= 0x81) && (uch <= 0xFE);
1007 		case 950:
1008 			// Big5
1009 			return (uch >= 0x81) && (uch <= 0xFE);
1010 		case 1361:
1011 			// Korean Johab KS C-5601-1992
1012 			return
1013 				((uch >= 0x84) && (uch <= 0xD3)) ||
1014 				((uch >= 0xD8) && (uch <= 0xDE)) ||
1015 				((uch >= 0xE0) && (uch <= 0xF9));
1016 	}
1017 	return false;
1018 }
1019 
IsDBCSLeadByteInvalid(char ch) const1020 bool Document::IsDBCSLeadByteInvalid(char ch) const noexcept {
1021 	const unsigned char lead = ch;
1022 	switch (dbcsCodePage) {
1023 	case 932:
1024 		// Shift_jis
1025 		return
1026 			(lead == 0x85) ||
1027 			(lead == 0x86) ||
1028 			(lead == 0xEB) ||
1029 			(lead == 0xEC) ||
1030 			(lead == 0xEF) ||
1031 			(lead == 0xFA) ||
1032 			(lead == 0xFB) ||
1033 			(lead == 0xFC);
1034 	case 936:
1035 		// GBK
1036 		return (lead == 0x80) || (lead == 0xFF);
1037 	case 949:
1038 		// Korean Wansung KS C-5601-1987
1039 		return (lead == 0x80) || (lead == 0xC9) || (lead >= 0xFE);
1040 	case 950:
1041 		// Big5
1042 		return
1043 			((lead >= 0x80) && (lead <= 0xA0)) ||
1044 			(lead == 0xC8) ||
1045 			(lead >= 0xFA);
1046 	case 1361:
1047 		// Korean Johab KS C-5601-1992
1048 		return
1049 			((lead >= 0x80) && (lead <= 0x83)) ||
1050 			((lead >= 0xD4) && (lead <= 0xD8)) ||
1051 			(lead == 0xDF) ||
1052 			(lead >= 0xFA);
1053 	}
1054 	return false;
1055 }
1056 
IsDBCSTrailByteInvalid(char ch) const1057 bool Document::IsDBCSTrailByteInvalid(char ch) const noexcept {
1058 	const unsigned char trail = ch;
1059 	switch (dbcsCodePage) {
1060 	case 932:
1061 		// Shift_jis
1062 		return
1063 			(trail <= 0x3F) ||
1064 			(trail == 0x7F) ||
1065 			(trail >= 0xFD);
1066 	case 936:
1067 		// GBK
1068 		return
1069 			(trail <= 0x3F) ||
1070 			(trail == 0x7F) ||
1071 			(trail == 0xFF);
1072 	case 949:
1073 		// Korean Wansung KS C-5601-1987
1074 		return
1075 			(trail <= 0x40) ||
1076 			((trail >= 0x5B) && (trail <= 0x60)) ||
1077 			((trail >= 0x7B) && (trail <= 0x80)) ||
1078 			(trail == 0xFF);
1079 	case 950:
1080 		// Big5
1081 		return
1082 			(trail <= 0x3F) ||
1083 			((trail >= 0x7F) && (trail <= 0xA0)) ||
1084 			(trail == 0xFF);
1085 	case 1361:
1086 		// Korean Johab KS C-5601-1992
1087 		return
1088 			(trail <= 0x30) ||
1089 			(trail == 0x7F) ||
1090 			(trail == 0x80) ||
1091 			(trail == 0xFF);
1092 	}
1093 	return false;
1094 }
1095 
DBCSDrawBytes(std::string_view text) const1096 int Document::DBCSDrawBytes(std::string_view text) const noexcept {
1097 	if (text.length() <= 1) {
1098 		return static_cast<int>(text.length());
1099 	}
1100 	if (IsDBCSLeadByteNoExcept(text[0])) {
1101 		return IsDBCSTrailByteInvalid(text[1]) ? 1 : 2;
1102 	} else {
1103 		return 1;
1104 	}
1105 }
1106 
// True only for the two horizontal white space characters: space and tab.
static constexpr bool IsSpaceOrTab(int ch) noexcept {
	return (ch == '\t') || (ch == ' ');
}
1110 
1111 // Need to break text into segments near lengthSegment but taking into
1112 // account the encoding to not break inside a UTF-8 or DBCS character
1113 // and also trying to avoid breaking inside a pair of combining characters.
1114 // The segment length must always be long enough (more than 4 bytes)
1115 // so that there will be at least one whole character to make a segment.
1116 // For UTF-8, text must consist only of valid whole characters.
1117 // In preference order from best to worst:
1118 //   1) Break after space
1119 //   2) Break before punctuation
1120 //   3) Break after whole character
1121 
SafeSegment(const char * text,int length,int lengthSegment) const1122 int Document::SafeSegment(const char *text, int length, int lengthSegment) const noexcept {
1123 	if (length <= lengthSegment)
1124 		return length;
1125 	int lastSpaceBreak = -1;
1126 	int lastPunctuationBreak = -1;
1127 	int lastEncodingAllowedBreak = 0;
1128 	for (int j=0; j < lengthSegment;) {
1129 		const unsigned char ch = text[j];
1130 		if (j > 0) {
1131 			if (IsSpaceOrTab(text[j - 1]) && !IsSpaceOrTab(text[j])) {
1132 				lastSpaceBreak = j;
1133 			}
1134 			if (ch < 'A') {
1135 				lastPunctuationBreak = j;
1136 			}
1137 		}
1138 		lastEncodingAllowedBreak = j;
1139 
1140 		if (dbcsCodePage == SC_CP_UTF8) {
1141 			j += UTF8BytesOfLead[ch];
1142 		} else if (dbcsCodePage) {
1143 			j += IsDBCSLeadByteNoExcept(ch) ? 2 : 1;
1144 		} else {
1145 			j++;
1146 		}
1147 	}
1148 	if (lastSpaceBreak >= 0) {
1149 		return lastSpaceBreak;
1150 	} else if (lastPunctuationBreak >= 0) {
1151 		return lastPunctuationBreak;
1152 	}
1153 	return lastEncodingAllowedBreak;
1154 }
1155 
CodePageFamily() const1156 EncodingFamily Document::CodePageFamily() const noexcept {
1157 	if (SC_CP_UTF8 == dbcsCodePage)
1158 		return efUnicode;
1159 	else if (dbcsCodePage)
1160 		return efDBCS;
1161 	else
1162 		return efEightBit;
1163 }
1164 
ModifiedAt(Sci::Position pos)1165 void Document::ModifiedAt(Sci::Position pos) noexcept {
1166 	if (endStyled > pos)
1167 		endStyled = pos;
1168 }
1169 
CheckReadOnly()1170 void Document::CheckReadOnly() {
1171 	if (cb.IsReadOnly() && enteredReadOnlyCount == 0) {
1172 		enteredReadOnlyCount++;
1173 		NotifyModifyAttempt();
1174 		enteredReadOnlyCount--;
1175 	}
1176 }
1177 
1178 // Document only modified by gateways DeleteChars, InsertString, Undo, Redo, and SetStyleAt.
1179 // SetStyleAt does not change the persistent state of a document
1180 
DeleteChars(Sci::Position pos,Sci::Position len)1181 bool Document::DeleteChars(Sci::Position pos, Sci::Position len) {
1182 	if (pos < 0)
1183 		return false;
1184 	if (len <= 0)
1185 		return false;
1186 	if ((pos + len) > LengthNoExcept())
1187 		return false;
1188 	CheckReadOnly();
1189 	if (enteredModification != 0) {
1190 		return false;
1191 	} else {
1192 		enteredModification++;
1193 		if (!cb.IsReadOnly()) {
1194 			NotifyModified(
1195 			    DocModification(
1196 			        SC_MOD_BEFOREDELETE | SC_PERFORMED_USER,
1197 			        pos, len,
1198 			        0, 0));
1199 			const Sci::Line prevLinesTotal = LinesTotal();
1200 			const bool startSavePoint = cb.IsSavePoint();
1201 			bool startSequence = false;
1202 			const char *text = cb.DeleteChars(pos, len, startSequence);
1203 			if (startSavePoint && cb.IsCollectingUndo())
1204 				NotifySavePoint(!startSavePoint);
1205 			if ((pos < LengthNoExcept()) || (pos == 0))
1206 				ModifiedAt(pos);
1207 			else
1208 				ModifiedAt(pos-1);
1209 			NotifyModified(
1210 			    DocModification(
1211 			        SC_MOD_DELETETEXT | SC_PERFORMED_USER | (startSequence?SC_STARTACTION:0),
1212 			        pos, len,
1213 			        LinesTotal() - prevLinesTotal, text));
1214 		}
1215 		enteredModification--;
1216 	}
1217 	return !cb.IsReadOnly();
1218 }
1219 
1220 /**
1221  * Insert a string with a length.
1222  */
InsertString(Sci::Position position,const char * s,Sci::Position insertLength)1223 Sci::Position Document::InsertString(Sci::Position position, const char *s, Sci::Position insertLength) {
1224 	if (insertLength <= 0) {
1225 		return 0;
1226 	}
1227 	CheckReadOnly();	// Application may change read only state here
1228 	if (cb.IsReadOnly()) {
1229 		return 0;
1230 	}
1231 	if (enteredModification != 0) {
1232 		return 0;
1233 	}
1234 	enteredModification++;
1235 	insertionSet = false;
1236 	insertion.clear();
1237 	NotifyModified(
1238 		DocModification(
1239 			SC_MOD_INSERTCHECK,
1240 			position, insertLength,
1241 			0, s));
1242 	if (insertionSet) {
1243 		s = insertion.c_str();
1244 		insertLength = insertion.length();
1245 	}
1246 	NotifyModified(
1247 		DocModification(
1248 			SC_MOD_BEFOREINSERT | SC_PERFORMED_USER,
1249 			position, insertLength,
1250 			0, s));
1251 	const Sci::Line prevLinesTotal = LinesTotal();
1252 	const bool startSavePoint = cb.IsSavePoint();
1253 	bool startSequence = false;
1254 	const char *text = cb.InsertString(position, s, insertLength, startSequence);
1255 	if (startSavePoint && cb.IsCollectingUndo())
1256 		NotifySavePoint(!startSavePoint);
1257 	ModifiedAt(position);
1258 	NotifyModified(
1259 		DocModification(
1260 			SC_MOD_INSERTTEXT | SC_PERFORMED_USER | (startSequence?SC_STARTACTION:0),
1261 			position, insertLength,
1262 			LinesTotal() - prevLinesTotal, text));
1263 	if (insertionSet) {	// Free memory as could be large
1264 		std::string().swap(insertion);
1265 	}
1266 	enteredModification--;
1267 	return insertLength;
1268 }
1269 
// Store length bytes of s as replacement text and flag that a replacement exists.
// InsertString checks insertionSet after its SC_MOD_INSERTCHECK notification and,
// when set, inserts the stored text instead of its own argument.
void Document::ChangeInsertion(const char *s, Sci::Position length) {
	insertionSet = true;
	insertion.assign(s, length);
}
1274 
AddData(const char * data,Sci_Position length)1275 int SCI_METHOD Document::AddData(const char *data, Sci_Position length) {
1276 	try {
1277 		const Sci::Position position = Length();
1278 		InsertString(position, data, length);
1279 	} catch (std::bad_alloc &) {
1280 		return SC_STATUS_BADALLOC;
1281 	} catch (...) {
1282 		return SC_STATUS_FAILURE;
1283 	}
1284 	return 0;
1285 }
1286 
// Return this object as an untyped pointer.
void * SCI_METHOD Document::ConvertToDocument() {
	return this;
}
1290 
// Undo the most recent group of actions recorded in the cell buffer.
// Each step is bracketed by a "before" notification and a notification of the
// change that was performed, flagged with SC_PERFORMED_UNDO.
// Returns a position derived from the last text action undone (see the
// coalescing notes below) or -1 when no text action was undone.
// Nothing happens when a modification is already in progress, undo is not
// being collected or the buffer is read-only.
Sci::Position Document::Undo() {
	Sci::Position newPos = -1;
	CheckReadOnly();
	if ((enteredModification == 0) && (cb.IsCollectingUndo())) {
		enteredModification++;
		if (!cb.IsReadOnly()) {
			const bool startSavePoint = cb.IsSavePoint();
			bool multiLine = false;
			const int steps = cb.StartUndo();
			//Platform::DebugPrintf("Steps=%d\n", steps);
			// Track runs of adjacent removals being undone so that the returned
			// position lands after the whole run of restored text.
			Sci::Position coalescedRemovePos = -1;
			Sci::Position coalescedRemoveLen = 0;
			Sci::Position prevRemoveActionPos = -1;
			Sci::Position prevRemoveActionLen = 0;
			for (int step = 0; step < steps; step++) {
				const Sci::Line prevLinesTotal = LinesTotal();
				const Action &action = cb.GetUndoStep();
				// Announce what is about to happen: undoing a removal inserts text,
				// undoing an insertion deletes text, container actions are passed on.
				if (action.at == removeAction) {
					NotifyModified(DocModification(
									SC_MOD_BEFOREINSERT | SC_PERFORMED_UNDO, action));
				} else if (action.at == containerAction) {
					DocModification dm(SC_MOD_CONTAINER | SC_PERFORMED_UNDO);
					dm.token = action.position;
					NotifyModified(dm);
					// A container action that may not coalesce ends any run of removals
					if (!action.mayCoalesce) {
						coalescedRemovePos = -1;
						coalescedRemoveLen = 0;
						prevRemoveActionPos = -1;
						prevRemoveActionLen = 0;
					}
				} else {
					NotifyModified(DocModification(
									SC_MOD_BEFOREDELETE | SC_PERFORMED_UNDO, action));
				}
				cb.PerformUndoStep();
				if (action.at != containerAction) {
					ModifiedAt(action.position);
					newPos = action.position;
				}

				int modFlags = SC_PERFORMED_UNDO;
				// With undo, an insertion action becomes a deletion notification
				if (action.at == removeAction) {
					// Text was restored so the position moves to its end
					newPos += action.lenData;
					modFlags |= SC_MOD_INSERTTEXT;
					// Extend the current run when this removal is at the same position
					// as the previous one or directly after it; otherwise start a new run.
					if ((coalescedRemoveLen > 0) &&
						(action.position == prevRemoveActionPos || action.position == (prevRemoveActionPos + prevRemoveActionLen))) {
						coalescedRemoveLen += action.lenData;
						newPos = coalescedRemovePos + coalescedRemoveLen;
					} else {
						coalescedRemovePos = action.position;
						coalescedRemoveLen = action.lenData;
					}
					prevRemoveActionPos = action.position;
					prevRemoveActionLen = action.lenData;
				} else if (action.at == insertAction) {
					modFlags |= SC_MOD_DELETETEXT;
					// Undoing an insertion ends any run of removals
					coalescedRemovePos = -1;
					coalescedRemoveLen = 0;
					prevRemoveActionPos = -1;
					prevRemoveActionLen = 0;
				}
				if (steps > 1)
					modFlags |= SC_MULTISTEPUNDOREDO;
				const Sci::Line linesAdded = LinesTotal() - prevLinesTotal;
				if (linesAdded != 0)
					multiLine = true;
				// The final step carries the summary flags for the whole group
				if (step == steps - 1) {
					modFlags |= SC_LASTSTEPINUNDOREDO;
					if (multiLine)
						modFlags |= SC_MULTILINEUNDOREDO;
				}
				NotifyModified(DocModification(modFlags, action.position, action.lenData,
											   linesAdded, action.data.get()));
			}

			// Report a change of save point state caused by the whole group
			const bool endSavePoint = cb.IsSavePoint();
			if (startSavePoint != endSavePoint)
				NotifySavePoint(endSavePoint);
		}
		enteredModification--;
	}
	return newPos;
}
1375 
// Redo the next group of actions recorded in the cell buffer.
// Each step is bracketed by a "before" notification and a notification of the
// change that was performed, flagged with SC_PERFORMED_REDO.
// Returns the position of the last text action redone, moved to the end of the
// text when that action was an insertion, or -1 when no text action was redone.
// Nothing happens when a modification is already in progress, undo is not
// being collected or the buffer is read-only.
Sci::Position Document::Redo() {
	Sci::Position newPos = -1;
	CheckReadOnly();
	if ((enteredModification == 0) && (cb.IsCollectingUndo())) {
		enteredModification++;
		if (!cb.IsReadOnly()) {
			const bool startSavePoint = cb.IsSavePoint();
			bool multiLine = false;
			const int steps = cb.StartRedo();
			for (int step = 0; step < steps; step++) {
				const Sci::Line prevLinesTotal = LinesTotal();
				const Action &action = cb.GetRedoStep();
				// Announce what is about to happen; container actions are passed on
				if (action.at == insertAction) {
					NotifyModified(DocModification(
									SC_MOD_BEFOREINSERT | SC_PERFORMED_REDO, action));
				} else if (action.at == containerAction) {
					DocModification dm(SC_MOD_CONTAINER | SC_PERFORMED_REDO);
					dm.token = action.position;
					NotifyModified(dm);
				} else {
					NotifyModified(DocModification(
									SC_MOD_BEFOREDELETE | SC_PERFORMED_REDO, action));
				}
				cb.PerformRedoStep();
				if (action.at != containerAction) {
					ModifiedAt(action.position);
					newPos = action.position;
				}

				int modFlags = SC_PERFORMED_REDO;
				if (action.at == insertAction) {
					// Text was inserted again so the position moves to its end
					newPos += action.lenData;
					modFlags |= SC_MOD_INSERTTEXT;
				} else if (action.at == removeAction) {
					modFlags |= SC_MOD_DELETETEXT;
				}
				if (steps > 1)
					modFlags |= SC_MULTISTEPUNDOREDO;
				const Sci::Line linesAdded = LinesTotal() - prevLinesTotal;
				if (linesAdded != 0)
					multiLine = true;
				// The final step carries the summary flags for the whole group
				if (step == steps - 1) {
					modFlags |= SC_LASTSTEPINUNDOREDO;
					if (multiLine)
						modFlags |= SC_MULTILINEUNDOREDO;
				}
				NotifyModified(
					DocModification(modFlags, action.position, action.lenData,
									linesAdded, action.data.get()));
			}

			// Report a change of save point state caused by the whole group
			const bool endSavePoint = cb.IsSavePoint();
			if (startSavePoint != endSavePoint)
				NotifySavePoint(endSavePoint);
		}
		enteredModification--;
	}
	return newPos;
}
1435 
DelChar(Sci::Position pos)1436 void Document::DelChar(Sci::Position pos) {
1437 	DeleteChars(pos, LenChar(pos));
1438 }
1439 
DelCharBack(Sci::Position pos)1440 void Document::DelCharBack(Sci::Position pos) {
1441 	if (pos <= 0) {
1442 		return;
1443 	} else if (IsCrLf(pos - 2)) {
1444 		DeleteChars(pos - 2, 2);
1445 	} else if (dbcsCodePage) {
1446 		const Sci::Position startChar = NextPosition(pos, -1);
1447 		DeleteChars(startChar, pos - startChar);
1448 	} else {
1449 		DeleteChars(pos - 1, 1);
1450 	}
1451 }
1452 
NextTab(Sci::Position pos,Sci::Position tabSize)1453 static constexpr Sci::Position NextTab(Sci::Position pos, Sci::Position tabSize) noexcept {
1454 	return ((pos / tabSize) + 1) * tabSize;
1455 }
1456 
CreateIndentation(Sci::Position indent,int tabSize,bool insertSpaces)1457 static std::string CreateIndentation(Sci::Position indent, int tabSize, bool insertSpaces) {
1458 	std::string indentation;
1459 	if (!insertSpaces) {
1460 		while (indent >= tabSize) {
1461 			indentation += '\t';
1462 			indent -= tabSize;
1463 		}
1464 	}
1465 	while (indent > 0) {
1466 		indentation += ' ';
1467 		indent--;
1468 	}
1469 	return indentation;
1470 }
1471 
GetLineIndentation(Sci_Position line)1472 int SCI_METHOD Document::GetLineIndentation(Sci_Position line) {
1473 	int indent = 0;
1474 	if ((line >= 0) && (line < LinesTotal())) {
1475 		const Sci::Position lineStart = LineStart(line);
1476 		const Sci::Position length = Length();
1477 		for (Sci::Position i = lineStart; i < length; i++) {
1478 			const char ch = cb.CharAt(i);
1479 			if (ch == ' ')
1480 				indent++;
1481 			else if (ch == '\t')
1482 				indent = static_cast<int>(NextTab(indent, tabInChars));
1483 			else
1484 				return indent;
1485 		}
1486 	}
1487 	return indent;
1488 }
1489 
SetLineIndentation(Sci::Line line,Sci::Position indent)1490 Sci::Position Document::SetLineIndentation(Sci::Line line, Sci::Position indent) {
1491 	const int indentOfLine = GetLineIndentation(line);
1492 	if (indent < 0)
1493 		indent = 0;
1494 	if (indent != indentOfLine) {
1495 		std::string linebuf = CreateIndentation(indent, tabInChars, !useTabs);
1496 		const Sci::Position thisLineStart = LineStart(line);
1497 		const Sci::Position indentPos = GetLineIndentPosition(line);
1498 		UndoGroup ug(this);
1499 		DeleteChars(thisLineStart, indentPos - thisLineStart);
1500 		return thisLineStart + InsertString(thisLineStart, linebuf.c_str(),
1501 			linebuf.length());
1502 	} else {
1503 		return GetLineIndentPosition(line);
1504 	}
1505 }
1506 
GetLineIndentPosition(Sci::Line line) const1507 Sci::Position Document::GetLineIndentPosition(Sci::Line line) const {
1508 	if (line < 0)
1509 		return 0;
1510 	Sci::Position pos = LineStart(line);
1511 	const Sci::Position length = Length();
1512 	while ((pos < length) && IsSpaceOrTab(cb.CharAt(pos))) {
1513 		pos++;
1514 	}
1515 	return pos;
1516 }
1517 
GetColumn(Sci::Position pos)1518 Sci::Position Document::GetColumn(Sci::Position pos) {
1519 	Sci::Position column = 0;
1520 	const Sci::Line line = SciLineFromPosition(pos);
1521 	if ((line >= 0) && (line < LinesTotal())) {
1522 		for (Sci::Position i = LineStart(line); i < pos;) {
1523 			const char ch = cb.CharAt(i);
1524 			if (ch == '\t') {
1525 				column = NextTab(column, tabInChars);
1526 				i++;
1527 			} else if (ch == '\r') {
1528 				return column;
1529 			} else if (ch == '\n') {
1530 				return column;
1531 			} else if (i >= Length()) {
1532 				return column;
1533 			} else {
1534 				column++;
1535 				i = NextPosition(i, 1);
1536 			}
1537 		}
1538 	}
1539 	return column;
1540 }
1541 
CountCharacters(Sci::Position startPos,Sci::Position endPos) const1542 Sci::Position Document::CountCharacters(Sci::Position startPos, Sci::Position endPos) const noexcept {
1543 	startPos = MovePositionOutsideChar(startPos, 1, false);
1544 	endPos = MovePositionOutsideChar(endPos, -1, false);
1545 	Sci::Position count = 0;
1546 	Sci::Position i = startPos;
1547 	while (i < endPos) {
1548 		count++;
1549 		i = NextPosition(i, 1);
1550 	}
1551 	return count;
1552 }
1553 
CountUTF16(Sci::Position startPos,Sci::Position endPos) const1554 Sci::Position Document::CountUTF16(Sci::Position startPos, Sci::Position endPos) const noexcept {
1555 	startPos = MovePositionOutsideChar(startPos, 1, false);
1556 	endPos = MovePositionOutsideChar(endPos, -1, false);
1557 	Sci::Position count = 0;
1558 	Sci::Position i = startPos;
1559 	while (i < endPos) {
1560 		count++;
1561 		const Sci::Position next = NextPosition(i, 1);
1562 		if ((next - i) > 3)
1563 			count++;
1564 		i = next;
1565 	}
1566 	return count;
1567 }
1568 
FindColumn(Sci::Line line,Sci::Position column)1569 Sci::Position Document::FindColumn(Sci::Line line, Sci::Position column) {
1570 	Sci::Position position = LineStart(line);
1571 	if ((line >= 0) && (line < LinesTotal())) {
1572 		Sci::Position columnCurrent = 0;
1573 		while ((columnCurrent < column) && (position < Length())) {
1574 			const char ch = cb.CharAt(position);
1575 			if (ch == '\t') {
1576 				columnCurrent = NextTab(columnCurrent, tabInChars);
1577 				if (columnCurrent > column)
1578 					return position;
1579 				position++;
1580 			} else if (ch == '\r') {
1581 				return position;
1582 			} else if (ch == '\n') {
1583 				return position;
1584 			} else {
1585 				columnCurrent++;
1586 				position = NextPosition(position, 1);
1587 			}
1588 		}
1589 	}
1590 	return position;
1591 }
1592 
Indent(bool forwards,Sci::Line lineBottom,Sci::Line lineTop)1593 void Document::Indent(bool forwards, Sci::Line lineBottom, Sci::Line lineTop) {
1594 	// Dedent - suck white space off the front of the line to dedent by equivalent of a tab
1595 	for (Sci::Line line = lineBottom; line >= lineTop; line--) {
1596 		const Sci::Position indentOfLine = GetLineIndentation(line);
1597 		if (forwards) {
1598 			if (LineStart(line) < LineEnd(line)) {
1599 				SetLineIndentation(line, indentOfLine + IndentSize());
1600 			}
1601 		} else {
1602 			SetLineIndentation(line, indentOfLine - IndentSize());
1603 		}
1604 	}
1605 }
1606 
1607 // Convert line endings for a piece of text to a particular mode.
1608 // Stop at len or when a NUL is found.
TransformLineEnds(const char * s,size_t len,int eolModeWanted)1609 std::string Document::TransformLineEnds(const char *s, size_t len, int eolModeWanted) {
1610 	std::string dest;
1611 	for (size_t i = 0; (i < len) && (s[i]); i++) {
1612 		if (s[i] == '\n' || s[i] == '\r') {
1613 			if (eolModeWanted == SC_EOL_CR) {
1614 				dest.push_back('\r');
1615 			} else if (eolModeWanted == SC_EOL_LF) {
1616 				dest.push_back('\n');
1617 			} else { // eolModeWanted == SC_EOL_CRLF
1618 				dest.push_back('\r');
1619 				dest.push_back('\n');
1620 			}
1621 			if ((s[i] == '\r') && (i+1 < len) && (s[i+1] == '\n')) {
1622 				i++;
1623 			}
1624 		} else {
1625 			dest.push_back(s[i]);
1626 		}
1627 	}
1628 	return dest;
1629 }
1630 
// Rewrite every line end in the document to the form selected by eolModeSet
// (SC_EOL_CR, SC_EOL_LF or SC_EOL_CRLF). All edits are made inside one undo group.
// The document is edited while it is scanned, so pos is adjusted after each
// edit to sit on the last byte of the converted line end before the loop's pos++.
void Document::ConvertLineEnds(int eolModeSet) {
	UndoGroup ug(this);

	for (Sci::Position pos = 0; pos < Length(); pos++) {
		if (cb.CharAt(pos) == '\r') {
			if (cb.CharAt(pos + 1) == '\n') {
				// CRLF
				if (eolModeSet == SC_EOL_CR) {
					DeleteChars(pos + 1, 1); // Delete the LF
				} else if (eolModeSet == SC_EOL_LF) {
					DeleteChars(pos, 1); // Delete the CR
				} else {
					// Already CRLF: step over the LF
					pos++;
				}
			} else {
				// CR
				if (eolModeSet == SC_EOL_CRLF) {
					pos += InsertString(pos + 1, "\n", 1); // Insert LF
				} else if (eolModeSet == SC_EOL_LF) {
					// Insert before deleting so the line end is never absent
					pos += InsertString(pos, "\n", 1); // Insert LF
					DeleteChars(pos, 1); // Delete CR
					pos--;
				}
			}
		} else if (cb.CharAt(pos) == '\n') {
			// LF
			if (eolModeSet == SC_EOL_CRLF) {
				pos += InsertString(pos, "\r", 1); // Insert CR
			} else if (eolModeSet == SC_EOL_CR) {
				// Insert before deleting so the line end is never absent
				pos += InsertString(pos, "\r", 1); // Insert CR
				DeleteChars(pos, 1); // Delete LF
				pos--;
			}
		}
	}

}
1668 
Options() const1669 int Document::Options() const noexcept {
1670 	return (IsLarge() ? SC_DOCUMENTOPTION_TEXT_LARGE : 0) |
1671 		(cb.HasStyles() ? 0 : SC_DOCUMENTOPTION_STYLES_NONE);
1672 }
1673 
IsWhiteLine(Sci::Line line) const1674 bool Document::IsWhiteLine(Sci::Line line) const {
1675 	Sci::Position currentChar = LineStart(line);
1676 	const Sci::Position endLine = LineEnd(line);
1677 	while (currentChar < endLine) {
1678 		if (!IsSpaceOrTab(cb.CharAt(currentChar))) {
1679 			return false;
1680 		}
1681 		++currentChar;
1682 	}
1683 	return true;
1684 }
1685 
ParaUp(Sci::Position pos) const1686 Sci::Position Document::ParaUp(Sci::Position pos) const {
1687 	Sci::Line line = SciLineFromPosition(pos);
1688 	line--;
1689 	while (line >= 0 && IsWhiteLine(line)) { // skip empty lines
1690 		line--;
1691 	}
1692 	while (line >= 0 && !IsWhiteLine(line)) { // skip non-empty lines
1693 		line--;
1694 	}
1695 	line++;
1696 	return LineStart(line);
1697 }
1698 
ParaDown(Sci::Position pos) const1699 Sci::Position Document::ParaDown(Sci::Position pos) const {
1700 	Sci::Line line = SciLineFromPosition(pos);
1701 	while (line < LinesTotal() && !IsWhiteLine(line)) { // skip non-empty lines
1702 		line++;
1703 	}
1704 	while (line < LinesTotal() && IsWhiteLine(line)) { // skip empty lines
1705 		line++;
1706 	}
1707 	if (line < LinesTotal())
1708 		return LineStart(line);
1709 	else // end of a document
1710 		return LineEnd(line-1);
1711 }
1712 
WordCharacterClass(unsigned int ch) const1713 CharClassify::cc Document::WordCharacterClass(unsigned int ch) const {
1714 	if (dbcsCodePage && (!UTF8IsAscii(ch))) {
1715 		if (SC_CP_UTF8 == dbcsCodePage) {
1716 			// Use hard coded Unicode class
1717 			const CharacterCategory cc = charMap.CategoryFor(ch);
1718 			switch (cc) {
1719 
1720 				// Separator, Line/Paragraph
1721 			case ccZl:
1722 			case ccZp:
1723 				return CharClassify::ccNewLine;
1724 
1725 				// Separator, Space
1726 			case ccZs:
1727 				// Other
1728 			case ccCc:
1729 			case ccCf:
1730 			case ccCs:
1731 			case ccCo:
1732 			case ccCn:
1733 				return CharClassify::ccSpace;
1734 
1735 				// Letter
1736 			case ccLu:
1737 			case ccLl:
1738 			case ccLt:
1739 			case ccLm:
1740 			case ccLo:
1741 				// Number
1742 			case ccNd:
1743 			case ccNl:
1744 			case ccNo:
1745 				// Mark - includes combining diacritics
1746 			case ccMn:
1747 			case ccMc:
1748 			case ccMe:
1749 				return CharClassify::ccWord;
1750 
1751 				// Punctuation
1752 			case ccPc:
1753 			case ccPd:
1754 			case ccPs:
1755 			case ccPe:
1756 			case ccPi:
1757 			case ccPf:
1758 			case ccPo:
1759 				// Symbol
1760 			case ccSm:
1761 			case ccSc:
1762 			case ccSk:
1763 			case ccSo:
1764 				return CharClassify::ccPunctuation;
1765 
1766 			}
1767 		} else {
1768 			// Asian DBCS
1769 			return CharClassify::ccWord;
1770 		}
1771 	}
1772 	return charClass.GetClass(static_cast<unsigned char>(ch));
1773 }
1774 
1775 /**
1776  * Used by commmands that want to select whole words.
1777  * Finds the start of word at pos when delta < 0 or the end of the word when delta >= 0.
1778  */
ExtendWordSelect(Sci::Position pos,int delta,bool onlyWordCharacters) const1779 Sci::Position Document::ExtendWordSelect(Sci::Position pos, int delta, bool onlyWordCharacters) const {
1780 	CharClassify::cc ccStart = CharClassify::ccWord;
1781 	if (delta < 0) {
1782 		if (!onlyWordCharacters) {
1783 			const CharacterExtracted ce = CharacterBefore(pos);
1784 			ccStart = WordCharacterClass(ce.character);
1785 		}
1786 		while (pos > 0) {
1787 			const CharacterExtracted ce = CharacterBefore(pos);
1788 			if (WordCharacterClass(ce.character) != ccStart)
1789 				break;
1790 			pos -= ce.widthBytes;
1791 		}
1792 	} else {
1793 		if (!onlyWordCharacters && pos < LengthNoExcept()) {
1794 			const CharacterExtracted ce = CharacterAfter(pos);
1795 			ccStart = WordCharacterClass(ce.character);
1796 		}
1797 		while (pos < LengthNoExcept()) {
1798 			const CharacterExtracted ce = CharacterAfter(pos);
1799 			if (WordCharacterClass(ce.character) != ccStart)
1800 				break;
1801 			pos += ce.widthBytes;
1802 		}
1803 	}
1804 	return MovePositionOutsideChar(pos, delta, true);
1805 }
1806 
1807 /**
1808  * Find the start of the next word in either a forward (delta >= 0) or backwards direction
1809  * (delta < 0).
1810  * This is looking for a transition between character classes although there is also some
1811  * additional movement to transit white space.
1812  * Used by cursor movement by word commands.
1813  */
NextWordStart(Sci::Position pos,int delta) const1814 Sci::Position Document::NextWordStart(Sci::Position pos, int delta) const {
1815 	if (delta < 0) {
1816 		while (pos > 0) {
1817 			const CharacterExtracted ce = CharacterBefore(pos);
1818 			if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
1819 				break;
1820 			pos -= ce.widthBytes;
1821 		}
1822 		if (pos > 0) {
1823 			CharacterExtracted ce = CharacterBefore(pos);
1824 			const CharClassify::cc ccStart = WordCharacterClass(ce.character);
1825 			while (pos > 0) {
1826 				ce = CharacterBefore(pos);
1827 				if (WordCharacterClass(ce.character) != ccStart)
1828 					break;
1829 				pos -= ce.widthBytes;
1830 			}
1831 		}
1832 	} else {
1833 		CharacterExtracted ce = CharacterAfter(pos);
1834 		const CharClassify::cc ccStart = WordCharacterClass(ce.character);
1835 		while (pos < LengthNoExcept()) {
1836 			ce = CharacterAfter(pos);
1837 			if (WordCharacterClass(ce.character) != ccStart)
1838 				break;
1839 			pos += ce.widthBytes;
1840 		}
1841 		while (pos < LengthNoExcept()) {
1842 			ce = CharacterAfter(pos);
1843 			if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
1844 				break;
1845 			pos += ce.widthBytes;
1846 		}
1847 	}
1848 	return pos;
1849 }
1850 
1851 /**
1852  * Find the end of the next word in either a forward (delta >= 0) or backwards direction
1853  * (delta < 0).
1854  * This is looking for a transition between character classes although there is also some
1855  * additional movement to transit white space.
1856  * Used by cursor movement by word commands.
1857  */
NextWordEnd(Sci::Position pos,int delta) const1858 Sci::Position Document::NextWordEnd(Sci::Position pos, int delta) const {
1859 	if (delta < 0) {
1860 		if (pos > 0) {
1861 			CharacterExtracted ce = CharacterBefore(pos);
1862 			const CharClassify::cc ccStart = WordCharacterClass(ce.character);
1863 			if (ccStart != CharClassify::ccSpace) {
1864 				while (pos > 0) {
1865 					ce = CharacterBefore(pos);
1866 					if (WordCharacterClass(ce.character) != ccStart)
1867 						break;
1868 					pos -= ce.widthBytes;
1869 				}
1870 			}
1871 			while (pos > 0) {
1872 				ce = CharacterBefore(pos);
1873 				if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
1874 					break;
1875 				pos -= ce.widthBytes;
1876 			}
1877 		}
1878 	} else {
1879 		while (pos < LengthNoExcept()) {
1880 			const CharacterExtracted ce = CharacterAfter(pos);
1881 			if (WordCharacterClass(ce.character) != CharClassify::ccSpace)
1882 				break;
1883 			pos += ce.widthBytes;
1884 		}
1885 		if (pos < LengthNoExcept()) {
1886 			CharacterExtracted ce = CharacterAfter(pos);
1887 			const CharClassify::cc ccStart = WordCharacterClass(ce.character);
1888 			while (pos < LengthNoExcept()) {
1889 				ce = CharacterAfter(pos);
1890 				if (WordCharacterClass(ce.character) != ccStart)
1891 					break;
1892 				pos += ce.widthBytes;
1893 			}
1894 		}
1895 	}
1896 	return pos;
1897 }
1898 
1899 /**
1900  * Check that the character at the given position is a word or punctuation character and that
1901  * the previous character is of a different character class.
1902  */
IsWordStartAt(Sci::Position pos) const1903 bool Document::IsWordStartAt(Sci::Position pos) const {
1904 	if (pos >= LengthNoExcept())
1905 		return false;
1906 	if (pos > 0) {
1907 		const CharacterExtracted cePos = CharacterAfter(pos);
1908 		const CharClassify::cc ccPos = WordCharacterClass(cePos.character);
1909 		const CharacterExtracted cePrev = CharacterBefore(pos);
1910 		const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);
1911 		return (ccPos == CharClassify::ccWord || ccPos == CharClassify::ccPunctuation) &&
1912 			(ccPos != ccPrev);
1913 	}
1914 	return true;
1915 }
1916 
1917 /**
1918  * Check that the character at the given position is a word or punctuation character and that
1919  * the next character is of a different character class.
1920  */
IsWordEndAt(Sci::Position pos) const1921 bool Document::IsWordEndAt(Sci::Position pos) const {
1922 	if (pos <= 0)
1923 		return false;
1924 	if (pos < LengthNoExcept()) {
1925 		const CharacterExtracted cePos = CharacterAfter(pos);
1926 		const CharClassify::cc ccPos = WordCharacterClass(cePos.character);
1927 		const CharacterExtracted cePrev = CharacterBefore(pos);
1928 		const CharClassify::cc ccPrev = WordCharacterClass(cePrev.character);
1929 		return (ccPrev == CharClassify::ccWord || ccPrev == CharClassify::ccPunctuation) &&
1930 			(ccPrev != ccPos);
1931 	}
1932 	return true;
1933 }
1934 
1935 /**
1936  * Check that the given range is has transitions between character classes at both
1937  * ends and where the characters on the inside are word or punctuation characters.
1938  */
IsWordAt(Sci::Position start,Sci::Position end) const1939 bool Document::IsWordAt(Sci::Position start, Sci::Position end) const {
1940 	return (start < end) && IsWordStartAt(start) && IsWordEndAt(end);
1941 }
1942 
MatchesWordOptions(bool word,bool wordStart,Sci::Position pos,Sci::Position length) const1943 bool Document::MatchesWordOptions(bool word, bool wordStart, Sci::Position pos, Sci::Position length) const {
1944 	return (!word && !wordStart) ||
1945 			(word && IsWordAt(pos, pos + length)) ||
1946 			(wordStart && IsWordStartAt(pos));
1947 }
1948 
/**
 * Report whether a case folder is currently set (pcf is not null).
 */
bool Document::HasCaseFolder() const noexcept {
	return pcf != nullptr;
}
1952 
/**
 * Replace the current case folder with pcf_, which may be null.
 * NOTE(review): pcf is reset() here, so it is presumably an owning smart pointer that
 * takes ownership of pcf_ and releases the previous folder - confirm against Document.h.
 */
void Document::SetCaseFolder(CaseFolder *pcf_) noexcept {
	pcf.reset(pcf_);
}
1956 
ExtractCharacter(Sci::Position position) const1957 Document::CharacterExtracted Document::ExtractCharacter(Sci::Position position) const noexcept {
1958 	const unsigned char leadByte = cb.UCharAt(position);
1959 	if (UTF8IsAscii(leadByte)) {
1960 		// Common case: ASCII character
1961 		return CharacterExtracted(leadByte, 1);
1962 	}
1963 	const int widthCharBytes = UTF8BytesOfLead[leadByte];
1964 	unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
1965 	for (int b=1; b<widthCharBytes; b++)
1966 		charBytes[b] = cb.UCharAt(position + b);
1967 	const int utf8status = UTF8Classify(charBytes, widthCharBytes);
1968 	if (utf8status & UTF8MaskInvalid) {
1969 		// Treat as invalid and use up just one byte
1970 		return CharacterExtracted(unicodeReplacementChar, 1);
1971 	} else {
1972 		return CharacterExtracted(UnicodeFromUTF8(charBytes), utf8status & UTF8MaskWidth);
1973 	}
1974 }
1975 
/**
 * Find text in document, supporting both forward and backward
 * searches (just pass minPos > maxPos to do a backward search).
 * Has not been tested with backwards DBCS searches yet.
 *
 * @param minPos Position at which the search starts.
 * @param maxPos Position at which the search stops; less than minPos for a backward search.
 * @param search Bytes to look for, or the pattern when SCFIND_REGEXP is set.
 * @param flags Combination of SCFIND_MATCHCASE, SCFIND_WHOLEWORD, SCFIND_WORDSTART and
 *  SCFIND_REGEXP. The whole value is also handed to the regular expression searcher.
 * @param length On entry the number of bytes in search. The case-insensitive UTF-8 and
 *  DBCS branches overwrite it with the number of document bytes that matched; the regular
 *  expression searcher receives the pointer as well.
 * @return minPos when *length <= 0, the result of the regular expression searcher when
 *  SCFIND_REGEXP is set, otherwise the position of the match or -1 when nothing matched.
 */
Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, const char *search,
                        int flags, Sci::Position *length) {
	// Nothing to search for: report the starting position.
	if (*length <= 0)
		return minPos;
	const bool caseSensitive = (flags & SCFIND_MATCHCASE) != 0;
	const bool word = (flags & SCFIND_WHOLEWORD) != 0;
	const bool wordStart = (flags & SCFIND_WORDSTART) != 0;
	const bool regExp = (flags & SCFIND_REGEXP) != 0;
	if (regExp) {
		// The regular expression searcher is created on first use and then kept.
		if (!regex)
			regex = std::unique_ptr<RegexSearchBase>(CreateRegexSearch(&charClass));
		return regex->FindText(this, minPos, maxPos, search, caseSensitive, word, wordStart, flags, length);
	} else {

		const bool forward = minPos <= maxPos;
		const int increment = forward ? 1 : -1;

		// Range endpoints should not be inside DBCS characters, but just in case, move them.
		const Sci::Position startPos = MovePositionOutsideChar(minPos, increment, false);
		const Sci::Position endPos = MovePositionOutsideChar(maxPos, increment, false);

		// Compute actual search ranges needed
		const Sci::Position lengthFind = *length;

		//Platform::DebugPrintf("Find %d %d %s %d\n", startPos, endPos, ft->lpstrText, lengthFind);
		// Upper bound of the range whichever direction is searched; a match may not extend past it.
		const Sci::Position limitPos = std::max(startPos, endPos);
		Sci::Position pos = startPos;
		if (!forward) {
			// Back all of a character
			pos = NextPosition(pos, increment);
		}
		if (caseSensitive) {
			// Byte-for-byte comparison, no folding needed.
			// A forward search can stop once fewer than lengthFind bytes remain before endPos.
			const Sci::Position endSearch = (startPos <= endPos) ? endPos - lengthFind + 1 : endPos;
			const char charStartSearch =  search[0];
			while (forward ? (pos < endSearch) : (pos >= endSearch)) {
				// Only compare the remaining bytes when the first byte matches.
				if (CharAt(pos) == charStartSearch) {
					bool found = (pos + lengthFind) <= limitPos;
					for (int indexSearch = 1; (indexSearch < lengthFind) && found; indexSearch++) {
						found = CharAt(pos + indexSearch) == search[indexSearch];
					}
					if (found && MatchesWordOptions(word, wordStart, pos, lengthFind)) {
						return pos;
					}
				}
				// Stop when NextCharacter reports that pos could not be moved.
				if (!NextCharacter(pos, increment))
					break;
			}
		} else if (SC_CP_UTF8 == dbcsCodePage) {
			// Case-insensitive UTF-8: fold the search text once, then fold each document
			// character as it is visited and compare the folded bytes.
			const size_t maxFoldingExpansion = 4;
			std::vector<char> searchThing((lengthFind+1) * UTF8MaxBytes * maxFoldingExpansion + 1);
			const size_t lenSearch =
				pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind);
			char bytes[UTF8MaxBytes + 1] = "";
			char folded[UTF8MaxBytes * maxFoldingExpansion + 1] = "";
			while (forward ? (pos < endPos) : (pos >= endPos)) {
				// Width of the character at pos, used to step a forward search.
				int widthFirstCharacter = 0;
				Sci::Position posIndexDocument = pos;
				size_t indexSearch = 0;
				bool characterMatches = true;
				for (;;) {
					// Collect the bytes of the document character at posIndexDocument.
					const unsigned char leadByte = cb.UCharAt(posIndexDocument);
					bytes[0] = leadByte;
					int widthChar = 1;
					if (!UTF8IsAscii(leadByte)) {
						const int widthCharBytes = UTF8BytesOfLead[leadByte];
						for (int b=1; b<widthCharBytes; b++) {
							bytes[b] = cb.CharAt(posIndexDocument+b);
						}
						widthChar = UTF8Classify(reinterpret_cast<const unsigned char *>(bytes), widthCharBytes) & UTF8MaskWidth;
					}
					if (!widthFirstCharacter)
						widthFirstCharacter = widthChar;
					// The candidate would run past the search range.
					if ((posIndexDocument + widthChar) > limitPos)
						break;
					const size_t lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar);
					// memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing
					assert((indexSearch + lenFlat) <= searchThing.size());
					// Does folded match the buffer
					characterMatches = 0 == memcmp(folded, &searchThing[0] + indexSearch, lenFlat);
					if (!characterMatches)
						break;
					posIndexDocument += widthChar;
					indexSearch += lenFlat;
					// All of the folded search text has been consumed.
					if (indexSearch >= lenSearch)
						break;
				}
				if (characterMatches && (indexSearch == lenSearch)) {
					if (MatchesWordOptions(word, wordStart, pos, posIndexDocument - pos)) {
						// Report the number of document bytes matched, which may differ from the search length.
						*length = posIndexDocument - pos;
						return pos;
					}
				}
				if (forward) {
					pos += widthFirstCharacter;
				} else {
					if (!NextCharacter(pos, increment))
						break;
				}
			}
		} else if (dbcsCodePage) {
			// Case-insensitive DBCS: same approach with characters of one or two bytes.
			const size_t maxBytesCharacter = 2;
			const size_t maxFoldingExpansion = 4;
			std::vector<char> searchThing((lengthFind+1) * maxBytesCharacter * maxFoldingExpansion + 1);
			const size_t lenSearch = pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind);
			while (forward ? (pos < endPos) : (pos >= endPos)) {
				Sci::Position indexDocument = 0;
				size_t indexSearch = 0;
				bool characterMatches = true;
				while (characterMatches &&
					((pos + indexDocument) < limitPos) &&
					(indexSearch < lenSearch)) {
					char bytes[maxBytesCharacter + 1];
					bytes[0] = cb.CharAt(pos + indexDocument);
					const Sci::Position widthChar = IsDBCSLeadByteNoExcept(bytes[0]) ? 2 : 1;
					if (widthChar == 2)
						bytes[1] = cb.CharAt(pos + indexDocument + 1);
					// The candidate would run past the search range.
					if ((pos + indexDocument + widthChar) > limitPos)
						break;
					char folded[maxBytesCharacter * maxFoldingExpansion + 1];
					const size_t lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar);
					// memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing
					assert((indexSearch + lenFlat) <= searchThing.size());
					// Does folded match the buffer
					characterMatches = 0 == memcmp(folded, &searchThing[0] + indexSearch, lenFlat);
					indexDocument += widthChar;
					indexSearch += lenFlat;
				}
				if (characterMatches && (indexSearch == lenSearch)) {
					if (MatchesWordOptions(word, wordStart, pos, indexDocument)) {
						// Report the number of document bytes matched.
						*length = indexDocument;
						return pos;
					}
				}
				if (!NextCharacter(pos, increment))
					break;
			}
		} else {
			// Case-insensitive single byte encoding: fold one byte at a time and compare
			// the first folded byte against the folded search text.
			const Sci::Position endSearch = (startPos <= endPos) ? endPos - lengthFind + 1 : endPos;
			std::vector<char> searchThing(lengthFind + 1);
			pcf->Fold(&searchThing[0], searchThing.size(), search, lengthFind);
			while (forward ? (pos < endSearch) : (pos >= endSearch)) {
				bool found = (pos + lengthFind) <= limitPos;
				for (int indexSearch = 0; (indexSearch < lengthFind) && found; indexSearch++) {
					const char ch = CharAt(pos + indexSearch);
					char folded[2];
					pcf->Fold(folded, sizeof(folded), &ch, 1);
					found = folded[0] == searchThing[indexSearch];
				}
				if (found && MatchesWordOptions(word, wordStart, pos, lengthFind)) {
					return pos;
				}
				if (!NextCharacter(pos, increment))
					break;
			}
		}
	}
	//Platform::DebugPrintf("Not found\n");
	return -1;
}
2140 
SubstituteByPosition(const char * text,Sci::Position * length)2141 const char *Document::SubstituteByPosition(const char *text, Sci::Position *length) {
2142 	if (regex)
2143 		return regex->SubstituteByPosition(this, text, length);
2144 	else
2145 		return nullptr;
2146 }
2147 
/**
 * Return the line character index value reported by the cell buffer.
 */
int Document::LineCharacterIndex() const noexcept {
	return cb.LineCharacterIndex();
}
2151 
AllocateLineCharacterIndex(int lineCharacterIndex)2152 void Document::AllocateLineCharacterIndex(int lineCharacterIndex) {
2153 	return cb.AllocateLineCharacterIndex(lineCharacterIndex);
2154 }
2155 
ReleaseLineCharacterIndex(int lineCharacterIndex)2156 void Document::ReleaseLineCharacterIndex(int lineCharacterIndex) {
2157 	return cb.ReleaseLineCharacterIndex(lineCharacterIndex);
2158 }
2159 
/**
 * Return the number of lines reported by the cell buffer.
 */
Sci::Line Document::LinesTotal() const noexcept {
	return cb.Lines();
}
2163 
/**
 * Reset the document's character classification table to its defaults.
 * includeWordClass is forwarded unchanged to CharClassify::SetDefaultCharClasses.
 */
void Document::SetDefaultCharClasses(bool includeWordClass) {
    charClass.SetDefaultCharClasses(includeWordClass);
}
2167 
/**
 * Forward chars and newCharClass to the document's character classification table.
 * NOTE(review): chars is presumably a NUL terminated list of the characters to assign to
 * newCharClass - confirm against CharClassify::SetCharClasses.
 */
void Document::SetCharClasses(const unsigned char *chars, CharClassify::cc newCharClass) {
    charClass.SetCharClasses(chars, newCharClass);
}
2171 
/**
 * Forward to CharClassify::GetCharsOfClass and return its result.
 * NOTE(review): buffer presumably receives the characters belonging to characterClass and
 * the return value their count - confirm against CharClassify::GetCharsOfClass.
 */
int Document::GetCharsOfClass(CharClassify::cc characterClass, unsigned char *buffer) const {
    return charClass.GetCharsOfClass(characterClass, buffer);
}
2175 
/**
 * Pass countCharacters to the character category map's Optimize method.
 */
void Document::SetCharacterCategoryOptimization(int countCharacters) {
	charMap.Optimize(countCharacters);
}
2179 
/**
 * Return the size reported by the character category map.
 */
int Document::CharacterCategoryOptimization() const noexcept {
	return charMap.Size();
}
2183 
/**
 * Set the position from which subsequent SetStyleFor and SetStyles calls apply styles.
 */
void SCI_METHOD Document::StartStyling(Sci_Position position) {
	endStyled = position;
}
2187 
SetStyleFor(Sci_Position length,char style)2188 bool SCI_METHOD Document::SetStyleFor(Sci_Position length, char style) {
2189 	if (enteredStyling != 0) {
2190 		return false;
2191 	} else {
2192 		enteredStyling++;
2193 		const Sci::Position prevEndStyled = endStyled;
2194 		if (cb.SetStyleFor(endStyled, length, style)) {
2195 			const DocModification mh(SC_MOD_CHANGESTYLE | SC_PERFORMED_USER,
2196 			                   prevEndStyled, length);
2197 			NotifyModified(mh);
2198 		}
2199 		endStyled += length;
2200 		enteredStyling--;
2201 		return true;
2202 	}
2203 }
2204 
SetStyles(Sci_Position length,const char * styles)2205 bool SCI_METHOD Document::SetStyles(Sci_Position length, const char *styles) {
2206 	if (enteredStyling != 0) {
2207 		return false;
2208 	} else {
2209 		enteredStyling++;
2210 		bool didChange = false;
2211 		Sci::Position startMod = 0;
2212 		Sci::Position endMod = 0;
2213 		for (int iPos = 0; iPos < length; iPos++, endStyled++) {
2214 			PLATFORM_ASSERT(endStyled < Length());
2215 			if (cb.SetStyleAt(endStyled, styles[iPos])) {
2216 				if (!didChange) {
2217 					startMod = endStyled;
2218 				}
2219 				didChange = true;
2220 				endMod = endStyled;
2221 			}
2222 		}
2223 		if (didChange) {
2224 			const DocModification mh(SC_MOD_CHANGESTYLE | SC_PERFORMED_USER,
2225 			                   startMod, endMod - startMod + 1);
2226 			NotifyModified(mh);
2227 		}
2228 		enteredStyling--;
2229 		return true;
2230 	}
2231 }
2232 
EnsureStyledTo(Sci::Position pos)2233 void Document::EnsureStyledTo(Sci::Position pos) {
2234 	if ((enteredStyling == 0) && (pos > GetEndStyled())) {
2235 		IncrementStyleClock();
2236 		if (pli && !pli->UseContainerLexing()) {
2237 			const Sci::Line lineEndStyled = SciLineFromPosition(GetEndStyled());
2238 			const Sci::Position endStyledTo = LineStart(lineEndStyled);
2239 			pli->Colourise(endStyledTo, pos);
2240 		} else {
2241 			// Ask the watchers to style, and stop as soon as one responds.
2242 			for (std::vector<WatcherWithUserData>::iterator it = watchers.begin();
2243 				(pos > GetEndStyled()) && (it != watchers.end()); ++it) {
2244 				it->watcher->NotifyStyleNeeded(this, it->userData, pos);
2245 			}
2246 		}
2247 	}
2248 }
2249 
StyleToAdjustingLineDuration(Sci::Position pos)2250 void Document::StyleToAdjustingLineDuration(Sci::Position pos) {
2251 	const Sci::Line lineFirst = SciLineFromPosition(GetEndStyled());
2252 	ElapsedPeriod epStyling;
2253 	EnsureStyledTo(pos);
2254 	const Sci::Line lineLast = SciLineFromPosition(GetEndStyled());
2255 	durationStyleOneLine.AddSample(lineLast - lineFirst, epStyling.Duration());
2256 }
2257 
/**
 * Call NotifyLexerChanged on every registered watcher, passing this document and the
 * user data the watcher was registered with.
 */
void Document::LexerChanged() {
	// Tell the watchers the lexer has changed.
	for (const WatcherWithUserData &watcher : watchers) {
		watcher.watcher->NotifyLexerChanged(this, watcher.userData);
	}
}
2264 
/**
 * Return the lexer interface currently held in pli as a raw pointer; may be null.
 */
LexInterface *Document::GetLexInterface() const {
	return pli.get();
}
2268 
/**
 * Replace the lexer interface held in pli with pLexInterface.
 * NOTE(review): pli is reset() here, so it is presumably an owning smart pointer that takes
 * ownership of pLexInterface and releases the previous interface - confirm against Document.h.
 */
void Document::SetLexInterface(LexInterface *pLexInterface) {
	pli.reset(pLexInterface);
}
2272 
SetLineState(Sci_Position line,int state)2273 int SCI_METHOD Document::SetLineState(Sci_Position line, int state) {
2274 	const int statePrevious = States()->SetLineState(static_cast<Sci::Line>(line), state);
2275 	if (state != statePrevious) {
2276 		const DocModification mh(SC_MOD_CHANGELINESTATE, LineStart(line), 0, 0, nullptr,
2277 			static_cast<Sci::Line>(line));
2278 		NotifyModified(mh);
2279 	}
2280 	return statePrevious;
2281 }
2282 
/**
 * Return the line state stored for line, as reported by States().
 */
int SCI_METHOD Document::GetLineState(Sci_Position line) const {
	return States()->GetLineState(static_cast<Sci::Line>(line));
}
2286 
/**
 * Return the value of GetMaxLineState() as reported by States().
 */
Sci::Line Document::GetMaxLineState() const {
	return States()->GetMaxLineState();
}
2290 
ChangeLexerState(Sci_Position start,Sci_Position end)2291 void SCI_METHOD Document::ChangeLexerState(Sci_Position start, Sci_Position end) {
2292 	const DocModification mh(SC_MOD_LEXERSTATE, start,
2293 		end-start, 0, 0, 0);
2294 	NotifyModified(mh);
2295 }
2296 
/**
 * Build a StyledText from the margin data held for line: its length, text, whether it has
 * multiple styles, its single style and its per-character styles.
 */
StyledText Document::MarginStyledText(Sci::Line line) const {
	const LineAnnotation *pla = Margins();
	return StyledText(pla->Length(line), pla->Text(line),
		pla->MultipleStyles(line), pla->Style(line), pla->Styles(line));
}
2302 
MarginSetText(Sci::Line line,const char * text)2303 void Document::MarginSetText(Sci::Line line, const char *text) {
2304 	Margins()->SetText(line, text);
2305 	const DocModification mh(SC_MOD_CHANGEMARGIN, LineStart(line),
2306 		0, 0, 0, line);
2307 	NotifyModified(mh);
2308 }
2309 
MarginSetStyle(Sci::Line line,int style)2310 void Document::MarginSetStyle(Sci::Line line, int style) {
2311 	Margins()->SetStyle(line, style);
2312 	NotifyModified(DocModification(SC_MOD_CHANGEMARGIN, LineStart(line),
2313 		0, 0, 0, line));
2314 }
2315 
MarginSetStyles(Sci::Line line,const unsigned char * styles)2316 void Document::MarginSetStyles(Sci::Line line, const unsigned char *styles) {
2317 	Margins()->SetStyles(line, styles);
2318 	NotifyModified(DocModification(SC_MOD_CHANGEMARGIN, LineStart(line),
2319 		0, 0, 0, line));
2320 }
2321 
MarginClearAll()2322 void Document::MarginClearAll() {
2323 	const Sci::Line maxEditorLine = LinesTotal();
2324 	for (Sci::Line l=0; l<maxEditorLine; l++)
2325 		MarginSetText(l, nullptr);
2326 	// Free remaining data
2327 	Margins()->ClearAll();
2328 }
2329 
/**
 * Build a StyledText from the annotation data held for line: its length, text, whether it
 * has multiple styles, its single style and its per-character styles.
 */
StyledText Document::AnnotationStyledText(Sci::Line line) const {
	const LineAnnotation *pla = Annotations();
	return StyledText(pla->Length(line), pla->Text(line),
		pla->MultipleStyles(line), pla->Style(line), pla->Styles(line));
}
2335 
AnnotationSetText(Sci::Line line,const char * text)2336 void Document::AnnotationSetText(Sci::Line line, const char *text) {
2337 	if (line >= 0 && line < LinesTotal()) {
2338 		const Sci::Line linesBefore = AnnotationLines(line);
2339 		Annotations()->SetText(line, text);
2340 		const int linesAfter = AnnotationLines(line);
2341 		DocModification mh(SC_MOD_CHANGEANNOTATION, LineStart(line),
2342 			0, 0, 0, line);
2343 		mh.annotationLinesAdded = linesAfter - linesBefore;
2344 		NotifyModified(mh);
2345 	}
2346 }
2347 
AnnotationSetStyle(Sci::Line line,int style)2348 void Document::AnnotationSetStyle(Sci::Line line, int style) {
2349 	Annotations()->SetStyle(line, style);
2350 	const DocModification mh(SC_MOD_CHANGEANNOTATION, LineStart(line),
2351 		0, 0, 0, line);
2352 	NotifyModified(mh);
2353 }
2354 
AnnotationSetStyles(Sci::Line line,const unsigned char * styles)2355 void Document::AnnotationSetStyles(Sci::Line line, const unsigned char *styles) {
2356 	if (line >= 0 && line < LinesTotal()) {
2357 		Annotations()->SetStyles(line, styles);
2358 	}
2359 }
2360 
/**
 * Return the number of annotation lines recorded for line, as reported by Annotations().
 */
int Document::AnnotationLines(Sci::Line line) const {
	return Annotations()->Lines(line);
}
2364 
AnnotationClearAll()2365 void Document::AnnotationClearAll() {
2366 	const Sci::Line maxEditorLine = LinesTotal();
2367 	for (Sci::Line l=0; l<maxEditorLine; l++)
2368 		AnnotationSetText(l, nullptr);
2369 	// Free remaining data
2370 	Annotations()->ClearAll();
2371 }
2372 
/**
 * Advance styleClock by one, reducing the result modulo 0x100000.
 */
void Document::IncrementStyleClock() noexcept {
	styleClock = (styleClock + 1) % 0x100000;
}
2376 
/**
 * Select indicator as the current indicator of the decoration list.
 */
void SCI_METHOD Document::DecorationSetCurrentIndicator(int indicator) {
	decorations->SetCurrentIndicator(indicator);
}
2380 
DecorationFillRange(Sci_Position position,int value,Sci_Position fillLength)2381 void SCI_METHOD Document::DecorationFillRange(Sci_Position position, int value, Sci_Position fillLength) {
2382 	const FillResult<Sci::Position> fr = decorations->FillRange(
2383 		position, value, fillLength);
2384 	if (fr.changed) {
2385 		const DocModification mh(SC_MOD_CHANGEINDICATOR | SC_PERFORMED_USER,
2386 							fr.position, fr.fillLength);
2387 		NotifyModified(mh);
2388 	}
2389 }
2390 
AddWatcher(DocWatcher * watcher,void * userData)2391 bool Document::AddWatcher(DocWatcher *watcher, void *userData) {
2392 	const WatcherWithUserData wwud(watcher, userData);
2393 	std::vector<WatcherWithUserData>::iterator it =
2394 		std::find(watchers.begin(), watchers.end(), wwud);
2395 	if (it != watchers.end())
2396 		return false;
2397 	watchers.push_back(wwud);
2398 	return true;
2399 }
2400 
RemoveWatcher(DocWatcher * watcher,void * userData)2401 bool Document::RemoveWatcher(DocWatcher *watcher, void *userData) {
2402 	std::vector<WatcherWithUserData>::iterator it =
2403 		std::find(watchers.begin(), watchers.end(), WatcherWithUserData(watcher, userData));
2404 	if (it != watchers.end()) {
2405 		watchers.erase(it);
2406 		return true;
2407 	}
2408 	return false;
2409 }
2410 
/**
 * Call NotifyModifyAttempt on every registered watcher, passing this document and the
 * user data the watcher was registered with.
 */
void Document::NotifyModifyAttempt() {
	for (const WatcherWithUserData &watcher : watchers) {
		watcher.watcher->NotifyModifyAttempt(this, watcher.userData);
	}
}
2416 
/**
 * Call NotifySavePoint on every registered watcher, passing this document, the user data
 * the watcher was registered with and atSavePoint.
 */
void Document::NotifySavePoint(bool atSavePoint) {
	for (const WatcherWithUserData &watcher : watchers) {
		watcher.watcher->NotifySavePoint(this, watcher.userData, atSavePoint);
	}
}
2422 
NotifyModified(DocModification mh)2423 void Document::NotifyModified(DocModification mh) {
2424 	if (mh.modificationType & SC_MOD_INSERTTEXT) {
2425 		decorations->InsertSpace(mh.position, mh.length);
2426 	} else if (mh.modificationType & SC_MOD_DELETETEXT) {
2427 		decorations->DeleteRange(mh.position, mh.length);
2428 	}
2429 	for (const WatcherWithUserData &watcher : watchers) {
2430 		watcher.watcher->NotifyModified(this, mh, watcher.userData);
2431 	}
2432 }
2433 
// Used for word part navigation.
// True for the 32 ASCII punctuation characters, which occupy four contiguous ranges:
// '!'..'/', ':'..'@', '['..'`' and '{'..'~'. Every other value yields false.
static bool IsASCIIPunctuationCharacter(unsigned int ch) noexcept {
	if (ch < '!' || ch > '~')
		return false;	// Controls, space, DEL and everything beyond ASCII
	if (ch <= '/')
		return true;	// ! " # $ % & ' ( ) * + , - . /
	if (ch < ':')
		return false;	// Digits
	if (ch <= '@')
		return true;	// : ; < = > ? @
	if (ch < '[')
		return false;	// Upper case letters
	if (ch <= '`')
		return true;	// [ \ ] ^ _ `
	return ch >= '{';	// { | } ~ are punctuation, lower case letters are not
}
2474 
IsWordPartSeparator(unsigned int ch) const2475 bool Document::IsWordPartSeparator(unsigned int ch) const {
2476 	return (WordCharacterClass(ch) == CharClassify::ccWord) && IsASCIIPunctuationCharacter(ch);
2477 }
2478 
/**
 * Move left from pos to the start of the word part that precedes it.
 * Word parts are runs of lower case letters (optionally preceded by one upper case letter),
 * upper case letters, digits, ASCII punctuation, space characters or non-ASCII characters,
 * as decided by the predicates used below. Word part separators directly before pos are
 * skipped first. Returns pos unchanged when pos <= 0.
 */
Sci::Position Document::WordPartLeft(Sci::Position pos) const {
	if (pos > 0) {
		// Step onto the character before the starting position.
		pos -= CharacterBefore(pos).widthBytes;
		CharacterExtracted ceStart = CharacterAfter(pos);
		if (IsWordPartSeparator(ceStart.character)) {
			// Skip back over a run of separators.
			while (pos > 0 && IsWordPartSeparator(CharacterAfter(pos).character)) {
				pos -= CharacterBefore(pos).widthBytes;
			}
		}
		if (pos > 0) {
			// ceStart decides the kind of run; pos then moves one character further back.
			ceStart = CharacterAfter(pos);
			pos -= CharacterBefore(pos).widthBytes;
			// In each branch: retreat while the character at pos belongs to the run, then
			// step forward again if the scan stopped on a character outside the run.
			if (IsLowerCase(ceStart.character)) {
				while (pos > 0 && IsLowerCase(CharacterAfter(pos).character))
					pos -= CharacterBefore(pos).widthBytes;
				// An upper case letter directly before the lower case run is kept as part of it.
				if (!IsUpperCase(CharacterAfter(pos).character) && !IsLowerCase(CharacterAfter(pos).character))
					pos += CharacterAfter(pos).widthBytes;
			} else if (IsUpperCase(ceStart.character)) {
				while (pos > 0 && IsUpperCase(CharacterAfter(pos).character))
					pos -= CharacterBefore(pos).widthBytes;
				if (!IsUpperCase(CharacterAfter(pos).character))
					pos += CharacterAfter(pos).widthBytes;
			} else if (IsADigit(ceStart.character)) {
				while (pos > 0 && IsADigit(CharacterAfter(pos).character))
					pos -= CharacterBefore(pos).widthBytes;
				if (!IsADigit(CharacterAfter(pos).character))
					pos += CharacterAfter(pos).widthBytes;
			} else if (IsASCIIPunctuationCharacter(ceStart.character)) {
				while (pos > 0 && IsASCIIPunctuationCharacter(CharacterAfter(pos).character))
					pos -= CharacterBefore(pos).widthBytes;
				if (!IsASCIIPunctuationCharacter(CharacterAfter(pos).character))
					pos += CharacterAfter(pos).widthBytes;
			} else if (isspacechar(ceStart.character)) {
				while (pos > 0 && isspacechar(CharacterAfter(pos).character))
					pos -= CharacterBefore(pos).widthBytes;
				if (!isspacechar(CharacterAfter(pos).character))
					pos += CharacterAfter(pos).widthBytes;
			} else if (!IsASCII(ceStart.character)) {
				while (pos > 0 && !IsASCII(CharacterAfter(pos).character))
					pos -= CharacterBefore(pos).widthBytes;
				if (IsASCII(CharacterAfter(pos).character))
					pos += CharacterAfter(pos).widthBytes;
			} else {
				// No run recognised: undo the extra step back taken above.
				pos += CharacterAfter(pos).widthBytes;
			}
		}
	}
	return pos;
}
2528 
/**
 * Move right from pos to the end of the word part that starts there.
 * Word part separators at pos are skipped first. The character then at pos selects the
 * kind of run to move over: non-ASCII characters, lower case letters, upper case letters,
 * digits, ASCII punctuation or space characters, as decided by the predicates used below.
 * Any other character is stepped over on its own.
 */
Sci::Position Document::WordPartRight(Sci::Position pos) const {
	CharacterExtracted ceStart = CharacterAfter(pos);
	const Sci::Position length = LengthNoExcept();
	if (IsWordPartSeparator(ceStart.character)) {
		// Skip a run of separators and classify the character that follows it.
		while (pos < length && IsWordPartSeparator(CharacterAfter(pos).character))
			pos += CharacterAfter(pos).widthBytes;
		ceStart = CharacterAfter(pos);
	}
	if (!IsASCII(ceStart.character)) {
		while (pos < length && !IsASCII(CharacterAfter(pos).character))
			pos += CharacterAfter(pos).widthBytes;
	} else if (IsLowerCase(ceStart.character)) {
		while (pos < length && IsLowerCase(CharacterAfter(pos).character))
			pos += CharacterAfter(pos).widthBytes;
	} else if (IsUpperCase(ceStart.character)) {
		if (IsLowerCase(CharacterAfter(pos + ceStart.widthBytes).character)) {
			// One upper case letter followed by lower case: take the letter and the lower case run.
			pos += CharacterAfter(pos).widthBytes;
			while (pos < length && IsLowerCase(CharacterAfter(pos).character))
				pos += CharacterAfter(pos).widthBytes;
		} else {
			// A run of upper case letters.
			while (pos < length && IsUpperCase(CharacterAfter(pos).character))
				pos += CharacterAfter(pos).widthBytes;
		}
		// If a lower case letter follows and the last letter taken is upper case, give that
		// letter back so that it starts the next part.
		if (IsLowerCase(CharacterAfter(pos).character) && IsUpperCase(CharacterBefore(pos).character))
			pos -= CharacterBefore(pos).widthBytes;
	} else if (IsADigit(ceStart.character)) {
		while (pos < length && IsADigit(CharacterAfter(pos).character))
			pos += CharacterAfter(pos).widthBytes;
	} else if (IsASCIIPunctuationCharacter(ceStart.character)) {
		while (pos < length && IsASCIIPunctuationCharacter(CharacterAfter(pos).character))
			pos += CharacterAfter(pos).widthBytes;
	} else if (isspacechar(ceStart.character)) {
		while (pos < length && isspacechar(CharacterAfter(pos).character))
			pos += CharacterAfter(pos).widthBytes;
	} else {
		// No run recognised: move over the single character.
		pos += CharacterAfter(pos).widthBytes;
	}
	return pos;
}
2568 
// True when c is a line feed or a carriage return.
static constexpr bool IsLineEndChar(char c) noexcept {
	switch (c) {
	case '\n':
	case '\r':
		return true;
	default:
		return false;
	}
}
2572 
ExtendStyleRange(Sci::Position pos,int delta,bool singleLine)2573 Sci::Position Document::ExtendStyleRange(Sci::Position pos, int delta, bool singleLine) noexcept {
2574 	const int sStart = cb.StyleAt(pos);
2575 	if (delta < 0) {
2576 		while (pos > 0 && (cb.StyleAt(pos) == sStart) && (!singleLine || !IsLineEndChar(cb.CharAt(pos))))
2577 			pos--;
2578 		pos++;
2579 	} else {
2580 		while (pos < (LengthNoExcept()) && (cb.StyleAt(pos) == sStart) && (!singleLine || !IsLineEndChar(cb.CharAt(pos))))
2581 			pos++;
2582 	}
2583 	return pos;
2584 }
2585 
// Return the brace that pairs with ch for (), [], {} and <>, or '\0' when ch is not a brace.
static char BraceOpposite(char ch) noexcept {
	// Partners are stored side by side so the partner's index differs only in the lowest bit.
	static constexpr char braces[] = { '(', ')', '[', ']', '{', '}', '<', '>' };
	for (size_t i = 0; i < sizeof(braces); i++) {
		if (braces[i] == ch)
			return braces[i ^ 1];
	}
	return '\0';
}
2608 
2609 // TODO: should be able to extend styled region to find matching brace
BraceMatch(Sci::Position position,Sci::Position)2610 Sci::Position Document::BraceMatch(Sci::Position position, Sci::Position /*maxReStyle*/) noexcept {
2611 	const char chBrace = CharAt(position);
2612 	const char chSeek = BraceOpposite(chBrace);
2613 	if (chSeek == '\0')
2614 		return - 1;
2615 	const int styBrace = StyleIndexAt(position);
2616 	int direction = -1;
2617 	if (chBrace == '(' || chBrace == '[' || chBrace == '{' || chBrace == '<')
2618 		direction = 1;
2619 	int depth = 1;
2620 	position = NextPosition(position, direction);
2621 	while ((position >= 0) && (position < LengthNoExcept())) {
2622 		const char chAtPos = CharAt(position);
2623 		const int styAtPos = StyleIndexAt(position);
2624 		if ((position > GetEndStyled()) || (styAtPos == styBrace)) {
2625 			if (chAtPos == chBrace)
2626 				depth++;
2627 			if (chAtPos == chSeek)
2628 				depth--;
2629 			if (depth == 0)
2630 				return position;
2631 		}
2632 		const Sci::Position positionBeforeMove = position;
2633 		position = NextPosition(position, direction);
2634 		if (position == positionBeforeMove)
2635 			break;
2636 	}
2637 	return - 1;
2638 }
2639 
2640 /**
2641  * Implementation of RegexSearchBase for the default built-in regular expression engine
2642  */
2643 class BuiltinRegex : public RegexSearchBase {
2644 public:
BuiltinRegex(CharClassify * charClassTable)2645 	explicit BuiltinRegex(CharClassify *charClassTable) : search(charClassTable) {}
2646 	BuiltinRegex(const BuiltinRegex &) = delete;
2647 	BuiltinRegex(BuiltinRegex &&) = delete;
2648 	BuiltinRegex &operator=(const BuiltinRegex &) = delete;
2649 	BuiltinRegex &operator=(BuiltinRegex &&) = delete;
2650 	~BuiltinRegex() override = default;
2651 
2652 	Sci::Position FindText(Document *doc, Sci::Position minPos, Sci::Position maxPos, const char *s,
2653                         bool caseSensitive, bool word, bool wordStart, int flags,
2654                         Sci::Position *length) override;
2655 
2656 	const char *SubstituteByPosition(Document *doc, const char *text, Sci::Position *length) override;
2657 
2658 private:
2659 	RESearch search;
2660 	std::string substituted;
2661 };
2662 
2663 namespace {
2664 
2665 /**
2666 * RESearchRange keeps track of search range.
2667 */
2668 class RESearchRange {
2669 public:
2670 	const Document *doc;
2671 	int increment;
2672 	Sci::Position startPos;
2673 	Sci::Position endPos;
2674 	Sci::Line lineRangeStart;
2675 	Sci::Line lineRangeEnd;
2676 	Sci::Line lineRangeBreak;
RESearchRange(const Document * doc_,Sci::Position minPos,Sci::Position maxPos)2677 	RESearchRange(const Document *doc_, Sci::Position minPos, Sci::Position maxPos) noexcept : doc(doc_) {
2678 		increment = (minPos <= maxPos) ? 1 : -1;
2679 
2680 		// Range endpoints should not be inside DBCS characters or between a CR and LF,
2681 		// but just in case, move them.
2682 		startPos = doc->MovePositionOutsideChar(minPos, 1, true);
2683 		endPos = doc->MovePositionOutsideChar(maxPos, 1, true);
2684 
2685 		lineRangeStart = doc->SciLineFromPosition(startPos);
2686 		lineRangeEnd = doc->SciLineFromPosition(endPos);
2687 		lineRangeBreak = lineRangeEnd + increment;
2688 	}
LineRange(Sci::Line line) const2689 	Range LineRange(Sci::Line line) const {
2690 		Range range(doc->LineStart(line), doc->LineEnd(line));
2691 		if (increment == 1) {
2692 			if (line == lineRangeStart)
2693 				range.start = startPos;
2694 			if (line == lineRangeEnd)
2695 				range.end = endPos;
2696 		} else {
2697 			if (line == lineRangeEnd)
2698 				range.start = endPos;
2699 			if (line == lineRangeStart)
2700 				range.end = startPos;
2701 		}
2702 		return range;
2703 	}
2704 };
2705 
2706 // Define a way for the Regular Expression code to access the document
2707 class DocumentIndexer : public CharacterIndexer {
2708 	Document *pdoc;
2709 	Sci::Position end;
2710 public:
DocumentIndexer(Document * pdoc_,Sci::Position end_)2711 	DocumentIndexer(Document *pdoc_, Sci::Position end_) noexcept :
2712 		pdoc(pdoc_), end(end_) {
2713 	}
2714 
2715 	DocumentIndexer(const DocumentIndexer &) = delete;
2716 	DocumentIndexer(DocumentIndexer &&) = delete;
2717 	DocumentIndexer &operator=(const DocumentIndexer &) = delete;
2718 	DocumentIndexer &operator=(DocumentIndexer &&) = delete;
2719 
2720 	~DocumentIndexer() override = default;
2721 
CharAt(Sci::Position index) const2722 	char CharAt(Sci::Position index) const noexcept override {
2723 		if (index < 0 || index >= end)
2724 			return 0;
2725 		else
2726 			return pdoc->CharAt(index);
2727 	}
2728 };
2729 
2730 #ifndef NO_CXX11_REGEX
2731 
2732 class ByteIterator {
2733 public:
2734 	typedef std::bidirectional_iterator_tag iterator_category;
2735 	typedef char value_type;
2736 	typedef ptrdiff_t difference_type;
2737 	typedef char* pointer;
2738 	typedef char& reference;
2739 
2740 	const Document *doc;
2741 	Sci::Position position;
2742 
ByteIterator(const Document * doc_=nullptr,Sci::Position position_=0)2743 	ByteIterator(const Document *doc_=nullptr, Sci::Position position_=0) noexcept :
2744 		doc(doc_), position(position_) {
2745 	}
ByteIterator(const ByteIterator & other)2746 	ByteIterator(const ByteIterator &other) noexcept {
2747 		doc = other.doc;
2748 		position = other.position;
2749 	}
ByteIterator(ByteIterator && other)2750 	ByteIterator(ByteIterator &&other) noexcept {
2751 		doc = other.doc;
2752 		position = other.position;
2753 	}
operator =(const ByteIterator & other)2754 	ByteIterator &operator=(const ByteIterator &other) noexcept {
2755 		if (this != &other) {
2756 			doc = other.doc;
2757 			position = other.position;
2758 		}
2759 		return *this;
2760 	}
2761 	ByteIterator &operator=(ByteIterator &&) noexcept = default;
2762 	~ByteIterator() = default;
operator *() const2763 	char operator*() const noexcept {
2764 		return doc->CharAt(position);
2765 	}
operator ++()2766 	ByteIterator &operator++() noexcept {
2767 		position++;
2768 		return *this;
2769 	}
operator ++(int)2770 	ByteIterator operator++(int) noexcept {
2771 		ByteIterator retVal(*this);
2772 		position++;
2773 		return retVal;
2774 	}
operator --()2775 	ByteIterator &operator--() noexcept {
2776 		position--;
2777 		return *this;
2778 	}
operator ==(const ByteIterator & other) const2779 	bool operator==(const ByteIterator &other) const noexcept {
2780 		return doc == other.doc && position == other.position;
2781 	}
operator !=(const ByteIterator & other) const2782 	bool operator!=(const ByteIterator &other) const noexcept {
2783 		return doc != other.doc || position != other.position;
2784 	}
Pos() const2785 	Sci::Position Pos() const noexcept {
2786 		return position;
2787 	}
PosRoundUp() const2788 	Sci::Position PosRoundUp() const noexcept {
2789 		return position;
2790 	}
2791 };
2792 
2793 // On Windows, wchar_t is 16 bits wide and on Unix it is 32 bits wide.
2794 // Would be better to use sizeof(wchar_t) or similar to differentiate
2795 // but easier for now to hard-code platforms.
2796 // C++11 has char16_t and char32_t but neither Clang nor Visual C++
2797 // appear to allow specializing basic_regex over these.
2798 
2799 #ifdef _WIN32
2800 #define WCHAR_T_IS_16 1
2801 #else
2802 #define WCHAR_T_IS_16 0
2803 #endif
2804 
2805 #if WCHAR_T_IS_16
2806 
2807 // On Windows, report non-BMP characters as 2 separate surrogates as that
2808 // matches wregex since it is based on wchar_t.
2809 class UTF8Iterator {
2810 	// These 3 fields determine the iterator position and are used for comparisons
2811 	const Document *doc;
2812 	Sci::Position position;
2813 	size_t characterIndex;
2814 	// Remaining fields are derived from the determining fields so are excluded in comparisons
2815 	unsigned int lenBytes;
2816 	size_t lenCharacters;
2817 	wchar_t buffered[2];
2818 public:
2819 	typedef std::bidirectional_iterator_tag iterator_category;
2820 	typedef wchar_t value_type;
2821 	typedef ptrdiff_t difference_type;
2822 	typedef wchar_t* pointer;
2823 	typedef wchar_t& reference;
2824 
UTF8Iterator(const Document * doc_=nullptr,Sci::Position position_=0)2825 	UTF8Iterator(const Document *doc_=nullptr, Sci::Position position_=0) noexcept :
2826 		doc(doc_), position(position_), characterIndex(0), lenBytes(0), lenCharacters(0), buffered{} {
2827 		buffered[0] = 0;
2828 		buffered[1] = 0;
2829 		if (doc) {
2830 			ReadCharacter();
2831 		}
2832 	}
UTF8Iterator(const UTF8Iterator & other)2833 	UTF8Iterator(const UTF8Iterator &other) noexcept : buffered{} {
2834 		doc = other.doc;
2835 		position = other.position;
2836 		characterIndex = other.characterIndex;
2837 		lenBytes = other.lenBytes;
2838 		lenCharacters = other.lenCharacters;
2839 		buffered[0] = other.buffered[0];
2840 		buffered[1] = other.buffered[1];
2841 	}
2842 	UTF8Iterator(UTF8Iterator &&other) noexcept = default;
operator =(const UTF8Iterator & other)2843 	UTF8Iterator &operator=(const UTF8Iterator &other) noexcept {
2844 		if (this != &other) {
2845 			doc = other.doc;
2846 			position = other.position;
2847 			characterIndex = other.characterIndex;
2848 			lenBytes = other.lenBytes;
2849 			lenCharacters = other.lenCharacters;
2850 			buffered[0] = other.buffered[0];
2851 			buffered[1] = other.buffered[1];
2852 		}
2853 		return *this;
2854 	}
2855 	UTF8Iterator &operator=(UTF8Iterator &&) noexcept = default;
2856 	~UTF8Iterator() = default;
operator *() const2857 	wchar_t operator*() const noexcept {
2858 		assert(lenCharacters != 0);
2859 		return buffered[characterIndex];
2860 	}
operator ++()2861 	UTF8Iterator &operator++() noexcept {
2862 		if ((characterIndex + 1) < (lenCharacters)) {
2863 			characterIndex++;
2864 		} else {
2865 			position += lenBytes;
2866 			ReadCharacter();
2867 			characterIndex = 0;
2868 		}
2869 		return *this;
2870 	}
operator ++(int)2871 	UTF8Iterator operator++(int) noexcept {
2872 		UTF8Iterator retVal(*this);
2873 		if ((characterIndex + 1) < (lenCharacters)) {
2874 			characterIndex++;
2875 		} else {
2876 			position += lenBytes;
2877 			ReadCharacter();
2878 			characterIndex = 0;
2879 		}
2880 		return retVal;
2881 	}
operator --()2882 	UTF8Iterator &operator--() noexcept {
2883 		if (characterIndex) {
2884 			characterIndex--;
2885 		} else {
2886 			position = doc->NextPosition(position, -1);
2887 			ReadCharacter();
2888 			characterIndex = lenCharacters - 1;
2889 		}
2890 		return *this;
2891 	}
operator ==(const UTF8Iterator & other) const2892 	bool operator==(const UTF8Iterator &other) const noexcept {
2893 		// Only test the determining fields, not the character widths and values derived from this
2894 		return doc == other.doc &&
2895 			position == other.position &&
2896 			characterIndex == other.characterIndex;
2897 	}
operator !=(const UTF8Iterator & other) const2898 	bool operator!=(const UTF8Iterator &other) const noexcept {
2899 		// Only test the determining fields, not the character widths and values derived from this
2900 		return doc != other.doc ||
2901 			position != other.position ||
2902 			characterIndex != other.characterIndex;
2903 	}
Pos() const2904 	Sci::Position Pos() const noexcept {
2905 		return position;
2906 	}
PosRoundUp() const2907 	Sci::Position PosRoundUp() const noexcept {
2908 		if (characterIndex)
2909 			return position + lenBytes;	// Force to end of character
2910 		else
2911 			return position;
2912 	}
2913 private:
ReadCharacter()2914 	void ReadCharacter() noexcept {
2915 		const Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position);
2916 		lenBytes = charExtracted.widthBytes;
2917 		if (charExtracted.character == unicodeReplacementChar) {
2918 			lenCharacters = 1;
2919 			buffered[0] = static_cast<wchar_t>(charExtracted.character);
2920 		} else {
2921 			lenCharacters = UTF16FromUTF32Character(charExtracted.character, buffered);
2922 		}
2923 	}
2924 };
2925 
2926 #else
2927 
2928 // On Unix, report non-BMP characters as single characters
2929 
2930 class UTF8Iterator {
2931 	const Document *doc;
2932 	Sci::Position position;
2933 public:
2934 	typedef std::bidirectional_iterator_tag iterator_category;
2935 	typedef wchar_t value_type;
2936 	typedef ptrdiff_t difference_type;
2937 	typedef wchar_t* pointer;
2938 	typedef wchar_t& reference;
2939 
UTF8Iterator(const Document * doc_=nullptr,Sci::Position position_=0)2940 	UTF8Iterator(const Document *doc_=nullptr, Sci::Position position_=0) noexcept :
2941 		doc(doc_), position(position_) {
2942 	}
UTF8Iterator(const UTF8Iterator & other)2943 	UTF8Iterator(const UTF8Iterator &other) noexcept {
2944 		doc = other.doc;
2945 		position = other.position;
2946 	}
2947 	UTF8Iterator(UTF8Iterator &&other) noexcept = default;
operator =(const UTF8Iterator & other)2948 	UTF8Iterator &operator=(const UTF8Iterator &other) noexcept {
2949 		if (this != &other) {
2950 			doc = other.doc;
2951 			position = other.position;
2952 		}
2953 		return *this;
2954 	}
2955 	UTF8Iterator &operator=(UTF8Iterator &&) noexcept = default;
2956 	~UTF8Iterator() = default;
operator *() const2957 	wchar_t operator*() const noexcept {
2958 		const Document::CharacterExtracted charExtracted = doc->ExtractCharacter(position);
2959 		return charExtracted.character;
2960 	}
operator ++()2961 	UTF8Iterator &operator++() noexcept {
2962 		position = doc->NextPosition(position, 1);
2963 		return *this;
2964 	}
operator ++(int)2965 	UTF8Iterator operator++(int) noexcept {
2966 		UTF8Iterator retVal(*this);
2967 		position = doc->NextPosition(position, 1);
2968 		return retVal;
2969 	}
operator --()2970 	UTF8Iterator &operator--() noexcept {
2971 		position = doc->NextPosition(position, -1);
2972 		return *this;
2973 	}
operator ==(const UTF8Iterator & other) const2974 	bool operator==(const UTF8Iterator &other) const noexcept {
2975 		return doc == other.doc && position == other.position;
2976 	}
operator !=(const UTF8Iterator & other) const2977 	bool operator!=(const UTF8Iterator &other) const noexcept {
2978 		return doc != other.doc || position != other.position;
2979 	}
Pos() const2980 	Sci::Position Pos() const noexcept {
2981 		return position;
2982 	}
PosRoundUp() const2983 	Sci::Position PosRoundUp() const noexcept {
2984 		return position;
2985 	}
2986 };
2987 
2988 #endif
2989 
MatchFlags(const Document * doc,Sci::Position startPos,Sci::Position endPos)2990 std::regex_constants::match_flag_type MatchFlags(const Document *doc, Sci::Position startPos, Sci::Position endPos) {
2991 	std::regex_constants::match_flag_type flagsMatch = std::regex_constants::match_default;
2992 	if (!doc->IsLineStartPosition(startPos))
2993 		flagsMatch |= std::regex_constants::match_not_bol;
2994 	if (!doc->IsLineEndPosition(endPos))
2995 		flagsMatch |= std::regex_constants::match_not_eol;
2996 	return flagsMatch;
2997 }
2998 
2999 template<typename Iterator, typename Regex>
MatchOnLines(const Document * doc,const Regex & regexp,const RESearchRange & resr,RESearch & search)3000 bool MatchOnLines(const Document *doc, const Regex &regexp, const RESearchRange &resr, RESearch &search) {
3001 	std::match_results<Iterator> match;
3002 
3003 	// MSVC and libc++ have problems with ^ and $ matching line ends inside a range.
3004 	// CRLF line ends are also a problem as ^ and $ only treat LF as a line end.
3005 	// The std::regex::multiline option was added to C++17 to improve behaviour but
3006 	// has not been implemented by compiler runtimes with MSVC always in multiline
3007 	// mode and libc++ and libstdc++ always in single-line mode.
3008 	// If multiline regex worked well then the line by line iteration could be removed
3009 	// for the forwards case and replaced with the following 4 lines:
3010 #ifdef REGEX_MULTILINE
3011 	Iterator itStart(doc, resr.startPos);
3012 	Iterator itEnd(doc, resr.endPos);
3013 	const std::regex_constants::match_flag_type flagsMatch = MatchFlags(doc, resr.startPos, resr.endPos);
3014 	const bool matched = std::regex_search(itStart, itEnd, match, regexp, flagsMatch);
3015 #else
3016 	// Line by line.
3017 	bool matched = false;
3018 	for (Sci::Line line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {
3019 		const Range lineRange = resr.LineRange(line);
3020 		Iterator itStart(doc, lineRange.start);
3021 		Iterator itEnd(doc, lineRange.end);
3022 		std::regex_constants::match_flag_type flagsMatch = MatchFlags(doc, lineRange.start, lineRange.end);
3023 		matched = std::regex_search(itStart, itEnd, match, regexp, flagsMatch);
3024 		// Check for the last match on this line.
3025 		if (matched) {
3026 			if (resr.increment == -1) {
3027 				while (matched) {
3028 					Iterator itNext(doc, match[0].second.PosRoundUp());
3029 					flagsMatch = MatchFlags(doc, itNext.Pos(), lineRange.end);
3030 					std::match_results<Iterator> matchNext;
3031 					matched = std::regex_search(itNext, itEnd, matchNext, regexp, flagsMatch);
3032 					if (matched) {
3033 						if (match[0].first == match[0].second) {
3034 							// Empty match means failure so exit
3035 							return false;
3036 						}
3037 						match = matchNext;
3038 					}
3039 				}
3040 				matched = true;
3041 			}
3042 			break;
3043 		}
3044 	}
3045 #endif
3046 	if (matched) {
3047 		for (size_t co = 0; co < match.size(); co++) {
3048 			search.bopat[co] = match[co].first.Pos();
3049 			search.eopat[co] = match[co].second.PosRoundUp();
3050 			const Sci::Position lenMatch = search.eopat[co] - search.bopat[co];
3051 			search.pat[co].resize(lenMatch);
3052 			for (Sci::Position iPos = 0; iPos < lenMatch; iPos++) {
3053 				search.pat[co][iPos] = doc->CharAt(iPos + search.bopat[co]);
3054 			}
3055 		}
3056 	}
3057 	return matched;
3058 }
3059 
Cxx11RegexFindText(const Document * doc,Sci::Position minPos,Sci::Position maxPos,const char * s,bool caseSensitive,Sci::Position * length,RESearch & search)3060 Sci::Position Cxx11RegexFindText(const Document *doc, Sci::Position minPos, Sci::Position maxPos, const char *s,
3061 	bool caseSensitive, Sci::Position *length, RESearch &search) {
3062 	const RESearchRange resr(doc, minPos, maxPos);
3063 	try {
3064 		//ElapsedPeriod ep;
3065 		std::regex::flag_type flagsRe = std::regex::ECMAScript;
3066 		// Flags that apper to have no effect:
3067 		// | std::regex::collate | std::regex::extended;
3068 		if (!caseSensitive)
3069 			flagsRe = flagsRe | std::regex::icase;
3070 
3071 		// Clear the RESearch so can fill in matches
3072 		search.Clear();
3073 
3074 		bool matched = false;
3075 		if (SC_CP_UTF8 == doc->dbcsCodePage) {
3076 			const std::wstring ws = WStringFromUTF8(s);
3077 			std::wregex regexp;
3078 			regexp.assign(ws, flagsRe);
3079 			matched = MatchOnLines<UTF8Iterator>(doc, regexp, resr, search);
3080 
3081 		} else {
3082 			std::regex regexp;
3083 			regexp.assign(s, flagsRe);
3084 			matched = MatchOnLines<ByteIterator>(doc, regexp, resr, search);
3085 		}
3086 
3087 		Sci::Position posMatch = -1;
3088 		if (matched) {
3089 			posMatch = search.bopat[0];
3090 			*length = search.eopat[0] - search.bopat[0];
3091 		}
3092 		// Example - search in doc/ScintillaHistory.html for
3093 		// [[:upper:]]eta[[:space:]]
3094 		// On MacBook, normally around 1 second but with locale imbued -> 14 seconds.
3095 		//const double durSearch = ep.Duration(true);
3096 		//Platform::DebugPrintf("Search:%9.6g \n", durSearch);
3097 		return posMatch;
3098 	} catch (std::regex_error &) {
3099 		// Failed to create regular expression
3100 		throw RegexError();
3101 	} catch (...) {
3102 		// Failed in some other way
3103 		return -1;
3104 	}
3105 }
3106 
3107 #endif
3108 
3109 }
3110 
FindText(Document * doc,Sci::Position minPos,Sci::Position maxPos,const char * s,bool caseSensitive,bool,bool,int flags,Sci::Position * length)3111 Sci::Position BuiltinRegex::FindText(Document *doc, Sci::Position minPos, Sci::Position maxPos, const char *s,
3112                         bool caseSensitive, bool, bool, int flags,
3113                         Sci::Position *length) {
3114 
3115 #ifndef NO_CXX11_REGEX
3116 	if (flags & SCFIND_CXX11REGEX) {
3117 			return Cxx11RegexFindText(doc, minPos, maxPos, s,
3118 			caseSensitive, length, search);
3119 	}
3120 #endif
3121 
3122 	const RESearchRange resr(doc, minPos, maxPos);
3123 
3124 	const bool posix = (flags & SCFIND_POSIX) != 0;
3125 
3126 	const char *errmsg = search.Compile(s, *length, caseSensitive, posix);
3127 	if (errmsg) {
3128 		return -1;
3129 	}
3130 	// Find a variable in a property file: \$(\([A-Za-z0-9_.]+\))
3131 	// Replace first '.' with '-' in each property file variable reference:
3132 	//     Search: \$(\([A-Za-z0-9_-]+\)\.\([A-Za-z0-9_.]+\))
3133 	//     Replace: $(\1-\2)
3134 	Sci::Position pos = -1;
3135 	Sci::Position lenRet = 0;
3136 	const bool searchforLineStart = s[0] == '^';
3137 	const char searchEnd = s[*length - 1];
3138 	const char searchEndPrev = (*length > 1) ? s[*length - 2] : '\0';
3139 	const bool searchforLineEnd = (searchEnd == '$') && (searchEndPrev != '\\');
3140 	for (Sci::Line line = resr.lineRangeStart; line != resr.lineRangeBreak; line += resr.increment) {
3141 		Sci::Position startOfLine = doc->LineStart(line);
3142 		Sci::Position endOfLine = doc->LineEnd(line);
3143 		if (resr.increment == 1) {
3144 			if (line == resr.lineRangeStart) {
3145 				if ((resr.startPos != startOfLine) && searchforLineStart)
3146 					continue;	// Can't match start of line if start position after start of line
3147 				startOfLine = resr.startPos;
3148 			}
3149 			if (line == resr.lineRangeEnd) {
3150 				if ((resr.endPos != endOfLine) && searchforLineEnd)
3151 					continue;	// Can't match end of line if end position before end of line
3152 				endOfLine = resr.endPos;
3153 			}
3154 		} else {
3155 			if (line == resr.lineRangeEnd) {
3156 				if ((resr.endPos != startOfLine) && searchforLineStart)
3157 					continue;	// Can't match start of line if end position after start of line
3158 				startOfLine = resr.endPos;
3159 			}
3160 			if (line == resr.lineRangeStart) {
3161 				if ((resr.startPos != endOfLine) && searchforLineEnd)
3162 					continue;	// Can't match end of line if start position before end of line
3163 				endOfLine = resr.startPos;
3164 			}
3165 		}
3166 
3167 		const DocumentIndexer di(doc, endOfLine);
3168 		int success = search.Execute(di, startOfLine, endOfLine);
3169 		if (success) {
3170 			pos = search.bopat[0];
3171 			// Ensure only whole characters selected
3172 			search.eopat[0] = doc->MovePositionOutsideChar(search.eopat[0], 1, false);
3173 			lenRet = search.eopat[0] - search.bopat[0];
3174 			// There can be only one start of a line, so no need to look for last match in line
3175 			if ((resr.increment == -1) && !searchforLineStart) {
3176 				// Check for the last match on this line.
3177 				int repetitions = 1000;	// Break out of infinite loop
3178 				while (success && (search.eopat[0] <= endOfLine) && (repetitions--)) {
3179 					success = search.Execute(di, pos+1, endOfLine);
3180 					if (success) {
3181 						if (search.eopat[0] <= minPos) {
3182 							pos = search.bopat[0];
3183 							lenRet = search.eopat[0] - search.bopat[0];
3184 						} else {
3185 							success = 0;
3186 						}
3187 					}
3188 				}
3189 			}
3190 			break;
3191 		}
3192 	}
3193 	*length = lenRet;
3194 	return pos;
3195 }
3196 
SubstituteByPosition(Document * doc,const char * text,Sci::Position * length)3197 const char *BuiltinRegex::SubstituteByPosition(Document *doc, const char *text, Sci::Position *length) {
3198 	substituted.clear();
3199 	const DocumentIndexer di(doc, doc->Length());
3200 	search.GrabMatches(di);
3201 	for (Sci::Position j = 0; j < *length; j++) {
3202 		if (text[j] == '\\') {
3203 			if (text[j + 1] >= '0' && text[j + 1] <= '9') {
3204 				const unsigned int patNum = text[j + 1] - '0';
3205 				const Sci::Position len = search.eopat[patNum] - search.bopat[patNum];
3206 				if (!search.pat[patNum].empty())	// Will be null if try for a match that did not occur
3207 					substituted.append(search.pat[patNum].c_str(), len);
3208 				j++;
3209 			} else {
3210 				j++;
3211 				switch (text[j]) {
3212 				case 'a':
3213 					substituted.push_back('\a');
3214 					break;
3215 				case 'b':
3216 					substituted.push_back('\b');
3217 					break;
3218 				case 'f':
3219 					substituted.push_back('\f');
3220 					break;
3221 				case 'n':
3222 					substituted.push_back('\n');
3223 					break;
3224 				case 'r':
3225 					substituted.push_back('\r');
3226 					break;
3227 				case 't':
3228 					substituted.push_back('\t');
3229 					break;
3230 				case 'v':
3231 					substituted.push_back('\v');
3232 					break;
3233 				case '\\':
3234 					substituted.push_back('\\');
3235 					break;
3236 				default:
3237 					substituted.push_back('\\');
3238 					j--;
3239 				}
3240 			}
3241 		} else {
3242 			substituted.push_back(text[j]);
3243 		}
3244 	}
3245 	*length = substituted.length();
3246 	return substituted.c_str();
3247 }
3248 
3249 #ifndef SCI_OWNREGEX
3250 
CreateRegexSearch(CharClassify * charClassTable)3251 RegexSearchBase *Scintilla::CreateRegexSearch(CharClassify *charClassTable) {
3252 	return new BuiltinRegex(charClassTable);
3253 }
3254 
3255 #endif
3256