1 // Scintilla Lexer for EDIFACT
2 // Written by Iain Clarke, IMCSoft & Inobiz AB.
3 // EDIFACT documented here: https://www.unece.org/cefact/edifact/welcome.html
4 // and more readably here: https://en.wikipedia.org/wiki/EDIFACT
5 // This code is subject to the same license terms as the rest of the scintilla project:
6 // The License.txt file describes the conditions under which this software may be distributed.
7 //
8 
9 // Header order must match order in scripts/HeaderOrder.txt
10 #include <cstdlib>
11 #include <cassert>
12 #include <cstring>
13 #include <cctype>
14 
15 #include "ILexer.h"
16 #include "Scintilla.h"
17 #include "SciLexer.h"
18 
19 #include "LexAccessor.h"
20 #include "LexerModule.h"
21 #include "DefaultLexer.h"
22 
23 using namespace Scintilla;
24 
25 class LexerEDIFACT : public DefaultLexer
26 {
27 public:
28 	LexerEDIFACT();
~LexerEDIFACT()29 	virtual ~LexerEDIFACT() {} // virtual destructor, as we inherit from ILexer
30 
Factory()31 	static ILexer *Factory() {
32 		return new LexerEDIFACT;
33 	}
34 
Version() const35 	int SCI_METHOD Version() const override
36 	{
37 		return lvOriginal;
38 	}
Release()39 	void SCI_METHOD Release() override
40 	{
41 		delete this;
42 	}
43 
PropertyNames()44 	const char * SCI_METHOD PropertyNames() override
45 	{
46 		return "fold\nlexer.edifact.highlight.un.all";
47 	}
PropertyType(const char *)48 	int SCI_METHOD PropertyType(const char *) override
49 	{
50 		return SC_TYPE_BOOLEAN; // Only one property!
51 	}
DescribeProperty(const char * name)52 	const char * SCI_METHOD DescribeProperty(const char *name) override
53 	{
54 		if (!strcmp(name, "fold"))
55 			return "Whether to apply folding to document or not";
56 		if (!strcmp(name, "lexer.edifact.highlight.un.all"))
57 			return "Whether to apply UN* highlighting to all UN segments, or just to UNH";
58 		return NULL;
59 	}
60 
PropertySet(const char * key,const char * val)61 	Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override
62 	{
63 		if (!strcmp(key, "fold"))
64 		{
65 			m_bFold = strcmp(val, "0") ? true : false;
66 			return 0;
67 		}
68 		if (!strcmp(key, "lexer.edifact.highlight.un.all"))	// GetProperty
69 		{
70 			m_bHighlightAllUN = strcmp(val, "0") ? true : false;
71 			return 0;
72 		}
73 		return -1;
74 	}
DescribeWordListSets()75 	const char * SCI_METHOD DescribeWordListSets() override
76 	{
77 		return NULL;
78 	}
WordListSet(int,const char *)79 	Sci_Position SCI_METHOD WordListSet(int, const char *) override
80 	{
81 		return -1;
82 	}
83 	void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
84 	void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
PrivateCall(int,void *)85 	void * SCI_METHOD PrivateCall(int, void *) override
86 	{
87 		return NULL;
88 	}
89 
90 protected:
91 	Sci_Position InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength);
92 	Sci_Position FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const;
93 	Sci_Position ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const;
94 	int DetectSegmentHeader(char SegmentHeader[3]) const;
95 
96 	bool m_bFold;
97 
98 	// property lexer.edifact.highlight.un.all
99 	//	Set to 0 to highlight only UNA segments, or 1 to highlight all UNx segments.
100 	bool m_bHighlightAllUN;
101 
102 	char m_chComponent;
103 	char m_chData;
104 	char m_chDecimal;
105 	char m_chRelease;
106 	char m_chSegment;
107 };
108 
// Module object tying the SCLEX_EDIFACT lexer id and the language name "edifact"
// to the LexerEDIFACT::Factory creation function.
LexerModule lmEDIFACT(SCLEX_EDIFACT, LexerEDIFACT::Factory, "edifact");
110 
111 ///////////////////////////////////////////////////////////////////////////////
112 
113 
114 
115 ///////////////////////////////////////////////////////////////////////////////
116 
LexerEDIFACT()117 LexerEDIFACT::LexerEDIFACT()
118 {
119 	m_bFold = false;
120 	m_bHighlightAllUN = false;
121 	m_chComponent = ':';
122 	m_chData = '+';
123 	m_chDecimal = '.';
124 	m_chRelease = '?';
125 	m_chSegment = '\'';
126 }
127 
Lex(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)128 void LexerEDIFACT::Lex(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess)
129 {
130 	Sci_PositionU posFinish = startPos + length;
131 	InitialiseFromUNA(pAccess, posFinish);
132 
133 	// Look backwards for a ' or a document beginning
134 	Sci_PositionU posCurrent = FindPreviousEnd(pAccess, startPos);
135 	// And jump past the ' if this was not the beginning of the document
136 	if (posCurrent != 0)
137 		posCurrent++;
138 
139 	// Style buffer, so we're not issuing loads of notifications
140 	LexAccessor styler (pAccess);
141 	pAccess->StartStyling(posCurrent, '\377');
142 	styler.StartSegment(posCurrent);
143 	Sci_Position posSegmentStart = -1;
144 
145 	while ((posCurrent < posFinish) && (posSegmentStart == -1))
146 	{
147 		posCurrent = ForwardPastWhitespace(pAccess, posCurrent, posFinish);
148 		// Mark whitespace as default
149 		styler.ColourTo(posCurrent - 1, SCE_EDI_DEFAULT);
150 		if (posCurrent >= posFinish)
151 			break;
152 
153 		// Does is start with 3 charaters? ie, UNH
154 		char SegmentHeader[4] = { 0 };
155 		pAccess->GetCharRange(SegmentHeader, posCurrent, 3);
156 
157 		int SegmentStyle = DetectSegmentHeader(SegmentHeader);
158 		if (SegmentStyle == SCE_EDI_BADSEGMENT)
159 			break;
160 		if (SegmentStyle == SCE_EDI_UNA)
161 		{
162 			posCurrent += 9;
163 			styler.ColourTo(posCurrent - 1, SCE_EDI_UNA); // UNA
164 			continue;
165 		}
166 		posSegmentStart = posCurrent;
167 		posCurrent += 3;
168 
169 		styler.ColourTo(posCurrent - 1, SegmentStyle); // UNH etc
170 
171 		// Colour in the rest of the segment
172 		for (char c; posCurrent < posFinish; posCurrent++)
173 		{
174 			pAccess->GetCharRange(&c, posCurrent, 1);
175 
176 			if (c == m_chRelease) // ? escape character, check first, in case of ?'
177 				posCurrent++;
178 			else if (c == m_chSegment) // '
179 			{
180 				// Make sure the whole segment is on one line. styler won't let us go back in time, so we'll settle for marking the ' as bad.
181 				Sci_Position lineSegmentStart = pAccess->LineFromPosition(posSegmentStart);
182 				Sci_Position lineSegmentEnd = pAccess->LineFromPosition(posCurrent);
183 				if (lineSegmentStart == lineSegmentEnd)
184 					styler.ColourTo(posCurrent, SCE_EDI_SEGMENTEND);
185 				else
186 					styler.ColourTo(posCurrent, SCE_EDI_BADSEGMENT);
187 				posSegmentStart = -1;
188 				posCurrent++;
189 				break;
190 			}
191 			else if (c == m_chComponent) // :
192 				styler.ColourTo(posCurrent, SCE_EDI_SEP_COMPOSITE);
193 			else if (c == m_chData) // +
194 				styler.ColourTo(posCurrent, SCE_EDI_SEP_ELEMENT);
195 			else
196 				styler.ColourTo(posCurrent, SCE_EDI_DEFAULT);
197 		}
198 	}
199 	styler.Flush();
200 
201 	if (posSegmentStart == -1)
202 		return;
203 
204 	pAccess->StartStyling(posSegmentStart, -1);
205 	pAccess->SetStyleFor(posFinish - posSegmentStart, SCE_EDI_BADSEGMENT);
206 }
207 
Fold(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)208 void LexerEDIFACT::Fold(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess)
209 {
210 	if (!m_bFold)
211 		return;
212 
213 	Sci_PositionU endPos = startPos + length;
214 	startPos = FindPreviousEnd(pAccess, startPos);
215 	char c;
216 	char SegmentHeader[4] = { 0 };
217 
218 	bool AwaitingSegment = true;
219 	Sci_PositionU currLine = pAccess->LineFromPosition(startPos);
220 	int levelCurrentStyle = SC_FOLDLEVELBASE;
221 	if (currLine > 0)
222 		levelCurrentStyle = pAccess->GetLevel(currLine - 1); // bottom 12 bits are level
223 	int indentCurrent = levelCurrentStyle & SC_FOLDLEVELNUMBERMASK;
224 	int indentNext = indentCurrent;
225 
226 	while (startPos < endPos)
227 	{
228 		pAccess->GetCharRange(&c, startPos, 1);
229 		switch (c)
230 		{
231 		case '\t':
232 		case '\r':
233 		case ' ':
234 			startPos++;
235 			continue;
236 		case '\n':
237 			currLine = pAccess->LineFromPosition(startPos);
238 			pAccess->SetLevel(currLine, levelCurrentStyle | indentCurrent);
239 			startPos++;
240 			levelCurrentStyle = SC_FOLDLEVELBASE;
241 			indentCurrent = indentNext;
242 			continue;
243 		}
244 		if (c == m_chRelease)
245 		{
246 			startPos += 2;
247 			continue;
248 		}
249 		if (c == m_chSegment)
250 		{
251 			AwaitingSegment = true;
252 			startPos++;
253 			continue;
254 		}
255 
256 		if (!AwaitingSegment)
257 		{
258 			startPos++;
259 			continue;
260 		}
261 
262 		// Segment!
263 		pAccess->GetCharRange(SegmentHeader, startPos, 3);
264 		if (SegmentHeader[0] != 'U' || SegmentHeader[1] != 'N')
265 		{
266 			startPos++;
267 			continue;
268 		}
269 
270 		AwaitingSegment = false;
271 		switch (SegmentHeader[2])
272 		{
273 		case 'H':
274 		case 'G':
275 			indentNext++;
276 			levelCurrentStyle = SC_FOLDLEVELBASE | SC_FOLDLEVELHEADERFLAG;
277 			break;
278 
279 		case 'T':
280 		case 'E':
281 			if (indentNext > 0)
282 				indentNext--;
283 			break;
284 		}
285 
286 		startPos += 3;
287 	}
288 }
289 
InitialiseFromUNA(IDocument * pAccess,Sci_PositionU MaxLength)290 Sci_Position LexerEDIFACT::InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength)
291 {
292 	MaxLength -= 9; // drop 9 chars, to give us room for UNA:+.? '
293 
294 	Sci_PositionU startPos = 0;
295 	startPos += ForwardPastWhitespace(pAccess, 0, MaxLength);
296 	if (startPos < MaxLength)
297 	{
298 		char bufUNA[9];
299 		pAccess->GetCharRange(bufUNA, startPos, 9);
300 
301 		// Check it's UNA segment
302 		if (!memcmp(bufUNA, "UNA", 3))
303 		{
304 			m_chComponent = bufUNA[3];
305 			m_chData = bufUNA[4];
306 			m_chDecimal = bufUNA[5];
307 			m_chRelease = bufUNA[6];
308 			// bufUNA [7] should be space - reserved.
309 			m_chSegment = bufUNA[8];
310 
311 			return 0; // success!
312 		}
313 	}
314 
315 	// We failed to find a UNA, so drop to defaults
316 	m_chComponent = ':';
317 	m_chData = '+';
318 	m_chDecimal = '.';
319 	m_chRelease = '?';
320 	m_chSegment = '\'';
321 
322 	return -1;
323 }
324 
ForwardPastWhitespace(IDocument * pAccess,Sci_Position startPos,Sci_Position MaxLength) const325 Sci_Position LexerEDIFACT::ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const
326 {
327 	char c;
328 
329 	while (startPos < MaxLength)
330 	{
331 		pAccess->GetCharRange(&c, startPos, 1);
332 		switch (c)
333 		{
334 		case '\t':
335 		case '\r':
336 		case '\n':
337 		case ' ':
338 			break;
339 		default:
340 			return startPos;
341 		}
342 
343 		startPos++;
344 	}
345 
346 	return MaxLength;
347 }
348 
DetectSegmentHeader(char SegmentHeader[3]) const349 int LexerEDIFACT::DetectSegmentHeader(char SegmentHeader[3]) const
350 {
351 	if (
352 		SegmentHeader[0] < 'A' || SegmentHeader[0] > 'Z' ||
353 		SegmentHeader[1] < 'A' || SegmentHeader[1] > 'Z' ||
354 		SegmentHeader[2] < 'A' || SegmentHeader[2] > 'Z')
355 		return SCE_EDI_BADSEGMENT;
356 
357 	if (!memcmp(SegmentHeader, "UNA", 3))
358 		return SCE_EDI_UNA;
359 
360 	if (m_bHighlightAllUN && !memcmp(SegmentHeader, "UN", 2))
361 		return SCE_EDI_UNH;
362 	else if (!memcmp(SegmentHeader, "UNH", 3))
363 		return SCE_EDI_UNH;
364 	else if (!memcmp(SegmentHeader, "UNG", 3))
365 		return SCE_EDI_UNH;
366 
367 	return SCE_EDI_SEGMENTSTART;
368 }
369 
370 // Look backwards for a ' or a document beginning
FindPreviousEnd(IDocument * pAccess,Sci_Position startPos) const371 Sci_Position LexerEDIFACT::FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const
372 {
373 	for (char c; startPos > 0; startPos--)
374 	{
375 		pAccess->GetCharRange(&c, startPos, 1);
376 		if (c == m_chSegment)
377 			return startPos;
378 	}
379 	// We didn't find a ', so just go with the beginning
380 	return 0;
381 }
382 
383 
384