1 // Scintilla Lexer for EDIFACT
2 // Written by Iain Clarke, IMCSoft & Inobiz AB.
3 // EDIFACT documented here: https://www.unece.org/cefact/edifact/welcome.html
4 // and more readably here: https://en.wikipedia.org/wiki/EDIFACT
5 // This code is subject to the same license terms as the rest of the scintilla project:
6 // The License.txt file describes the conditions under which this software may be distributed.
7 //
8
9 // Header order must match order in scripts/HeaderOrder.txt
10 #include <cstdlib>
11 #include <cassert>
12 #include <cstring>
13 #include <cctype>
14
15 #include "ILexer.h"
16 #include "Scintilla.h"
17 #include "SciLexer.h"
18
19 #include "LexAccessor.h"
20 #include "LexerModule.h"
21 #include "DefaultLexer.h"
22
23 using namespace Scintilla;
24
25 class LexerEDIFACT : public DefaultLexer
26 {
27 public:
28 LexerEDIFACT();
~LexerEDIFACT()29 virtual ~LexerEDIFACT() {} // virtual destructor, as we inherit from ILexer
30
Factory()31 static ILexer *Factory() {
32 return new LexerEDIFACT;
33 }
34
Version() const35 int SCI_METHOD Version() const override
36 {
37 return lvOriginal;
38 }
Release()39 void SCI_METHOD Release() override
40 {
41 delete this;
42 }
43
PropertyNames()44 const char * SCI_METHOD PropertyNames() override
45 {
46 return "fold\nlexer.edifact.highlight.un.all";
47 }
PropertyType(const char *)48 int SCI_METHOD PropertyType(const char *) override
49 {
50 return SC_TYPE_BOOLEAN; // Only one property!
51 }
DescribeProperty(const char * name)52 const char * SCI_METHOD DescribeProperty(const char *name) override
53 {
54 if (!strcmp(name, "fold"))
55 return "Whether to apply folding to document or not";
56 if (!strcmp(name, "lexer.edifact.highlight.un.all"))
57 return "Whether to apply UN* highlighting to all UN segments, or just to UNH";
58 return NULL;
59 }
60
PropertySet(const char * key,const char * val)61 Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override
62 {
63 if (!strcmp(key, "fold"))
64 {
65 m_bFold = strcmp(val, "0") ? true : false;
66 return 0;
67 }
68 if (!strcmp(key, "lexer.edifact.highlight.un.all")) // GetProperty
69 {
70 m_bHighlightAllUN = strcmp(val, "0") ? true : false;
71 return 0;
72 }
73 return -1;
74 }
DescribeWordListSets()75 const char * SCI_METHOD DescribeWordListSets() override
76 {
77 return NULL;
78 }
WordListSet(int,const char *)79 Sci_Position SCI_METHOD WordListSet(int, const char *) override
80 {
81 return -1;
82 }
83 void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
84 void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position length, int initStyle, IDocument *pAccess) override;
PrivateCall(int,void *)85 void * SCI_METHOD PrivateCall(int, void *) override
86 {
87 return NULL;
88 }
89
90 protected:
91 Sci_Position InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength);
92 Sci_Position FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const;
93 Sci_Position ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const;
94 int DetectSegmentHeader(char SegmentHeader[3]) const;
95
96 bool m_bFold;
97
98 // property lexer.edifact.highlight.un.all
99 // Set to 0 to highlight only UNA segments, or 1 to highlight all UNx segments.
100 bool m_bHighlightAllUN;
101
102 char m_chComponent;
103 char m_chData;
104 char m_chDecimal;
105 char m_chRelease;
106 char m_chSegment;
107 };
108
109 LexerModule lmEDIFACT(SCLEX_EDIFACT, LexerEDIFACT::Factory, "edifact");
110
111 ///////////////////////////////////////////////////////////////////////////////
112
113
114
115 ///////////////////////////////////////////////////////////////////////////////
116
LexerEDIFACT()117 LexerEDIFACT::LexerEDIFACT()
118 {
119 m_bFold = false;
120 m_bHighlightAllUN = false;
121 m_chComponent = ':';
122 m_chData = '+';
123 m_chDecimal = '.';
124 m_chRelease = '?';
125 m_chSegment = '\'';
126 }
127
Lex(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)128 void LexerEDIFACT::Lex(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess)
129 {
130 Sci_PositionU posFinish = startPos + length;
131 InitialiseFromUNA(pAccess, posFinish);
132
133 // Look backwards for a ' or a document beginning
134 Sci_PositionU posCurrent = FindPreviousEnd(pAccess, startPos);
135 // And jump past the ' if this was not the beginning of the document
136 if (posCurrent != 0)
137 posCurrent++;
138
139 // Style buffer, so we're not issuing loads of notifications
140 LexAccessor styler (pAccess);
141 pAccess->StartStyling(posCurrent, '\377');
142 styler.StartSegment(posCurrent);
143 Sci_Position posSegmentStart = -1;
144
145 while ((posCurrent < posFinish) && (posSegmentStart == -1))
146 {
147 posCurrent = ForwardPastWhitespace(pAccess, posCurrent, posFinish);
148 // Mark whitespace as default
149 styler.ColourTo(posCurrent - 1, SCE_EDI_DEFAULT);
150 if (posCurrent >= posFinish)
151 break;
152
153 // Does is start with 3 charaters? ie, UNH
154 char SegmentHeader[4] = { 0 };
155 pAccess->GetCharRange(SegmentHeader, posCurrent, 3);
156
157 int SegmentStyle = DetectSegmentHeader(SegmentHeader);
158 if (SegmentStyle == SCE_EDI_BADSEGMENT)
159 break;
160 if (SegmentStyle == SCE_EDI_UNA)
161 {
162 posCurrent += 9;
163 styler.ColourTo(posCurrent - 1, SCE_EDI_UNA); // UNA
164 continue;
165 }
166 posSegmentStart = posCurrent;
167 posCurrent += 3;
168
169 styler.ColourTo(posCurrent - 1, SegmentStyle); // UNH etc
170
171 // Colour in the rest of the segment
172 for (char c; posCurrent < posFinish; posCurrent++)
173 {
174 pAccess->GetCharRange(&c, posCurrent, 1);
175
176 if (c == m_chRelease) // ? escape character, check first, in case of ?'
177 posCurrent++;
178 else if (c == m_chSegment) // '
179 {
180 // Make sure the whole segment is on one line. styler won't let us go back in time, so we'll settle for marking the ' as bad.
181 Sci_Position lineSegmentStart = pAccess->LineFromPosition(posSegmentStart);
182 Sci_Position lineSegmentEnd = pAccess->LineFromPosition(posCurrent);
183 if (lineSegmentStart == lineSegmentEnd)
184 styler.ColourTo(posCurrent, SCE_EDI_SEGMENTEND);
185 else
186 styler.ColourTo(posCurrent, SCE_EDI_BADSEGMENT);
187 posSegmentStart = -1;
188 posCurrent++;
189 break;
190 }
191 else if (c == m_chComponent) // :
192 styler.ColourTo(posCurrent, SCE_EDI_SEP_COMPOSITE);
193 else if (c == m_chData) // +
194 styler.ColourTo(posCurrent, SCE_EDI_SEP_ELEMENT);
195 else
196 styler.ColourTo(posCurrent, SCE_EDI_DEFAULT);
197 }
198 }
199 styler.Flush();
200
201 if (posSegmentStart == -1)
202 return;
203
204 pAccess->StartStyling(posSegmentStart, -1);
205 pAccess->SetStyleFor(posFinish - posSegmentStart, SCE_EDI_BADSEGMENT);
206 }
207
Fold(Sci_PositionU startPos,Sci_Position length,int,IDocument * pAccess)208 void LexerEDIFACT::Fold(Sci_PositionU startPos, Sci_Position length, int, IDocument *pAccess)
209 {
210 if (!m_bFold)
211 return;
212
213 Sci_PositionU endPos = startPos + length;
214 startPos = FindPreviousEnd(pAccess, startPos);
215 char c;
216 char SegmentHeader[4] = { 0 };
217
218 bool AwaitingSegment = true;
219 Sci_PositionU currLine = pAccess->LineFromPosition(startPos);
220 int levelCurrentStyle = SC_FOLDLEVELBASE;
221 if (currLine > 0)
222 levelCurrentStyle = pAccess->GetLevel(currLine - 1); // bottom 12 bits are level
223 int indentCurrent = levelCurrentStyle & SC_FOLDLEVELNUMBERMASK;
224 int indentNext = indentCurrent;
225
226 while (startPos < endPos)
227 {
228 pAccess->GetCharRange(&c, startPos, 1);
229 switch (c)
230 {
231 case '\t':
232 case '\r':
233 case ' ':
234 startPos++;
235 continue;
236 case '\n':
237 currLine = pAccess->LineFromPosition(startPos);
238 pAccess->SetLevel(currLine, levelCurrentStyle | indentCurrent);
239 startPos++;
240 levelCurrentStyle = SC_FOLDLEVELBASE;
241 indentCurrent = indentNext;
242 continue;
243 }
244 if (c == m_chRelease)
245 {
246 startPos += 2;
247 continue;
248 }
249 if (c == m_chSegment)
250 {
251 AwaitingSegment = true;
252 startPos++;
253 continue;
254 }
255
256 if (!AwaitingSegment)
257 {
258 startPos++;
259 continue;
260 }
261
262 // Segment!
263 pAccess->GetCharRange(SegmentHeader, startPos, 3);
264 if (SegmentHeader[0] != 'U' || SegmentHeader[1] != 'N')
265 {
266 startPos++;
267 continue;
268 }
269
270 AwaitingSegment = false;
271 switch (SegmentHeader[2])
272 {
273 case 'H':
274 case 'G':
275 indentNext++;
276 levelCurrentStyle = SC_FOLDLEVELBASE | SC_FOLDLEVELHEADERFLAG;
277 break;
278
279 case 'T':
280 case 'E':
281 if (indentNext > 0)
282 indentNext--;
283 break;
284 }
285
286 startPos += 3;
287 }
288 }
289
InitialiseFromUNA(IDocument * pAccess,Sci_PositionU MaxLength)290 Sci_Position LexerEDIFACT::InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength)
291 {
292 MaxLength -= 9; // drop 9 chars, to give us room for UNA:+.? '
293
294 Sci_PositionU startPos = 0;
295 startPos += ForwardPastWhitespace(pAccess, 0, MaxLength);
296 if (startPos < MaxLength)
297 {
298 char bufUNA[9];
299 pAccess->GetCharRange(bufUNA, startPos, 9);
300
301 // Check it's UNA segment
302 if (!memcmp(bufUNA, "UNA", 3))
303 {
304 m_chComponent = bufUNA[3];
305 m_chData = bufUNA[4];
306 m_chDecimal = bufUNA[5];
307 m_chRelease = bufUNA[6];
308 // bufUNA [7] should be space - reserved.
309 m_chSegment = bufUNA[8];
310
311 return 0; // success!
312 }
313 }
314
315 // We failed to find a UNA, so drop to defaults
316 m_chComponent = ':';
317 m_chData = '+';
318 m_chDecimal = '.';
319 m_chRelease = '?';
320 m_chSegment = '\'';
321
322 return -1;
323 }
324
ForwardPastWhitespace(IDocument * pAccess,Sci_Position startPos,Sci_Position MaxLength) const325 Sci_Position LexerEDIFACT::ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const
326 {
327 char c;
328
329 while (startPos < MaxLength)
330 {
331 pAccess->GetCharRange(&c, startPos, 1);
332 switch (c)
333 {
334 case '\t':
335 case '\r':
336 case '\n':
337 case ' ':
338 break;
339 default:
340 return startPos;
341 }
342
343 startPos++;
344 }
345
346 return MaxLength;
347 }
348
DetectSegmentHeader(char SegmentHeader[3]) const349 int LexerEDIFACT::DetectSegmentHeader(char SegmentHeader[3]) const
350 {
351 if (
352 SegmentHeader[0] < 'A' || SegmentHeader[0] > 'Z' ||
353 SegmentHeader[1] < 'A' || SegmentHeader[1] > 'Z' ||
354 SegmentHeader[2] < 'A' || SegmentHeader[2] > 'Z')
355 return SCE_EDI_BADSEGMENT;
356
357 if (!memcmp(SegmentHeader, "UNA", 3))
358 return SCE_EDI_UNA;
359
360 if (m_bHighlightAllUN && !memcmp(SegmentHeader, "UN", 2))
361 return SCE_EDI_UNH;
362 else if (!memcmp(SegmentHeader, "UNH", 3))
363 return SCE_EDI_UNH;
364 else if (!memcmp(SegmentHeader, "UNG", 3))
365 return SCE_EDI_UNH;
366
367 return SCE_EDI_SEGMENTSTART;
368 }
369
370 // Look backwards for a ' or a document beginning
FindPreviousEnd(IDocument * pAccess,Sci_Position startPos) const371 Sci_Position LexerEDIFACT::FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const
372 {
373 for (char c; startPos > 0; startPos--)
374 {
375 pAccess->GetCharRange(&c, startPos, 1);
376 if (c == m_chSegment)
377 return startPos;
378 }
379 // We didn't find a ', so just go with the beginning
380 return 0;
381 }
382
383
384