scintilla/lexers/LexEDIFACT.cpp

// Scintilla Lexer for EDIFACT
// Written by Iain Clarke, IMCSoft & Inobiz AB.
// EDIFACT documented here: https://www.unece.org/cefact/edifact/welcome.html
// and more readably here: https://en.wikipedia.org/wiki/EDIFACT
// This code is subject to the same license terms as the rest of the scintilla project:
// The License.txt file describes the conditions under which this software may be distributed.
//

// Header order must match order in scripts/HeaderOrder.txt
#include <cstdlib>
#include <cassert>
#include <cstring>
#include <cctype>

#include "ILexer.h"
#include "Scintilla.h"
#include "SciLexer.h"

#include "LexAccessor.h"
#include "LexerModule.h"
#include "DefaultLexer.h"

using namespace Scintilla;

class LexerEDIFACT : public DefaultLexer
{
public:
	LexerEDIFACT();
	virtual ~LexerEDIFACT() {} // virtual destructor, as we inherit from ILexer

	static ILexer *Factory() {
		return new LexerEDIFACT;
	}

	int SCI_METHOD Version() const override
	{
		return lvOriginal;
	}
	void SCI_METHOD Release() override
	{
		delete this;
	}

	const char * SCI_METHOD PropertyNames() override
	{
		return "fold\nlexer.edifact.highlight.un.all";
	}
	int SCI_METHOD PropertyType(const char *) override
	{
		return SC_TYPE_BOOLEAN; // Only one property!
	}
	const char * SCI_METHOD DescribeProperty(const char *name) override
	{
		if (!strcmp(name, "fold"))
			return "Whether to apply folding to document or not";
		if (!strcmp(name, "lexer.edifact.highlight.un.all"))
			return "Whether to apply UN* highlighting to all UN segments, or just to UNH";
		return NULL;
	}

	Sci_Position SCI_METHOD PropertySet(const char *key, const char *val) override
	{
		if (!strcmp(key, "fold"))
		{
			m_bFold = strcmp(val, "0") ? true : false;
			return 0;
		}
		if (!strcmp(key, "lexer.edifact.highlight.un.all"))	// GetProperty
		{
			m_bHighlightAllUN = strcmp(val, "0") ? true : false;
			return 0;
		}
		return -1;
	}
	const char * SCI_METHOD DescribeWordListSets() override
	{
		return NULL;
	}
	Sci_Position SCI_METHOD WordListSet(int, const char *) override
	{
		return -1;
	}
	void SCI_METHOD Lex(Sci_PositionU startPos, Sci_Position lengthDoc, int initStyle, IDocument *pAccess) override;
	void SCI_METHOD Fold(Sci_PositionU startPos, Sci_Position lengthDoc, int initStyle, IDocument *pAccess) override;
	void * SCI_METHOD PrivateCall(int, void *) override
	{
		return NULL;
	}

protected:
	Sci_Position InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength);
	Sci_Position FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const;
	Sci_Position ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const;
	int DetectSegmentHeader(char SegmentHeader[3]) const;

	bool m_bFold;

	// property lexer.edifact.highlight.un.all
	//	Set to 0 to highlight only UNA segments, or 1 to highlight all UNx segments.
	bool m_bHighlightAllUN;

	char m_chComponent;
	char m_chData;
	char m_chDecimal;
	char m_chRelease;
	char m_chSegment;
};

// Module object associating the SCLEX_EDIFACT lexer id and the name "edifact"
// with LexerEDIFACT::Factory, which creates the lexer instances.
LexerModule lmEDIFACT(SCLEX_EDIFACT, LexerEDIFACT::Factory, "edifact");

///////////////////////////////////////////////////////////////////////////////


///////////////////////////////////////////////////////////////////////////////

// Starts with folding and "highlight all UN*" switched off and with the
// default service characters : + . ? ' (the same defaults that
// InitialiseFromUNA() falls back to when no UNA segment is found).
LexerEDIFACT::LexerEDIFACT() :
	m_bFold(false),
	m_bHighlightAllUN(false),
	m_chComponent(':'),
	m_chData('+'),
	m_chDecimal('.'),
	m_chRelease('?'),
	m_chSegment('\'')
{
}

// Styles the document range [startPos, startPos + lengthDoc).
//
// The service characters are refreshed from a leading UNA segment first, then
// styling restarts just after the previous segment terminator (or at the
// document start). Each segment is styled as: tag (3 characters), then the
// separators and data up to and including the terminator. A terminator on a
// different line than the segment tag is styled as SCE_EDI_BADSEGMENT.
//
// Everything from an unrecognisable segment tag, or from the start of a
// segment whose terminator was not found, up to the end of the range is
// styled as SCE_EDI_BADSEGMENT.
//
// The third parameter (initial style) is not used.
void LexerEDIFACT::Lex(Sci_PositionU startPos, Sci_Position lengthDoc, int, IDocument *pAccess)
{
	const Sci_PositionU posFinish = startPos + lengthDoc;
	InitialiseFromUNA(pAccess, posFinish);

	// Look backwards for a ' or a document beginning
	Sci_PositionU posCurrent = FindPreviousEnd(pAccess, startPos);
	// And jump past the ' if this was not the beginning of the document
	if (posCurrent != 0)
		posCurrent++;

	// Style buffer, so we're not issuing loads of notifications
	LexAccessor styler (pAccess);
	pAccess->StartStyling(posCurrent, '\377');
	styler.StartSegment(posCurrent);
	// Start of the segment currently being styled, or of the data that turned
	// out to be bad; -1 while we are between segments.
	Sci_Position posSegmentStart = -1;

	while ((posCurrent < posFinish) && (posSegmentStart == -1))
	{
		posCurrent = ForwardPastWhitespace(pAccess, posCurrent, posFinish);
		// Mark whitespace as default
		styler.ColourTo(posCurrent - 1, SCE_EDI_DEFAULT);
		if (posCurrent >= posFinish)
			break;

		// Does it start with 3 characters? ie, UNH
		char SegmentHeader[4] = { 0 };
		pAccess->GetCharRange(SegmentHeader, posCurrent, 3);

		const int SegmentStyle = DetectSegmentHeader(SegmentHeader);
		if (SegmentStyle == SCE_EDI_BADSEGMENT)
		{
			// Remember where the bad data starts, so the code after the loop
			// marks it instead of leaving the rest of the range unstyled.
			posSegmentStart = posCurrent;
			break;
		}
		if (SegmentStyle == SCE_EDI_UNA)
		{
			// The UNA segment has a fixed length of 9 characters, but never
			// style past the end of the requested range.
			posCurrent += 9;
			if (posCurrent > posFinish)
				posCurrent = posFinish;
			styler.ColourTo(posCurrent - 1, SCE_EDI_UNA); // UNA
			continue;
		}
		posSegmentStart = posCurrent;
		posCurrent += 3;

		styler.ColourTo(posCurrent - 1, SegmentStyle); // UNH etc

		// Colour in the rest of the segment
		for (char c = '\0'; posCurrent < posFinish; posCurrent++)
		{
			pAccess->GetCharRange(&c, posCurrent, 1);

			if (c == m_chRelease) // ? escape character, check first, in case of ?'
				posCurrent++; // skip the released character; it is coloured with the following run
			else if (c == m_chSegment) // '
			{
				// Make sure the whole segment is on one line. styler won't let us go back in time, so we'll settle for marking the ' as bad.
				const Sci_Position lineSegmentStart = pAccess->LineFromPosition(posSegmentStart);
				const Sci_Position lineSegmentEnd = pAccess->LineFromPosition(posCurrent);
				if (lineSegmentStart == lineSegmentEnd)
					styler.ColourTo(posCurrent, SCE_EDI_SEGMENTEND);
				else
					styler.ColourTo(posCurrent, SCE_EDI_BADSEGMENT);
				posSegmentStart = -1; // segment finished, carry on with the next one
				posCurrent++;
				break;
			}
			else if (c == m_chComponent) // :
				styler.ColourTo(posCurrent, SCE_EDI_SEP_COMPOSITE);
			else if (c == m_chData) // +
				styler.ColourTo(posCurrent, SCE_EDI_SEP_ELEMENT);
			else
				styler.ColourTo(posCurrent, SCE_EDI_DEFAULT);
		}
	}
	styler.Flush();

	if (posSegmentStart == -1)
		return;

	// We stopped inside a segment (no terminator found) or at an invalid
	// segment tag: restyle from there to the end of the range as bad.
	pAccess->StartStyling(posSegmentStart, -1);
	pAccess->SetStyleFor(posFinish - posSegmentStart, SCE_EDI_BADSEGMENT);
}

// Assigns fold levels to the lines touched by [startPos, startPos + lengthDoc).
// Does nothing unless the "fold" property is set.
//
// Lines starting with UNH become fold headers at the base level, lines
// starting with UNA, UNB or UNZ sit at the base level, and every other data
// line is one level deeper. Whitespace-only lines get the white flag combined
// with the level of the previous data line.
//
// The third parameter (initial style) is not used.
void LexerEDIFACT::Fold(Sci_PositionU startPos, Sci_Position lengthDoc, int, IDocument *pAccess)
{
	if (!m_bFold)
		return;

	// Fold at UNx lines. ie, UNx segments = 0, other segments = 1.
	// There's no sub folding, so we can be quite simple.
	const Sci_Position endPos = startPos + lengthDoc;
	const Sci_Position docLength = pAccess->Length();

	int iIndentPrevious = 0;
	const Sci_Position lineLast = pAccess->LineFromPosition(endPos);

	for (Sci_Position lineCurrent = pAccess->LineFromPosition(startPos); lineCurrent <= lineLast; lineCurrent++)
	{
		Sci_Position posLineStart = pAccess->LineStart(lineCurrent);
		posLineStart = ForwardPastWhitespace(pAccess, posLineStart, endPos);
		const Sci_Position lineDataStart = pAccess->LineFromPosition(posLineStart);
		// Fill in whitespace lines?
		for (; lineCurrent < lineDataStart; lineCurrent++)
			pAccess->SetLevel(lineCurrent, SC_FOLDLEVELBASE | SC_FOLDLEVELWHITEFLAG | iIndentPrevious);

		if (posLineStart >= docLength)
		{
			// Only whitespace is left up to the end of the document, so there
			// is no segment tag to read: treat this as a whitespace line too.
			pAccess->SetLevel(lineCurrent, SC_FOLDLEVELBASE | SC_FOLDLEVELWHITEFLAG | iIndentPrevious);
			continue;
		}

		// Cleared for every line, so a short read near the end of the document
		// can never be classified using the previous line's tag.
		char SegmentHeader[4] = { 0 };
		pAccess->GetCharRange(SegmentHeader, posLineStart, 3);
		//if (DetectSegmentHeader(SegmentHeader) == SCE_EDI_BADSEGMENT) // Abort if this is not a proper segment header

		int level = 0;
		if (memcmp(SegmentHeader, "UNH", 3) == 0) // UNH starts blocks
			level = SC_FOLDLEVELBASE | SC_FOLDLEVELHEADERFLAG;
		// Check for UNA,B and Z. All others are inside messages
		else if (!memcmp(SegmentHeader, "UNA", 3) || !memcmp(SegmentHeader, "UNB", 3) || !memcmp(SegmentHeader, "UNZ", 3))
			level = SC_FOLDLEVELBASE;
		else
			level = SC_FOLDLEVELBASE | 1;
		pAccess->SetLevel(lineCurrent, level);
		// Remembered without the flag bits, for following whitespace lines
		iIndentPrevious = level & SC_FOLDLEVELNUMBERMASK;
	}
}

// Sets the service characters from a UNA segment ("UNA:+.? '") at the start
// of the document, skipping leading whitespace.
//
// MaxLength is the position up to which the document may be examined; the
// whole 9 character UNA segment has to end at or before it.
//
// Returns 0 when a UNA segment was found and used, or -1 when none was found,
// in which case the default characters : + . ? ' are restored.
Sci_Position LexerEDIFACT::InitialiseFromUNA(IDocument *pAccess, Sci_PositionU MaxLength)
{
	const Sci_PositionU lengthUNA = 9; // room needed for UNA:+.? '

	// Only look when a complete UNA segment can fit. Checking first also keeps
	// the unsigned subtraction below from wrapping around for short ranges.
	if (MaxLength >= lengthUNA)
	{
		// Last position at which a complete UNA segment could still start
		const Sci_PositionU lastStart = MaxLength - lengthUNA;
		// ForwardPastWhitespace never returns more than its limit, so the
		// 9 characters read below always end at or before MaxLength.
		const Sci_PositionU startPos = static_cast<Sci_PositionU>(ForwardPastWhitespace(pAccess, 0, lastStart));

		char bufUNA[9] = { 0 };
		pAccess->GetCharRange(bufUNA, startPos, 9);

		// Check it's UNA segment
		if (!memcmp(bufUNA, "UNA", 3))
		{
			m_chComponent = bufUNA[3];
			m_chData = bufUNA[4];
			m_chDecimal = bufUNA[5];
			m_chRelease = bufUNA[6];
			// bufUNA [7] should be space - reserved.
			m_chSegment = bufUNA[8];

			return 0; // success!
		}
	}

	// We failed to find a UNA, so drop to defaults
	m_chComponent = ':';
	m_chData = '+';
	m_chDecimal = '.';
	m_chRelease = '?';
	m_chSegment = '\'';

	return -1;
}

// Returns the position of the first character at or after startPos that is
// not a tab, carriage return, line feed or space. The search stops at
// MaxLength, which is returned when no such character is found before it.
Sci_Position LexerEDIFACT::ForwardPastWhitespace(IDocument *pAccess, Sci_Position startPos, Sci_Position MaxLength) const
{
	char ch;

	for (Sci_Position pos = startPos; pos < MaxLength; pos++)
	{
		pAccess->GetCharRange(&ch, pos, 1);

		const bool isBlank = (ch == ' ') || (ch == '\t') || (ch == '\r') || (ch == '\n');
		if (!isBlank)
			return pos;
	}

	// Nothing but whitespace before the limit
	return MaxLength;
}

// Classifies a three character segment tag:
//  - SCE_EDI_BADSEGMENT   if any character is outside 'A'..'Z'
//  - SCE_EDI_UNA          for "UNA"
//  - SCE_EDI_UNH          for "UNH", or for any "UN?" tag when the
//                         lexer.edifact.highlight.un.all property is set
//  - SCE_EDI_SEGMENTSTART for every other tag
int LexerEDIFACT::DetectSegmentHeader(char SegmentHeader[3]) const
{
	for (int i = 0; i < 3; i++)
	{
		const char ch = SegmentHeader[i];
		if (ch < 'A' || ch > 'Z')
			return SCE_EDI_BADSEGMENT;
	}

	const bool startsWithUN = memcmp(SegmentHeader, "UN", 2) == 0;
	if (startsWithUN)
	{
		const char chLast = SegmentHeader[2];
		if (chLast == 'A')
			return SCE_EDI_UNA;
		if (m_bHighlightAllUN || chLast == 'H')
			return SCE_EDI_UNH;
	}

	return SCE_EDI_SEGMENTSTART;
}

// Look backwards from startPos for a segment terminator (') or the document
// beginning. Returns the position of the terminator, or 0 if none was found
// (position 0 itself is never examined).
//
// A terminator preceded by an odd number of release characters (?) is
// escaped data, not the end of a segment, so it is skipped. This matches the
// forward scan in Lex(), which skips the character following a release
// character.
Sci_Position LexerEDIFACT::FindPreviousEnd(IDocument *pAccess, Sci_Position startPos) const
{
	for (; startPos > 0; startPos--)
	{
		char c = '\0';
		pAccess->GetCharRange(&c, startPos, 1);
		if (c != m_chSegment)
			continue;

		// Count the release characters immediately in front of the candidate.
		// Pairs of them are escaped release characters; a single one left over
		// escapes the terminator itself.
		Sci_Position countRelease = 0;
		for (Sci_Position posPrev = startPos - 1; posPrev >= 0; posPrev--)
		{
			char chPrev = '\0';
			pAccess->GetCharRange(&chPrev, posPrev, 1);
			if (chPrev != m_chRelease)
				break;
			countRelease++;
		}

		if ((countRelease % 2) == 0)
			return startPos;
	}
	// We didn't find a ', so just go with the beginning
	return 0;
}