math/cocoR/Scanner.cpp

/***************************************************************************
 *  aGrUM modified frames and atg files for cocoR
 *   Copyright (c) 2005-2021 by Christophe GONZALES(_at_AMU) and Pierre-Henri WUILLEMIN(_at_LIP6)  *
 *   info_at_agrum_dot_org
***************************************************************************/
/*----------------------------------------------------------------------
Compiler Generator Coco/R,
Copyright (c) 1990, 2004 Hanspeter Moessenboeck, University of Linz
extended by M. Loeberbauer & A. Woess, Univ. of Linz
ported to C++ by Csaba Balazs, University of Szeged
with improvements by Pat Terry, Rhodes University

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

As an exception, it is allowed to write an extension of Coco/R that is
used as a plugin in non-free software.

If not otherwise stated, any source code generated by Coco/R (other than
Coco/R itself) does not fall under the GNU General Public License.
-----------------------------------------------------------------------*/


#include <memory.h>
#include <string.h>
#include "Scanner.h"
#include <agrum/tools/core/cocoR/common.h>

   namespace gum {
namespace formula {


// Creates an empty token: every numeric field is zeroed and the token owns
// neither a text value nor a successor.
Token::Token() {
  kind    = 0;
  pos     = 0;
  charPos = 0;   // was left uninitialised although the scanner fills it per token
  col     = 0;
  line    = 0;
  val     = nullptr;
  next    = nullptr;
}

// Hands the token text back to the coco string helper for disposal.
Token::~Token() { coco_string_delete(val); }

// Builds a buffer on top of a C stream.
//   s            : stream to read from
//   isUserStream : when true the stream is never closed by this buffer
// For a seekable stream the file length is measured and the buffer is primed
// at position 0; for a non-seekable one the buffer starts empty and grows as
// chunks are read.
Buffer::Buffer( FILE* s, bool isUserStream ) {
// ensure binary read on windows
#if _MSC_VER >= 1300
  _setmode( _fileno( s ), _O_BINARY );
#endif
  this->stream       = s;
  this->isUserStream = isUserStream;

  if ( !CanSeek() ) {
    fileLen  = 0;
    bufLen   = 0;
    bufStart = 0;
  } else {
    // measure the file, then rewind
    fseek( s, 0, SEEK_END );
    fileLen = ftell( s );
    fseek( s, 0, SEEK_SET );

    if ( fileLen < MAX_BUFFER_LENGTH ) {
      bufLen = fileLen;
    } else {
      bufLen = MAX_BUFFER_LENGTH;
    }

    bufStart = INT_MAX;   // nothing in the buffer so far
  }

  if ( bufLen > 0 ) {
    bufCapacity = bufLen;
  } else {
    bufCapacity = MIN_BUFFER_LENGTH;
  }

  buf = new unsigned char[bufCapacity];

  if ( fileLen > 0 ) {
    SetPos( 0 );   // setup buffer to position 0 (start)
  } else {
    bufPos = 0;    // index 0 is already after the file, thus Pos = 0 is invalid
  }

  // the whole file fits in the buffer: the stream is not needed any more
  if ( bufLen == fileLen && CanSeek() ) {
    Close();
  }
}

// Takes over the state of another buffer: the byte array and the stream are
// transferred, and the source is left without either of them.
Buffer::Buffer( Buffer* b ) {
  // plain bookkeeping fields are copied
  bufCapacity  = b->bufCapacity;
  bufStart     = b->bufStart;
  bufLen       = b->bufLen;
  bufPos       = b->bufPos;
  fileLen      = b->fileLen;
  isUserStream = b->isUserStream;

  // owned resources are moved out of the source
  buf    = b->buf;
  stream = b->stream;
  b->buf    = nullptr;
  b->stream = nullptr;
}

// Builds a stream-less buffer holding a private copy of `len` bytes taken
// from `buf`.
Buffer::Buffer( const unsigned char* buf, int len ) {
  unsigned char* copy = new unsigned char[len];
  memcpy( copy, buf, len * sizeof( unsigned char ) );

  this->buf          = copy;
  this->isUserStream = false;
  stream             = nullptr;

  bufCapacity = len;
  bufLen      = len;
  fileLen     = len;
  bufStart    = 0;
  bufPos      = 0;
}

// Closes the stream (when this buffer is allowed to) and frees the byte array.
Buffer::~Buffer() {
  Close();

  delete [] buf;   // deleting a null pointer is a no-op
  buf = nullptr;
}

// Closes the underlying stream unless it belongs to the user or has already
// been released.
void Buffer::Close() {
  if ( isUserStream || stream == nullptr ) return;

  fclose( stream );
  stream = nullptr;
}

// Returns the current read position as a percentage of the known file length.
// When no length is known yet (empty input, or nothing read so far) 0 is
// returned instead of dividing by zero and casting a non-finite value to int.
int Buffer::GetPercent() {
  if ( fileLen <= 0 ) return 0;

  return ( int )( ( 100.0*GetPos() )/fileLen );
}

// Returns the next byte and advances the position, or EoF when no more data
// can be obtained.
int Buffer::Read() {
  // fast path: the byte is already buffered
  if ( bufPos < bufLen ) {
    return buf[bufPos++];
  }

  // still inside the file: swap the right window into the buffer
  if ( GetPos() < fileLen ) {
    SetPos( GetPos() );   // shift buffer start to Pos
    return buf[bufPos++];
  }

  // non-seekable stream: try to pull one more chunk
  if ( ( stream != nullptr ) && !CanSeek() && ( ReadNextStreamChunk() > 0 ) ) {
    return buf[bufPos++];
  }

  return EoF;
}

// Returns what Read() would deliver next, then restores the read position.
int Buffer::Peek() {
  const int mark     = GetPos();
  const int upcoming = Read();
  SetPos( mark );
  return upcoming;
}

// beg .. begin, zero-based, inclusive, in byte
// end .. end, zero-based, exclusive, in byte
wchar_t* Buffer::GetString( int beg, int end ) {
  int len = 0;
  wchar_t* buf = new wchar_t[end - beg];
  int oldPos = GetPos();
  SetPos( beg );

  while ( GetPos() < end ) buf[len++] = ( wchar_t ) Read();

  SetPos( oldPos );
  wchar_t* res = coco_string_create( buf, 0, len );
  coco_string_delete( buf );
  return res;
}

// Absolute read position: start of the buffered window plus the offset in it.
int Buffer::GetPos() { return bufStart + bufPos; }

// Moves the read position to the absolute byte offset `value`.
// A position outside [0, fileLen] is reported and terminates the program.
void Buffer::SetPos( int value ) {
  if ( ( value >= fileLen ) && ( stream != nullptr ) && !CanSeek() ) {
    // The target lies beyond what has been read from a stream that cannot
    // seek (e.g. network or console): keep pulling chunks until the target
    // is covered or the stream is exhausted.
    while ( value >= fileLen ) {
      if ( ReadNextStreamChunk() <= 0 ) break;
    }
  }

  const bool outOfBounds = ( value < 0 ) || ( value > fileLen );

  if ( outOfBounds ) {
    wprintf( L"--- buffer out of bounds access, position: %d\n", value );
    exit( 1 );
  }

  const bool inWindow = ( value >= bufStart ) && ( value < ( bufStart + bufLen ) );

  if ( inWindow ) {
    // already in buffer
    bufPos = value - bufStart;
    return;
  }

  if ( stream == nullptr ) {
    // nothing to swap in: make Pos return fileLen
    bufPos = fileLen - bufStart;
    return;
  }

  // must be swapped in
  fseek( stream, value, SEEK_SET );
  bufLen = (int)fread( buf, int(sizeof( unsigned char )), bufCapacity, stream );
  bufStart = value;
  bufPos   = 0;
}

// Appends the next chunk of stream bytes to the buffer, doubling the buffer
// when it is full, and updates fileLen and bufLen accordingly.
// Returns the number of bytes read (0 once the stream is exhausted).
int Buffer::ReadNextStreamChunk() {
  int room = bufCapacity - bufLen;

  if ( room == 0 ) {
    // A growing input stream can neither be seeked nor measured in advance,
    // so the buffer has to be enlarged on demand.
    bufCapacity = bufLen * 2;
    unsigned char* grown = new unsigned char[bufCapacity];
    memcpy( grown, buf, bufLen*sizeof( unsigned char ) );
    delete [] buf;
    buf  = grown;
    room = bufLen;
  }

  const int got = (int)fread( buf + bufLen, int(sizeof( unsigned char )), room, stream );

  if ( got <= 0 ) {
    // end of stream reached
    return 0;
  }

  bufLen += got;
  fileLen = bufLen;
  return got;
}

// A stream is considered seekable when it exists and ftell() succeeds on it.
bool Buffer::CanSeek() {
  if ( stream == nullptr ) return false;

  return ftell( stream ) != -1;
}

// Reads one UTF-8 encoded character and returns its code point (or EoF).
// Bytes that cannot start a sequence are skipped first; the payload bits of
// the lead byte and of each following byte are then concatenated.
int UTF8Buffer::Read() {
  // skip until a utf8 start byte (0xxxxxxx or 11xxxxxx) or the end of input
  int lead = Buffer::Read();

  while ( ( lead >= 128 ) && ( ( lead & 0xC0 ) != 0xC0 ) && ( lead != EoF ) ) {
    lead = Buffer::Read();
  }

  // 0xxxxxxx or end of file: the first 127 chars are identical in ascii/utf8
  if ( lead < 128 || lead == EoF ) {
    return lead;
  }

  if ( ( lead & 0xF0 ) == 0xF0 ) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    const int b1 = lead & 0x07;
    const int b2 = Buffer::Read() & 0x3F;
    const int b3 = Buffer::Read() & 0x3F;
    const int b4 = Buffer::Read() & 0x3F;
    return ( ( ( ( ( b1 << 6 ) | b2 ) << 6 ) | b3 ) << 6 ) | b4;
  }

  if ( ( lead & 0xE0 ) == 0xE0 ) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    const int b1 = lead & 0x0F;
    const int b2 = Buffer::Read() & 0x3F;
    const int b3 = Buffer::Read() & 0x3F;
    return ( ( ( b1 << 6 ) | b2 ) << 6 ) | b3;
  }

  if ( ( lead & 0xC0 ) == 0xC0 ) {
    // 110xxxxx 10xxxxxx
    const int b1 = lead & 0x1F;
    const int b2 = Buffer::Read() & 0x3F;
    return ( b1 << 6 ) | b2;
  }

  return lead;
}

// Scans an in-memory byte array of `len` bytes; `filename` is only recorded
// (in widened form) and `trace` is stored as the trace flag.
Scanner::Scanner(const unsigned char* buf, int len, std::string filename, bool trace) {
  _trace_     = trace;
  _filenamne_ = widen( filename.c_str() );
  buffer      = new Buffer( buf, len );
  Init();
}

// Scans the file whose narrow-character name is given; the name is widened
// and handed to Load().
Scanner::Scanner( const char* fileName,bool trace ) {
  const auto wideName = widen( std::string( fileName ) );
  Load( wideName.c_str() );
  _trace_ = trace;
}

// Scans the file whose wide-character name is given.
Scanner::Scanner( const wchar_t* fileName,bool trace ) {
  this->Load( fileName );
  this->_trace_ = trace;
}

// Opens `fileName` for binary reading, wraps it in a Buffer owned by the
// scanner, records the file name and initialises the scanner.
// Raises gum::IOError (through GUM_ERROR) when the file cannot be opened.
void Scanner::Load( const wchar_t* fileName ) {
  char* chFileName = coco_string_create_char( fileName );
  FILE* stream     = fopen( chFileName, "rb" );

  if ( stream == nullptr ) {
    std::string s( "No such file : " ); s+=chFileName;
    // release the converted name before raising, otherwise it would leak
    coco_string_delete( chFileName );
    GUM_ERROR( gum::IOError,s )
  }

  coco_string_delete( chFileName );
  buffer = new Buffer( stream, false );
   _filenamne_=std::wstring( fileName );
  Init();
}

// Scans an already opened stream. The stream is flagged as user-owned, so the
// buffer never closes it.
Scanner::Scanner( FILE* s,bool trace ) {
  _filenamne_ = L"FILE";
  buffer      = new Buffer( s, true );
  Init();
  _trace_ = trace;
}

// Frees every block of the token heap (each block stores the address of its
// successor right after its HEAP_BLOCK_SIZE payload), then the token text
// buffer and the input buffer.
Scanner::~Scanner() {
  char* block = ( char* ) firstHeap;

  while ( block != nullptr ) {
    char* following = *( char** )( block + HEAP_BLOCK_SIZE );
    free( block );
    firstHeap = following;
    block     = following;
  }

  delete [] tval;   // no-op on a null pointer
  delete buffer;    // no-op on a null pointer
}

// Prepares the scanner for use: sets the generated limits and the start-state
// table, allocates the token text buffer and the first token heap block,
// reads the first character, handles an optional UTF-8 byte order mark and
// creates the dummy token that heads the token list.
void Scanner::Init() {
  percent=-1; // lower than any value GetPercent() reports for a non-empty input
  EOL    = '\n';
  eofSym = 0;
  	maxT = 10;
	noSym = 10;
	int i;
	// start.set(c, s): character code c is associated with start state s
	for (i = 48; i <= 57; ++i) start.set(i, 7); // '0'..'9'
	for (i = 65; i <= 90; ++i) start.set(i, 6); // 'A'..'Z'
	for (i = 95; i <= 95; ++i) start.set(i, 6); // '_'
	for (i = 97; i <= 122; ++i) start.set(i, 6); // 'a'..'z'
	start.set(43, 8); // '+'
	start.set(45, 9); // '-'
	for (i = 42; i <= 42; ++i) start.set(i, 4); // '*'
	for (i = 47; i <= 47; ++i) start.set(i, 4); // '/'
	for (i = 60; i <= 60; ++i) start.set(i, 4); // '<'
	for (i = 62; i <= 62; ++i) start.set(i, 4); // '>'
	for (i = 94; i <= 94; ++i) start.set(i, 4); // '^'
	start.set(10, 5); // line feed
	start.set(40, 13); // '('
	start.set(41, 14); // ')'
	start.set(44, 15); // ','
		start.set(Buffer::EoF, -1);


  tvalLength = 128;
  tval = new wchar_t[tvalLength]; // text of current token

  // HEAP_BLOCK_SIZE byte heap + pointer to next heap block
  heap = malloc( HEAP_BLOCK_SIZE + sizeof( void* ) );
  firstHeap = heap;
  // heapEnd addresses the "next block" slot located right after the payload
  heapEnd = ( void** )( ( ( char* ) heap ) + HEAP_BLOCK_SIZE );
  *heapEnd = 0;
  heapTop = heap;

  // a block must be able to hold at least one Token
  if ( sizeof( Token ) > HEAP_BLOCK_SIZE ) {
    wprintf( L"--- Too small HEAP_BLOCK_SIZE\n" );
    exit( 1 );
  }

  pos = -1; line = 1; col = 0; charPos = -1;
  oldEols = 0;
  NextCh(); // load the first character into ch

  if ( ch == 0xEF ) { // check optional byte order mark for UTF-8
    NextCh(); int ch1 = ch;
    NextCh(); int ch2 = ch;

    // the complete mark is EF BB BF; anything else is rejected
    if ( ch1 != 0xBB || ch2 != 0xBF ) {
      wprintf( L"Illegal byte order mark at start of file" );
      exit( 1 );
    }

    // continue reading through a UTF8Buffer built from the current buffer,
    // and restart column / character counting after the mark
    Buffer* oldBuf = buffer;
    buffer = new UTF8Buffer( buffer ); col = 0; charPos = -1;
    delete oldBuf; oldBuf = nullptr;
    NextCh();
  }


  pt = tokens = CreateToken(); // first token is a dummy
}

// Loads the next input character into `ch` and keeps pos, col, charPos and
// line up to date. Progress is reported through the onLoad signal: the value
// 200 once the end of input is met, otherwise each new higher percentage.
void Scanner::NextCh() {
  // pending end-of-line characters are delivered before touching the buffer
  if ( oldEols > 0 ) {
    ch = EOL;
    --oldEols;
    return;
  }

  pos = buffer->GetPos();
  ch  = buffer->Read();
  const int loaded = buffer->GetPercent();

  if ( ch == Buffer::EoF ) {
    GUM_EMIT1( onLoad,200 );
  } else if ( percent < loaded ) {
    percent = loaded;
    GUM_EMIT1( onLoad,percent );
  }

  ++col;
  ++charPos;

  // an isolated '\r' is turned into '\n' so that end-of-line handling is
  // uniform across Windows, Unix and Mac
  if ( ch == L'\r' && buffer->Peek() != L'\n' ) {
    ch = EOL;
  }

  if ( ch == EOL ) {
    ++line;
    col = 0;
  }
}

// Appends the current character to the token text (doubling the text buffer
// when it is full) and advances to the next character. Nothing is appended
// or consumed once the end of input has been reached.
void Scanner::AddCh() {
  if ( tlen >= tvalLength ) {
    tvalLength *= 2;
    wchar_t* grown = new wchar_t[tvalLength];
    memcpy( grown, tval, tlen*sizeof( wchar_t ) );
    delete [] tval;
    tval = grown;
  }

  if ( ch == Buffer::EoF ) return;

  tval[tlen++] = ch;
  NextCh();
}


// Releases the leading heap blocks that precede the one containing the
// current token (`tokens`), then chains a fresh block at the end of the heap
// and makes it the allocation target.
void Scanner::CreateHeapBlock() {
  char* block = ( char* ) firstHeap;

  // drop blocks until `tokens` lies inside [block, block + HEAP_BLOCK_SIZE]
  while ( !( ( ( char* ) tokens >= block ) && ( ( char* ) tokens <= ( block + HEAP_BLOCK_SIZE ) ) ) ) {
    char* following = *( ( char** )( block + HEAP_BLOCK_SIZE ) );
    free( firstHeap );
    firstHeap = following;
    block     = following;
  }

  // HEAP_BLOCK_SIZE byte heap + pointer to next heap block
  void* fresh = malloc( HEAP_BLOCK_SIZE + sizeof( void* ) );

  // link the previous last block to the new one
  *heapEnd = fresh;

  // the new block is the last one: its "next" slot is empty
  heapEnd  = ( void** )( ( ( char* ) fresh ) + HEAP_BLOCK_SIZE );
  *heapEnd = 0;

  heap    = fresh;
  heapTop = fresh;
}

// Carves a Token out of the current heap block (a new block is started when
// the current one cannot hold it) and returns it with `val` and `next`
// cleared. The other fields are left for the caller to fill in.
Token* Scanner::CreateToken() {
  char* top = ( char* ) heapTop;

  if ( ( top + ( int ) sizeof( Token ) ) >= ( char* ) heapEnd ) {
    CreateHeapBlock();
    top = ( char* ) heapTop;
  }

  Token* fresh = ( Token* ) top;
  heapTop = ( void* )( top + sizeof( Token ) );

  fresh->val  = nullptr;
  fresh->next = nullptr;
  return fresh;
}

// Copies the text of the current token (tval, tlen characters) onto the token
// heap, terminates it with L'\0' and stores its address in t->val.
// Terminates the program when the text cannot fit in a single heap block.
void Scanner::AppendVal( Token* t ) {
  // Tokens are carved from the same heap right after token texts. The space
  // reserved for the text is rounded up to the alignment of Token so that the
  // next Token placed at heapTop is never misaligned.
  const int align = ( int ) alignof( Token );
  int reqMem = ( tlen + 1 ) * ( int ) sizeof( wchar_t );
  reqMem = ( ( reqMem + align - 1 ) / align ) * align;

  if ( ( ( char* ) heapTop + reqMem ) >= ( char* ) heapEnd ) {
    if ( reqMem > HEAP_BLOCK_SIZE ) {
      wprintf( L"--- Too long token value\n" );
      exit( 1 );
    }

    CreateHeapBlock();
  }

  t->val = ( wchar_t* ) heapTop;
  heapTop = ( void* )( ( char* ) heapTop + reqMem );

  wcsncpy( t->val, tval, tlen );
  t->val[tlen] = L'\0';
}

// Reads the next token from the input.
// Blanks, tabs, line feeds and carriage returns are skipped first. The state
// registered for the current character is then looked up and the generated
// state machine below consumes characters until a token is complete. The
// token (position, kind and a copy of its text) is returned.
Token* Scanner::NextToken() {
  while ( ch == ' ' ||
          			(ch >= 9 && ch <= 10) || ch == 13
        ) NextCh();


  // recKind / recEnd remember the last accepting state met so far (kind and
  // end position); case 0 uses them to fall back to that token.
  int recKind = noSym;
  int recEnd = pos;
  t = CreateToken();
  t->pos = pos; t->col = col; t->line = line; t->charPos = charPos;
  int state = start.state( ch );
  tlen = 0; AddCh();

  switch ( state ) {
    case -1: { t->kind = eofSym; break; } // NextCh already done

    // no transition possible: cut the text back to the last accepted token,
    // if any, otherwise the kind stays noSym
    case 0: {
case_0:

      if ( recKind != noSym ) {
        tlen = recEnd - t->pos;
        SetScannerBehindT();
      }

      t->kind = recKind; break;
    } // NextCh already done

    		// after 'E'/'e': a digit or a sign must follow
    		case 1:
			case_1:
			if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_3;}
			else if (ch == L'+' || ch == L'-') {AddCh(); goto case_2;}
			else {goto case_0;}
		// after the exponent sign: a digit must follow
		case 2:
			case_2:
			if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_3;}
			else {goto case_0;}
		// exponent digits: accepting, kind 3
		case 3:
			case_3:
			recEnd = pos; recKind = 3;
			if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_3;}
			else {t->kind = 3; break;}
		// single character registered with state 4 in Init: kind 4
		case 4:
			{t->kind = 4; break;}
		// character registered with state 5 in Init: kind 5
		case 5:
			{t->kind = 5; break;}
		// letters, digits, '_' and '.' following a letter or '_': kind 6
		case 6:
			case_6:
			recEnd = pos; recKind = 6;
			if (ch == L'.' || (ch >= L'0' && ch <= L'9') || (ch >= L'A' && ch <= L'Z') || ch == L'_' || (ch >= L'a' && ch <= L'z')) {AddCh(); goto case_6;}
			else {t->kind = 6; break;}
		// run of digits: accepting, kind 1; may continue with '.' or an exponent
		case 7:
			case_7:
			recEnd = pos; recKind = 1;
			if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_7;}
			else if (ch == L'.') {AddCh(); goto case_10;}
			else if (ch == L'E' || ch == L'e') {AddCh(); goto case_1;}
			else {t->kind = 1; break;}
		// '+': kind 4 on its own, or the start of a signed number
		case 8:
			recEnd = pos; recKind = 4;
			if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_11;}
			else {t->kind = 4; break;}
		// '-': kind 4 on its own, or the start of a signed number
		case 9:
			recEnd = pos; recKind = 4;
			if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_11;}
			else {t->kind = 4; break;}
		// after '.': at least one digit must follow
		case 10:
			case_10:
			if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_12;}
			else {goto case_0;}
		// digits after a sign: not accepting, needs '.' or an exponent to continue
		case 11:
			case_11:
			if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_11;}
			else if (ch == L'.') {AddCh(); goto case_10;}
			else if (ch == L'E' || ch == L'e') {AddCh(); goto case_1;}
			else {goto case_0;}
		// digits after '.': accepting, kind 2; may continue with an exponent
		case 12:
			case_12:
			recEnd = pos; recKind = 2;
			if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_12;}
			else if (ch == L'E' || ch == L'e') {AddCh(); goto case_1;}
			else {t->kind = 2; break;}
		// '(' : kind 7
		case 13:
			{t->kind = 7; break;}
		// ')' : kind 8
		case 14:
			{t->kind = 8; break;}
		// ',' : kind 9
		case 15:
			{t->kind = 9; break;}

  }

  // store the collected text in the token
  AppendVal( t );
  return t;
}

// Rewinds the input to the start of token `t`, restores the line / column /
// character counters recorded in it, then consumes `tlen` characters so that
// scanning resumes right behind the token text.
void Scanner::SetScannerBehindT() {
  buffer->SetPos( t->pos );
  NextCh();

  charPos = t->charPos;
  col     = t->col;
  line    = t->line;

  int consumed = 0;

  while ( consumed < tlen ) {
    NextCh();
    ++consumed;
  }
}

// Returns the next token. A token already produced while peeking is reused;
// otherwise a new one is read. The peek position is moved to the returned
// token as well.
Token* Scanner::Scan() {
  Token* upcoming = tokens->next;

  if ( upcoming == nullptr ) {
    upcoming = NextToken();
  }

  tokens = upcoming;
  pt     = upcoming;
  return upcoming;
}

// Looks ahead by one more token without consuming it. Tokens whose kind is
// above maxT (pragmas) are skipped.
Token* Scanner::Peek() {
  for ( ;; ) {
    if ( pt->next == nullptr ) {
      pt->next = NextToken();
    }

    pt = pt->next;

    if ( pt->kind <= maxT ) break;   // not a pragma: stop here
  }

  return pt;
}

// make sure that peeking starts at the current scan position
void Scanner::ResetPeek() {
  pt = tokens;
}

} // namespace
} // namespace