ump-0.8.6/src/utf8_string.cpp

/*
 *  utf8_string - Implements a string-class which handles utf8 coded strings
 *  Copyright (c) 2006 by Mattias Hultgren <mattias_hultgren@tele2.se>
 *
 *  See utf8_string.h
 */

/*
News
----

v1  2006-07-22 - 2006-10-10
--

	Initial version

*/

#include "keyfile.h"
#include "utf8_string.h"
#include <string.h>


// Shared empty string. A utf8_string that holds no text points its str member
// here instead of allocating; clear() compares str against this address to
// decide whether there is a buffer that has to be released with delete [].
char null_string[] = "";

// Checks whether a validly encoded character starts at src; on success the
// number of bytes it occupies is stored in char_len.
bool utf8_valid_char( const char *src , uint32 &char_len );
// Returns true if every character of the zero-terminated string src is valid.
bool utf8_valid_str( const char *src );
// Returns the byte offset in src of the character with index pos.
// If src contains fewer than pos characters (or an invalid character is met
// on the way), uint32_max is returned.
uint32 utf8_get_index_of( const char *src, uint32 pos );


// Default constructor: creates an empty string without allocating anything.
utf8_string::utf8_string()
{
	// str refers to the shared empty sentinel, so there is no buffer to own.
	size = 0;
	str = null_string;
}
// Copy constructor: starts out as an empty string and then takes over the
// content of src through the assignment operator.
utf8_string::utf8_string( const utf8_string &src ) throw(error_obj)
{
	// operator= inspects str and size, so they must be in the empty state first.
	size = 0;
	str = null_string;
	operator=( src );
}
// Constructs the string as a copy of a zero-terminated C string.
//   src - text to copy; a null pointer gives an empty string.
// Text that fails utf8_valid_str() is reported through THROW_ERROR with
// ErrorType_General, a failed allocation with ErrorType_Memory.
utf8_string::utf8_string( const char *src ) throw(error_obj)
{
	if( src != 0 )
	{
		if( !utf8_valid_str( src ) )
			THROW_ERROR( ErrorType_General, _("String is not a valid utf8 string.") );

		// The buffer holds the text plus its terminating zero.
		size = strlen( src ) +1;

		try{
			str = new char[size];
		}
		catch(...)
		{
			// Leave the members in the empty state before reporting.
			str = null_string;
			size = 0;
			THROW_ERROR( ErrorType_Memory, _("Couldn't get memory") );
		}

		// size counts the terminator, so this copies it as well.
		memcpy( str, src, size );
		return;
	}

	// Null pointer: behave like the default constructor.
	size = 0;
	str = null_string;
}
// Destructor: releases the buffer, if any, by way of clear().
utf8_string::~utf8_string()
{
	this->clear();
}

// Empties the string: an owned buffer is freed, str is pointed back at the
// shared null_string and size is set to 0.
// Returns a reference to this object.
utf8_string & utf8_string::clear(void) throw()
{
	// Already empty - the shared sentinel must never be passed to delete [].
	if( str == null_string )
		return *this;

	delete [] str;
	size = 0;
	str = null_string;
	return *this;
}

// Replaces the buffer by a fresh one of new_size bytes that holds an empty
// string; whatever the object contained before is discarded.
//   new_size - wanted buffer size in bytes; 0 simply clears the string.
// A failed allocation is reported through THROW_ERROR (ErrorType_Memory)
// before the old buffer has been touched.
void utf8_string::set_size( uint32 new_size ) throw(error_obj)
{
	if( new_size == 0 )
	{
		clear();
		return;
	}

	char *fresh;

	try{ fresh = new char [new_size]; }
	catch(...) { THROW_ERROR( ErrorType_Memory, _("Couldn't get memory.") ); }

	// The new buffer starts out as an empty string.
	fresh[0] = '\0';

	// Only now give up the old buffer and install the new one.
	clear();
	size = new_size;
	str = fresh;
}

// Grows the buffer to new_size bytes while keeping the current text.
//   new_size - wanted buffer size in bytes; nothing happens unless it is
//              larger than the current size.
// A failed allocation is reported through THROW_ERROR (ErrorType_Memory)
// before the old buffer has been touched.
void utf8_string::enlarge_to( uint32 new_size ) throw(error_obj)
{
	// The buffer never shrinks here.
	if( size >= new_size )
		return;

	char *bigger;

	try{ bigger = new char [new_size]; }
	catch(...) { THROW_ERROR( ErrorType_Memory, _("Couldn't get memory.") ); }

	if( str != null_string )
	{
		// Carry the text over, then release the old buffer.
		strcpy( bigger, str );
		delete [] str;
	}
	else
		bigger[0] = '\0';   // nothing to carry over, start as an empty string

	size = new_size;
	str = bigger;
}

// Appends the text of src to the end of this string.
//   src - string to append; it may be this very object (s.append(s)).
// Returns a reference to this object.
// A failed allocation is reported by enlarge_to() / operator=.
utf8_string & utf8_string::append( const utf8_string &src ) throw(error_obj)
{
	// Nothing to add.
	if( src.str == null_string )
		return *this;

	// Appending to an empty string is a plain copy.
	if( str == null_string )
	{
		*this = src;
		return *this;
	}

	// Measure both strings once, before anything is modified.
	const uint32 own_len = uint32( strlen( str ) );
	const uint32 add_len = uint32( strlen( src.str ) );

	if( own_len + add_len >= size )
		enlarge_to( own_len + add_len +1 );

	// src.str is read again after enlarge_to(): when src is this object the
	// buffer may just have been replaced. Copying a fixed number of bytes and
	// writing the terminator afterwards keeps the source and destination
	// ranges apart in that case; a strcpy() would overwrite the terminator
	// it is still looking for.
	memcpy( &str[own_len], src.str, add_len );
	str[own_len + add_len] = '\0';

	return *this;
}

// Makes this string a copy of src. The existing buffer is reused when it is
// large enough, otherwise it is enlarged first.
//   src - string to copy; assigning an object to itself is a no-op.
// A failed allocation is reported by enlarge_to().
void utf8_string::operator=( const utf8_string &src ) throw(error_obj)
{
	// Self-assignment: nothing to do, and copying a buffer onto itself with
	// strcpy()/memcpy() is not allowed.
	if( this == &src )
		return;

	if( src.str == null_string )
	{
		clear();
		return;
	}

	// Bytes needed for the text including its terminating zero.
	const uint32 needed = uint32( strlen( src.str ) ) +1;

	if( needed > size )
		enlarge_to( needed );

	memcpy( str, src.str, needed );
}

// Byte-wise equality test against another utf8_string.
bool utf8_string::operator==( const utf8_string &src ) const
{
	const int order = strcmp( str, src.str );
	return order == 0;
}

// Byte-wise equality test against a C string; a null pointer is treated as
// an empty string.
bool utf8_string::operator==( const char *src ) const
{
	const char *other = ( src != 0 ) ? src : "";
	return !strcmp( str, other );
}

// Returns the number of characters (not bytes) in the string.
uint32 utf8_string::get_length(void) const
{
	uint32 count = 0;
	uint32 step;   // byte length of the character just examined

	for( uint32 offset = 0; str[offset] != 0; offset += step )
	{
		// Not expected to happen for a string held by this class; if it
		// does, the characters counted so far are reported.
		if( !utf8_valid_char( &str[offset], step ) )
			break;
		++count;
	}
	return count;
}

// Returns a pointer to the zero-terminated text that starts at character
// number index. If utf8_get_index_of() can't locate that character, a pointer
// to the shared empty string is returned instead.
const char * utf8_string::c_str_from( uint32 index ) const
{
	const uint32 offset = utf8_get_index_of( str, index );

	return ( offset == uint32_max ) ? null_string : &str[offset];
}

// Tests whether the character at position pos is one of the characters
// listed in test.
//   pos  - character index into this string.
//   test - zero-terminated list of candidate characters.
// Returns true on a match. Returns false if test is a null pointer, if pos
// is at or beyond the end of the string, or if an invalid character is met
// in either string before a match is found.
bool utf8_string::test_character( uint32 pos, const char *test ) const
{
	if( test == 0 )
		return false;

	const uint32 offset = utf8_get_index_of( str, pos );
	if( offset == uint32_max )
		return false;
	if( str[offset] == 0 )
		return false;

	uint32 own_len;
	if( !utf8_valid_char( &str[offset], own_len ) )
		return false;

	// Walk the candidates one encoded character at a time.
	uint32 cand_len;
	for( uint32 t = 0; test[t] != 0; t += cand_len )
	{
		if( !utf8_valid_char( &test[t], cand_len ) )
			return false;

		// Characters can only be equal if they are equally long.
		if( cand_len == own_len  &&  memcmp( &str[offset], &test[t], own_len ) == 0 )
			return true;
	}

	return false;
}

// Removes len characters starting at character position pos.
//   pos - index of the first character to remove; if it can't be located
//         the string is left untouched.
//   len - number of characters to remove; if fewer remain, the string is
//         cut off at pos.
// Returns a reference to this object.
utf8_string & utf8_string::remove( uint32 pos, uint32 len )
{
	const uint32 start = utf8_get_index_of( str, pos );
	if( start == uint32_max )
		return *this;

	// Byte length of the len characters that follow start.
	const uint32 span = utf8_get_index_of( &str[start], len );

	if( span == uint32_max )
	{
		// Not that many characters left: drop everything from start on.
		str[start] = 0;
	}
	else if( span != 0 )
	{
		// Pull the tail, including its terminator, down over the gap.
		char *tail = &str[start + span];
		memmove( &str[start], tail, strlen( tail ) +1 );
	}

	return *this;
}

// Inserts the text of src in front of the character at position pos.
//   src - string to insert.
//   pos - character index to insert at; if it can't be located, src is
//         added at the end.
// Returns a reference to this object.
// A failed allocation is reported through THROW_ERROR (ErrorType_Memory).
utf8_string & utf8_string::insert( const utf8_string &src, uint32 pos ) throw(error_obj)
{
	// Inserting into an empty string is the same as appending.
	if( str == null_string )
	{
		append( src );
		return *this;
	}

	const uint32 own_len = strlen( str );
	const uint32 ins_len = strlen( src.str );
	const uint32 total = own_len + ins_len +1;

	char *merged;
	try{  merged = new char[total]; }
	catch(...) {  THROW_ERROR( ErrorType_Memory, _("Couldn't get memory") ); }

	// Byte offset at which the old text is split.
	uint32 split = utf8_get_index_of( str, pos );
	if( split == uint32_max )
		split = own_len;

	// head of the old text + inserted text + rest of the old text
	memcpy( merged, str, split );
	memcpy( &merged[split], src.str, ins_len );
	strcpy( &merged[split + ins_len], &str[split] );

	// str is an owned buffer here; the sentinel case returned above.
	delete [] str;
	str = merged;
	size = total;

	return *this;
}

// Returns the numeric value (0-9) of the ASCII digit at character position pos.
// Reports "Unexpected end of string" through THROW_ERROR if pos can't be
// located and "Expected a digit" if the character there isn't '0'..'9'.
int32 utf8_string::get_digit( uint32 pos ) const throw(error_obj)
{
	const uint32 offset = utf8_get_index_of( str, pos );
	if( offset == uint32_max )
		THROW_ERROR ( ErrorType_General, _("Unexpected end of string") );

	const char ch = str[offset];
	if( ch < '0'  ||  ch > '9' )
		THROW_ERROR( ErrorType_General, _("Expected a digit") );

	return int32(ch - '0');
}

void utf8_string::assign( const utf8_string &src, uint32 pos, uint32 length ) throw(error_obj)
{
	uint32 cpos, clen;

	cpos = utf8_get_index_of( src.str, pos );
	if( cpos == uint32_max )
	{
		clear();
		return;
	}

	clen = utf8_get_index_of( &src.str[cpos], length );
	if( clen == uint32_max  ||  clen == 0 )
	{
		clear();
		return;
	}

	set_size( clen +1 );

	for( uint32 i=0; i<clen; i++ )
		str[i] = src.str[cpos+i];
	str[clen] = 0;
}

// Returns a new string holding length characters of this string, starting at
// character position start.
// The result is empty if start can't be located, if length is 0, or if
// fewer than length characters are available from start on.
// A failed allocation is reported by set_size().
utf8_string utf8_string::substr( uint32 start, uint32 length ) const throw(error_obj)
{
	utf8_string piece;

	const uint32 first = utf8_get_index_of( str, start );
	if( first != uint32_max )
	{
		// Byte length of the requested characters.
		const uint32 bytes = utf8_get_index_of( &str[first], length );
		if( bytes != uint32_max  &&  bytes != 0 )
		{
			piece.set_size( bytes +1 );
			memcpy( piece.str, &str[first], bytes );
			piece.str[bytes] = 0;
		}
	}

	return piece;
}

// Replaces the two-character escape sequences \\ \n \t \r and \" in place by
// the single character each of them stands for. A backslash followed by
// anything else is left as it is.
void utf8_string::remove_escape_sequences(void)
{
	for( uint32 i=0; str[i] != '\0'; i++ )
	{
		if( str[i] != '\\' )
			continue;

		char plain;
		switch( str[i+1] )
		{
			case '\\':  plain = '\\';  break;
			case 'n':   plain = '\n';  break;
			case 't':   plain = '\t';  break;
			case 'r':   plain = '\r';  break;
			case '\"':  plain = '\"';  break;
			default:
				// Unknown escape or a backslash at the very end: keep it.
				continue;
		}

		// The backslash turns into the real character ...
		str[i] = plain;
		// ... and the rest, terminator included, closes up by one byte.
		memmove( &str[i+1], &str[i+2], strlen( &str[i+2] ) +1 );
	}
}


// Returns true if the first len bytes at src all have the bit pattern
// 10xxxxxx. Checking stops at the first byte that doesn't, so a terminating
// zero inside the range ends the scan.
bool utf8_check_tail_bytes( const char *src, uint32 len )
{
	while( len-- != 0 )
	{
		if( (*src++ & 0xc0) != 0x80 )
			return false;
	}
	return true;
}

// Checks whether a validly encoded character of 1 to 4 bytes starts at src.
//   src      - first byte of the character to examine.
//   char_len - receives the length of the character in bytes; it is only
//              written when true is returned.
// Returns false for a stray continuation byte, for a lead byte of a longer
// sequence, for missing or malformed continuation bytes, and for overlong
// forms (a character encoded with more bytes than it needs).
// NOTE(review): surrogates (ED A0..BF xx) and values above U+10FFFF
// (F4 90.. and F5..F7) are not rejected here.
bool utf8_valid_char( const char *src , uint32 &char_len )
{
	// Look at the lead byte as unsigned: with a signed char the byte would be
	// sign-extended and comparisons against 0xe0 / 0xf0 could never match.
	const unsigned char lead = (unsigned char) *src;

	if( (lead & 0x80) == 0x00 )
	{
		// 0xxxxxxx - plain ASCII
		char_len = 1;
		return true;
	}
	else if( (lead & 0xe0) == 0xc0 )
	{
		// 110xxxxx 10xxxxxx; lead bytes C0 and C1 can only start overlong forms
		if( (lead & 0x1e) == 0x00  ||  !utf8_check_tail_bytes( &src[1], 1 ) )
			return false;

		char_len = 2;
		return true;
	}
	else if( (lead & 0xf0) == 0xe0 )
	{
		// 1110xxxx 10xxxxxx 10xxxxxx; E0 followed by 80..9F is an overlong form
		if( lead == 0xe0  &&  (src[1] & 0x20) == 0x00 )
			return false;
		if( !utf8_check_tail_bytes( &src[1], 2  ) )
			return false;
		char_len = 3;
		return true;
	}
	else if( (lead & 0xf8) == 0xf0 )
	{
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx; F0 followed by 80..8F is an overlong form
		if( lead == 0xf0  &&  (src[1] & 0x30) == 0x00 )
			return false;
		if( !utf8_check_tail_bytes( &src[1], 3 ) )
			return false;
		char_len = 4;
		return true;
	}
	return false;
}

bool utf8_valid_str( const char *src )
{
	uint32 cpos=0, tmp;

	while( src[cpos] != 0 )
	{
		if( !utf8_valid_char( &src[cpos], tmp ) )
			return false;
		cpos += tmp;
	}
	return true;
}

// Translates a character index into a byte offset.
//   src - zero-terminated string.
//   pos - character index; an index equal to the number of characters gives
//         the offset of the terminating zero.
// Returns the byte offset, or uint32_max if the string ends or an invalid
// character is met before pos characters have been passed.
uint32 utf8_get_index_of( const char *src, uint32 pos )
{
	uint32 offset = 0;
	uint32 step;   // byte length of the character just examined

	for( uint32 seen = 0; seen != pos; ++seen )
	{
		if( src[offset] == 0  ||  !utf8_valid_char( &src[offset], step ) )
			return uint32_max;
		offset += step;
	}
	return offset;
}