1 /* ScummVM - Graphic Adventure Engine
2  *
3  * ScummVM is the legal property of its developers, whose names
4  * are too numerous to list here. Please refer to the COPYRIGHT
5  * file distributed with this source distribution.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20  *
21  */
22 /*
23  Basic UTF-8 manipulation routines
24  by Jeff Bezanson
25  placed in the public domain Fall 2005
26 
27  This code is designed to provide the utilities you need to manipulate
28  UTF-8 as an internal string encoding. These functions do not perform the
29  error checking normally needed when handling UTF-8 data, so if you happen
30  to be from the Unicode Consortium you will want to flay me alive.
31  I do this because error checking can be performed at the boundaries (I/O),
32  with these routines reserved for higher performance on data known to be
33  valid.
34  */
35 
36 #include "common/debug.h"
37 
38 #include "sludge/utf8.h"
39 
40 namespace Sludge {
41 
42 const uint32 UTF8Converter::offsetsFromUTF8[6] = {
43 		0x00000000UL, 0x00003080UL,
44 		0x000E2080UL, 0x03C82080UL,
45 		0xFA082080UL, 0x82082080UL };
46 
47 /* reads the next utf-8 sequence out of a string, updating an index */
nextchar(const char * s,int * i)48 uint32 UTF8Converter::nextchar(const char *s, int *i) {
49 	uint32 ch = 0;
50 	int sz = 0;
51 
52 	do {
53 		ch <<= 6;
54 		ch += (byte)s[(*i)++];
55 		sz++;
56 	} while (s[*i] && !isutf(s[*i]));
57 	ch -= offsetsFromUTF8[sz - 1];
58 
59 	return ch;
60 }
61 
Common::U32String UTF8Converter::convertUtf8ToUtf32(const Common::String &str) {
	// A Common::String is indexed per byte, whereas the text it carries here
	// is UTF-8 and may use several bytes for a single character. Decoding it
	// into a U32String first gives one element per character for any
	// further processing.
	const char *bytes = str.c_str();
	const int byteCount = (int)str.size();

	Common::U32String decoded;
	// nextchar() moves pos forward past each sequence it reads.
	for (int pos = 0; pos < byteCount;) {
		decoded += nextchar(bytes, &pos);
	}
	return decoded;
}
75 
76 /* utf32 index => original byte offset */
getOriginOffset(int origIdx)77 int UTF8Converter::getOriginOffset(int origIdx) {
78 	uint offs = 0;
79 	while (origIdx > 0 && offs < _str.size()) {
80 		// increment if it's not the start of a utf8 sequence
81 		(void)(	(++offs < _str.size() && isutf(_str[offs])) ||
82 				(++offs < _str.size() && isutf(_str[offs])) ||
83 				(++offs < _str.size() && isutf(_str[offs])) ||
84 				++offs);
85 		origIdx--;
86 	}
87 	return offs;
88 }
89 
90 /** Construct a UTF8String with original char array to convert */
UTF8Converter(const char * str)91 UTF8Converter::UTF8Converter(const char *str) {
92 	setUTF8String(str);
93 }
94 
95 /** set a utf8 string to convert */
setUTF8String(Common::String str)96 void UTF8Converter::setUTF8String(Common::String str) {
97 	_str32.clear();
98 	_str32 = convertUtf8ToUtf32(str);
99 	_str.clear();
100 	_str = str;
101 }
102 
103 } // End of namespace Sludge
104