1 /* ScummVM - Graphic Adventure Engine
2 *
3 * ScummVM is the legal property of its developers, whose names
4 * are too numerous to list here. Please refer to the COPYRIGHT
5 * file distributed with this source distribution.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 */
22 /*
23 Basic UTF-8 manipulation routines
24 by Jeff Bezanson
25 placed in the public domain Fall 2005
26
27 This code is designed to provide the utilities you need to manipulate
28 UTF-8 as an internal string encoding. These functions do not perform the
29 error checking normally needed when handling UTF-8 data, so if you happen
30 to be from the Unicode Consortium you will want to flay me alive.
31 I do this because error checking can be performed at the boundaries (I/O),
32 with these routines reserved for higher performance on data known to be
33 valid.
34 */
35
36 #include "common/debug.h"
37
38 #include "sludge/utf8.h"
39
40 namespace Sludge {
41
42 const uint32 UTF8Converter::offsetsFromUTF8[6] = {
43 0x00000000UL, 0x00003080UL,
44 0x000E2080UL, 0x03C82080UL,
45 0xFA082080UL, 0x82082080UL };
46
47 /* reads the next utf-8 sequence out of a string, updating an index */
nextchar(const char * s,int * i)48 uint32 UTF8Converter::nextchar(const char *s, int *i) {
49 uint32 ch = 0;
50 int sz = 0;
51
52 do {
53 ch <<= 6;
54 ch += (byte)s[(*i)++];
55 sz++;
56 } while (s[*i] && !isutf(s[*i]));
57 ch -= offsetsFromUTF8[sz - 1];
58
59 return ch;
60 }
61
/**
 * Decode a UTF-8 encoded Common::String into a Common::U32String.
 *
 * A Common::String is treated as one character per byte, whereas the data
 * held here is UTF-8, where one character may span several bytes. Decoding
 * into a U32String first gives one element per character.
 *
 * @param str  UTF-8 encoded byte string
 * @return     the decoded characters, in order
 */
Common::U32String UTF8Converter::convertUtf8ToUtf32(const Common::String &str) {
	const char *bytes = str.c_str();
	const int byteCount = (int)str.size();

	Common::U32String decoded;
	// nextchar() advances bytePos past each sequence it consumes
	for (int bytePos = 0; bytePos < byteCount;) {
		uint32 codePoint = nextchar(bytes, &bytePos);
		decoded += codePoint;
	}
	return decoded;
}
75
76 /* utf32 index => original byte offset */
getOriginOffset(int origIdx)77 int UTF8Converter::getOriginOffset(int origIdx) {
78 uint offs = 0;
79 while (origIdx > 0 && offs < _str.size()) {
80 // increment if it's not the start of a utf8 sequence
81 (void)( (++offs < _str.size() && isutf(_str[offs])) ||
82 (++offs < _str.size() && isutf(_str[offs])) ||
83 (++offs < _str.size() && isutf(_str[offs])) ||
84 ++offs);
85 origIdx--;
86 }
87 return offs;
88 }
89
90 /** Construct a UTF8String with original char array to convert */
UTF8Converter(const char * str)91 UTF8Converter::UTF8Converter(const char *str) {
92 setUTF8String(str);
93 }
94
95 /** set a utf8 string to convert */
setUTF8String(Common::String str)96 void UTF8Converter::setUTF8String(Common::String str) {
97 _str32.clear();
98 _str32 = convertUtf8ToUtf32(str);
99 _str.clear();
100 _str = str;
101 }
102
103 } // End of namespace Sludge
104