1 // Emacs style mode select -*- C++ -*-
2 //-----------------------------------------------------------------------------
3 //
4 // Copyright(C) 2012 Simon Howard
5 //
6 // This program is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU General Public License
8 // as published by the Free Software Foundation; either version 2
9 // of the License, or (at your option) any later version.
10 //
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 //
16 // You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 // 02111-1307, USA.
20 //
21
22 #include <stdlib.h>
23 #include <string.h>
24
25 #include "txt_utf8.h"
26
// Write the UTF-8 encoding of the character value 'c' into the buffer
// at 'p' and return a pointer to the byte just after the encoding.
//
// Between one and four bytes are written depending on the size of 'c';
// no terminator is appended, so the caller must have room for up to
// four bytes at 'p'.  Values of 0x200000 and above do not fit in a
// four byte sequence: in that case nothing is written and 'p' is
// returned unchanged.

char *TXT_EncodeUTF8(char *p, unsigned int c)
{
    unsigned int nbytes;
    unsigned int lead_marker;
    unsigned int i;

    // Work out how long the sequence is and which marker bits go in
    // the top of the lead byte.

    if (c < 0x80)                // 1 byte (ASCII): 0xxxxxxx
    {
        nbytes = 1;
        lead_marker = 0x00;
    }
    else if (c < 0x800)          // 2 bytes: 110xxxxx 10xxxxxx
    {
        nbytes = 2;
        lead_marker = 0xc0;
    }
    else if (c < 0x10000)        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    {
        nbytes = 3;
        lead_marker = 0xe0;
    }
    else if (c < 0x200000)       // 4 bytes: 11110xxx 10xxxxxx (x3)
    {
        nbytes = 4;
        lead_marker = 0xf0;
    }
    else
    {
        // Too big to encode.

        return p;
    }

    // Fill in the continuation bytes from last to first, peeling six
    // bits off the bottom of the value each time.

    for (i = nbytes - 1; i > 0; --i)
    {
        p[i] = 0x80 | (c & 0x3f);
        c >>= 6;
    }

    // Whatever bits are left over belong in the lead byte.

    p[0] = lead_marker | c;

    return p + nbytes;
}
65
// Decode the UTF-8 sequence that starts at *ptr and return its value.
//
// On success *ptr is moved forward over the bytes that were decoded
// (one to four).  On failure - a lead byte that does not start a
// 1-4 byte sequence, or a continuation byte that is not of the form
// 10xxxxxx - zero is returned and *ptr is left where it was.
//
// A NUL byte counts as an ordinary one byte character: it decodes to
// zero and *ptr still moves forward by one.  A zero return therefore
// does not by itself tell the caller whether the input ended or was
// malformed.
//
// Continuation bytes are inspected in order and decoding gives up at
// the first bad one, so a sequence cut short by the string terminator
// is never read beyond that terminator.  Overlong forms and surrogate
// values are not rejected.

unsigned int TXT_DecodeUTF8(const char **ptr)
{
    const unsigned char *bytes = (const unsigned char *) *ptr;
    unsigned int lead = bytes[0];
    unsigned int value;
    unsigned int trailing;
    unsigned int i;

    // Classify the lead byte: it tells us how many continuation bytes
    // follow and supplies the topmost bits of the value.

    if ((lead & 0x80) == 0)            // 0xxxxxxx: 1 byte (ASCII)
    {
        trailing = 0;
        value = lead;
    }
    else if ((lead & 0xe0) == 0xc0)    // 110xxxxx: 2 bytes
    {
        trailing = 1;
        value = lead & 0x1f;
    }
    else if ((lead & 0xf0) == 0xe0)    // 1110xxxx: 3 bytes
    {
        trailing = 2;
        value = lead & 0x0f;
    }
    else if ((lead & 0xf8) == 0xf0)    // 11110xxx: 4 bytes
    {
        trailing = 3;
        value = lead & 0x07;
    }
    else
    {
        // Stray continuation byte, or a 5/6 byte sequence which we
        // don't bother with.

        return 0;
    }

    // Each continuation byte contributes six more bits.

    for (i = 1; i <= trailing; ++i)
    {
        if ((bytes[i] & 0xc0) != 0x80)
        {
            // Decode failure; *ptr has not been touched.

            return 0;
        }

        value = (value << 6) | (bytes[i] & 0x3f);
    }

    *ptr += trailing + 1;

    return value;
}
117
// Return the length of the UTF-8 string 's' measured in characters
// rather than bytes.
//
// Counting ends at the NUL terminator.  It also ends early, without
// counting the offending sequence, as soon as TXT_DecodeUTF8() returns
// zero for a sequence in the string.

unsigned int TXT_UTF8_Strlen(const char *s)
{
    const char *cursor = s;
    unsigned int count = 0;

    // The terminator test comes first so that the decoder is never
    // invoked on the NUL byte itself.

    while (*cursor != '\0' && TXT_DecodeUTF8(&cursor) != 0)
    {
        ++count;
    }

    return count;
}
140
// Skip past the first n characters in a UTF-8 string.
//
// Returns a pointer into 's' positioned after the first 'n' characters.
// If the string holds fewer than 'n' characters, the returned pointer
// addresses the NUL terminator.  If a sequence is reached for which
// TXT_DecodeUTF8() returns zero without consuming anything, skipping
// stops and the returned pointer addresses the start of that sequence.
//
// The result is never positioned beyond the terminator.  The const
// qualifier of 's' is cast away in the return value.

char *TXT_UTF8_SkipChars(const char *s, unsigned int n)
{
    const char *p = s;
    unsigned int i;

    for (i = 0; i < n; ++i)
    {
        // Check for the terminator before decoding: TXT_DecodeUTF8()
        // treats NUL as a one byte character and steps over it, which
        // would leave the result pointing past the end of the string.

        if (*p == '\0')
        {
            break;
        }

        if (TXT_DecodeUTF8(&p) == 0)
        {
            break;
        }
    }

    return (char *) p;
}
160
161