1 //
2 // Copyright(C) 2005-2014 Simon Howard
3 //
4 // This program is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU General Public License
6 // as published by the Free Software Foundation; either version 2
7 // of the License, or (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18
19 #include "txt_utf8.h"
20
// Write the UTF-8 encoding of the character 'c' into the buffer at 'p'.
//
// Returns a pointer just past the last byte written. Between one and
// four bytes are stored, depending on the magnitude of 'c'; the caller
// must provide room for them. No terminating NUL is written.
//
// Values of 0x200000 and above do not fit in a four byte sequence:
// nothing is written and 'p' is returned unchanged.

char *TXT_EncodeUTF8(char *p, unsigned int c)
{
    unsigned int lead;      // marker bits for the first byte
    unsigned int tail;      // number of continuation bytes to emit

    // Plain ASCII is stored as-is in a single byte.

    if (c < 0x80)
    {
        *p = (char) c;
        return p + 1;
    }

    // Pick the sequence length from the size of the value.

    if (c < 0x800)
    {
        lead = 0xc0;
        tail = 1;
    }
    else if (c < 0x10000)
    {
        lead = 0xe0;
        tail = 2;
    }
    else if (c < 0x200000)
    {
        lead = 0xf0;
        tail = 3;
    }
    else
    {
        // Too big to encode.

        return p;
    }

    // The first byte carries the marker plus the topmost payload bits.

    *p++ = (char) (lead | (c >> (6 * tail)));

    // Each continuation byte carries the next six bits, most
    // significant group first.

    while (tail > 0)
    {
        --tail;
        *p++ = (char) (0x80 | ((c >> (6 * tail)) & 0x3f));
    }

    return p;
}
59
// Decode one UTF-8 sequence starting at *ptr and return its value.
//
// On success *ptr is advanced past the bytes that were consumed (one
// to four). If the bytes at *ptr do not form a sequence recognised
// here, 0 is returned and *ptr is left where it was. Five and six
// byte forms are not handled and count as a failure.
//
// Note that a NUL byte is treated as an ordinary one byte character:
// it also yields 0, but *ptr is advanced past it.

unsigned int TXT_DecodeUTF8(const char **ptr)
{
    // View the input as unsigned bytes for the bit tests below.
    const unsigned char *b = (const unsigned char *) *ptr;

    // 0xxxxxxx: a single byte (ASCII).

    if (b[0] < 0x80)
    {
        *ptr += 1;
        return b[0];
    }

    // 110xxxxx 10xxxxxx: two bytes. Later bytes are only inspected
    // once all the earlier ones have matched.

    if ((b[0] & 0xe0) == 0xc0
     && (b[1] & 0xc0) == 0x80)
    {
        *ptr += 2;
        return ((unsigned int) (b[0] & 0x1f) << 6)
             | (unsigned int) (b[1] & 0x3f);
    }

    // 1110xxxx 10xxxxxx 10xxxxxx: three bytes.

    if ((b[0] & 0xf0) == 0xe0
     && (b[1] & 0xc0) == 0x80
     && (b[2] & 0xc0) == 0x80)
    {
        *ptr += 3;
        return ((unsigned int) (b[0] & 0x0f) << 12)
             | ((unsigned int) (b[1] & 0x3f) << 6)
             | (unsigned int) (b[2] & 0x3f);
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: four bytes.

    if ((b[0] & 0xf8) == 0xf0
     && (b[1] & 0xc0) == 0x80
     && (b[2] & 0xc0) == 0x80
     && (b[3] & 0xc0) == 0x80)
    {
        *ptr += 4;
        return ((unsigned int) (b[0] & 0x07) << 18)
             | ((unsigned int) (b[1] & 0x3f) << 12)
             | ((unsigned int) (b[2] & 0x3f) << 6)
             | (unsigned int) (b[3] & 0x3f);
    }

    // Decode failure; *ptr is not moved.

    return 0;
}
111
// Return the number of characters (not bytes) in the NUL-terminated
// UTF-8 string 's'.
//
// Counting stops at the terminator, or earlier at the first position
// where TXT_DecodeUTF8() returns 0; only the characters before that
// point are counted.

unsigned int TXT_UTF8_Strlen(const char *s)
{
    const char *cursor = s;
    unsigned int count = 0;

    // The terminator is checked first, so the decoder is never asked
    // to step over the end of the string. A zero result from the
    // decoder ends the walk.

    while (*cursor != '\0' && TXT_DecodeUTF8(&cursor) != 0)
    {
        ++count;
    }

    return count;
}
134
// Skip past the first 'n' characters of the NUL-terminated UTF-8
// string 's' and return a pointer to what follows.
//
// If the string holds fewer than 'n' characters, the returned pointer
// refers to the terminating NUL. If TXT_DecodeUTF8() returns 0 part
// way through, skipping stops there and the position reached so far is
// returned.
//
// The result points into 's'; the const qualifier is dropped for the
// caller's convenience.

char *TXT_UTF8_SkipChars(const char *s, unsigned int n)
{
    const char *p = s;
    unsigned int i;

    // Stop at the terminator before decoding. TXT_DecodeUTF8() treats a
    // NUL as a one byte character and steps over it, which would leave
    // the result pointing past the end of the string.

    for (i = 0; i < n && *p != '\0'; ++i)
    {
        if (TXT_DecodeUTF8(&p) == 0)
        {
            break;
        }
    }

    return (char *) p;
}
154
155