1 //
2 // Copyright(C) 2005-2014 Simon Howard
3 //
4 // This program is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU General Public License
6 // as published by the Free Software Foundation; either version 2
7 // of the License, or (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 // GNU General Public License for more details.
13 //
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 
19 #include "txt_utf8.h"
20 
// Encode the Unicode character 'c' as UTF-8 at the start of buffer 'p'.
//
// One byte is written for values below 0x80, two below 0x800, three
// below 0x10000 and four below 0x200000.  The return value points just
// past the last byte written.  Larger values cannot be represented:
// nothing is written and 'p' is returned unchanged.
//
// No NUL terminator is added and no bounds are checked, so the caller
// must supply room for up to four bytes.

char *TXT_EncodeUTF8(char *p, unsigned int c)
{
    // Marker bits of the first byte, indexed by sequence length.
    static const unsigned char lead_marker[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
    unsigned int len;
    unsigned int i;

    if (c >= 0x200000)
    {
        // Too big!

        return p;
    }

    if (c < 0x80)
    {
        len = 1;                              // 1 byte (ASCII)
    }
    else if (c < 0x800)
    {
        len = 2;                              // 2 bytes
    }
    else if (c < 0x10000)
    {
        len = 3;                              // 3 bytes
    }
    else
    {
        len = 4;                              // 4 bytes
    }

    // Fill in the continuation bytes from the end backwards; each one
    // carries the next six low-order bits of the character.

    for (i = len - 1; i > 0; --i)
    {
        p[i] = 0x80 | (c & 0x3f);
        c >>= 6;
    }

    // Whatever bits remain share the first byte with the length marker.

    p[0] = lead_marker[len] | c;

    return p + len;
}
59 
// Decode the UTF-8 sequence (one to four bytes) starting at *ptr.
//
// On success the decoded character is returned and *ptr is moved past
// the bytes that were consumed.  If the first byte is not a valid lead
// byte, or one of the expected continuation bytes is missing, 0 is
// returned and *ptr is left where it was.  5 and 6 byte sequences are
// not supported and count as failures.
//
// A NUL byte decodes as an ordinary one-byte character: 0 is returned
// and *ptr is advanced by one.  A return value of 0 therefore does not
// by itself distinguish "end of string" from "decode failure".
//
// Overlong encodings are not rejected.

unsigned int TXT_DecodeUTF8(const char **ptr)
{
    const unsigned char *bytes = (const unsigned char *) *ptr;
    unsigned int lead = bytes[0];
    unsigned int value;
    unsigned int len;
    unsigned int i;

    if ((lead & 0x80) == 0)                   // 1 byte (ASCII)
    {
        *ptr += 1;
        return lead;
    }

    // Work out the sequence length from the lead byte and keep the
    // payload bits it carries.

    if ((lead & 0xe0) == 0xc0)                // 2 bytes
    {
        len = 2;
        value = lead & 0x1f;
    }
    else if ((lead & 0xf0) == 0xe0)           // 3 bytes
    {
        len = 3;
        value = lead & 0x0f;
    }
    else if ((lead & 0xf8) == 0xf0)           // 4 bytes
    {
        len = 4;
        value = lead & 0x07;
    }
    else
    {
        // Stray continuation byte, or a 5/6 byte lead: decode failure.

        return 0;
    }

    // Every following byte must look like 10xxxxxx.  Stop at the first
    // one that does not, so that nothing beyond it is ever read (this
    // includes the string terminator of a truncated sequence).

    for (i = 1; i < len; ++i)
    {
        if ((bytes[i] & 0xc0) != 0x80)
        {
            return 0;
        }

        value = (value << 6) | (bytes[i] & 0x3f);
    }

    *ptr += len;

    return value;
}
111 
// Count the number of characters (not bytes) in the UTF-8 string 's'.
//
// Counting stops at the NUL terminator, or earlier at the first point
// where TXT_DecodeUTF8() returns 0; in that case the characters
// counted up to that point are returned.

unsigned int TXT_UTF8_Strlen(const char *s)
{
    const char *cursor = s;
    unsigned int count = 0;

    while (*cursor != '\0' && TXT_DecodeUTF8(&cursor) != 0)
    {
        ++count;
    }

    return count;
}
134 
// Skip past the first n characters in a UTF-8 string.
//
// Returns a pointer to the character that follows the first 'n'
// characters of 's'.  If the string holds fewer than 'n' characters,
// the returned pointer refers to the NUL terminator.  If
// TXT_DecodeUTF8() returns 0 before 'n' characters have been skipped,
// skipping stops there and the current position is returned.
//
// The result points into 's'; the const qualifier is cast away purely
// for the caller's convenience.

char *TXT_UTF8_SkipChars(const char *s, unsigned int n)
{
    const char *p = s;
    unsigned int i;

    for (i = 0; i < n; ++i)
    {
        // Test for the terminator before decoding.  TXT_DecodeUTF8()
        // treats NUL as a one-byte character and steps over it, which
        // would leave 'p' pointing one byte beyond the end of the
        // string.

        if (*p == '\0' || TXT_DecodeUTF8(&p) == 0)
        {
            break;
        }
    }

    return (char *) p;
}
154 
155