1 // Emacs style mode select   -*- C++ -*-
2 //-----------------------------------------------------------------------------
3 //
4 // Copyright(C) 2012 Simon Howard
5 //
6 // This program is free software; you can redistribute it and/or
7 // modify it under the terms of the GNU General Public License
8 // as published by the Free Software Foundation; either version 2
9 // of the License, or (at your option) any later version.
10 //
11 // This program is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 // GNU General Public License for more details.
15 //
16 // You should have received a copy of the GNU General Public License
17 // along with this program; if not, write to the Free Software
18 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19 // 02111-1307, USA.
20 //
21 
22 #include <stdlib.h>
23 #include <string.h>
24 
25 #include "txt_utf8.h"
26 
// Write the UTF-8 encoding of the character 'c' into the buffer at 'p'.
//
// Values below 0x80 take one byte, below 0x800 two bytes, below 0x10000
// three bytes and below 0x200000 four bytes.  Anything larger is not
// encoded and the buffer is left untouched.  No NUL terminator is added.
//
// Returns the position just past the last byte written, or 'p' itself
// when 'c' is too large to encode.

char *TXT_EncodeUTF8(char *p, unsigned int c)
{
    unsigned int lead;      // marker bits for the first byte
    unsigned int trailing;  // number of continuation bytes that follow
    unsigned int i;

    // Pick the sequence length from the magnitude of the value.

    if (c < 0x80)
    {
        lead = 0x00;
        trailing = 0;
    }
    else if (c < 0x800)
    {
        lead = 0xc0;
        trailing = 1;
    }
    else if (c < 0x10000)
    {
        lead = 0xe0;
        trailing = 2;
    }
    else if (c < 0x200000)
    {
        lead = 0xf0;
        trailing = 3;
    }
    else
    {
        // Too big to represent in four bytes.

        return p;
    }

    // The first byte holds the marker plus the topmost payload bits;
    // every continuation byte then carries the next six bits down.

    p[0] = lead | (c >> (6 * trailing));

    for (i = 1; i <= trailing; ++i)
    {
        p[i] = 0x80 | ((c >> (6 * (trailing - i))) & 0x3f);
    }

    return p + trailing + 1;
}
65 
// Decode one UTF-8 encoded character starting at *ptr.
//
// On success the decoded value is returned and *ptr is advanced past
// the bytes that made up the character.  On failure 0 is returned and
// *ptr is left unchanged.  Failure means: an invalid lead byte, a
// missing or malformed continuation byte, a 5/6 byte sequence, or an
// overlong sequence (a value stored in more bytes than it needs, such
// as C0 80 or C1 81).
//
// A NUL byte decodes as the value 0 and advances *ptr by one, so a
// return of 0 does not by itself distinguish "end of string" from
// "decode failure"; callers should test for the terminator themselves
// before calling.
//
// Four byte sequences up to 0x1fffff are accepted, matching the range
// that TXT_EncodeUTF8 is able to produce.

unsigned int TXT_DecodeUTF8(const char **ptr)
{
    // Read through unsigned char so that bytes >= 0x80 are never seen
    // as negative values, whatever the signedness of plain char.
    const unsigned char *p = (const unsigned char *) *ptr;
    unsigned int c;
    unsigned int min;   // smallest value that legitimately needs 'len' bytes
    unsigned int len;
    unsigned int i;

    if (p[0] < 0x80)                          // 1 character (ASCII):
    {
        *ptr += 1;
        return p[0];
    }
    else if ((p[0] & 0xe0) == 0xc0)           // 2 character:
    {
        len = 2;
        min = 0x80;
        c = p[0] & 0x1f;
    }
    else if ((p[0] & 0xf0) == 0xe0)           // 3 character:
    {
        len = 3;
        min = 0x800;
        c = p[0] & 0x0f;
    }
    else if ((p[0] & 0xf8) == 0xf0)           // 4 character:
    {
        len = 4;
        min = 0x10000;
        c = p[0] & 0x07;
    }
    else
    {
        // Stray continuation byte or a 5/6 byte lead: don't bother.

        return 0;
    }

    // Continuation bytes are examined strictly in order and decoding
    // stops at the first one that is not of the form 10xxxxxx.  A NUL
    // terminator fails that check, so a sequence truncated by the end
    // of the string never causes a read beyond the terminator.

    for (i = 1; i < len; ++i)
    {
        if ((p[i] & 0xc0) != 0x80)
        {
            return 0;
        }

        c = (c << 6) | (p[i] & 0x3f);
    }

    // Reject overlong forms: the value would have fit a shorter
    // sequence, so this is not a valid encoding of it.

    if (c < min)
    {
        return 0;
    }

    *ptr += len;

    return c;
}
117 
// Count the characters (not bytes) in the NUL-terminated UTF-8 string 's'.
//
// Counting stops at the terminator, or earlier at the first sequence
// for which TXT_DecodeUTF8 returns 0; in that case only the characters
// before that point are counted.

unsigned int TXT_UTF8_Strlen(const char *s)
{
    const char *cursor = s;
    unsigned int count = 0;

    // The terminator check comes first so the decoder is never asked
    // to decode the NUL byte itself.

    while (*cursor != '\0' && TXT_DecodeUTF8(&cursor) != 0)
    {
        ++count;
    }

    return count;
}
140 
// Skip past the first n characters in the NUL-terminated UTF-8 string 's'.
//
// Returns a pointer to the character that follows the first n
// characters.  If the string holds fewer than n characters, the
// returned pointer is the NUL terminator.  If a sequence is hit for
// which TXT_DecodeUTF8 returns 0, the returned pointer is the start of
// that sequence.  The result never points beyond the terminator.
//
// The return type drops the const qualifier of 's'; the pointer still
// refers into the caller's string.

char *TXT_UTF8_SkipChars(const char *s, unsigned int n)
{
    unsigned int i;
    const char *p;

    p = s;

    // Stop at the terminator before decoding: TXT_DecodeUTF8 treats a
    // NUL byte as a one byte character and steps over it, which would
    // leave p one past the end of the string.

    for (i = 0; i < n && *p != '\0'; ++i)
    {
        const char *start = p;

        if (TXT_DecodeUTF8(&p) == 0)
        {
            // Nothing usable was decoded; make sure no bytes are
            // skipped for it, whatever the decoder did with p.

            p = start;
            break;
        }
    }

    return (char *) p;
}
160 
161