1 /* Copyright (C) 2008 Vincent Penquerc'h.
2 This file is part of the Kate codec library.
3 Written by Vincent Penquerc'h.
4
5 Use, distribution and reproduction of this library is governed
6 by a BSD style source license included with this source in the
7 file 'COPYING'. Please read these terms before distributing. */
8
9
10 #define KATE_INTERNAL
11 #include "kate_internal.h"
12
13 #ifdef HAVE_STRING_H
14 #include <string.h>
15 #endif
16 #include "kate/kate.h"
17
18 /* #define ENABLE_CODE_POINTS_ABOVE_0x10ffff */
19
/*
  Tells whether c is a code point this library accepts.

  Rejected values:
    - negative values
    - UTF-16 surrogates (0xd800 to 0xdfff)
    - 0xfffe and 0xffff (Markus Kuhn's UTF-8 test files say these are invalid)
    - anything above 0x10ffff, unless ENABLE_CODE_POINTS_ABOVE_0x10ffff is
      defined, in which case the limit is 0x7fffffff

  Returns 1 if c is accepted, 0 otherwise.

  This is deliberately not declared `inline`: with C99 and later semantics, a
  plain `inline` definition of a function with external linkage does not
  provide an external definition, so calls that are not inlined would fail
  to link unless another declaration happened to supply `extern`.
*/
int kate_is_valid_code_point(int c)
{
  if (c<0) return 0;
  if (c>=0xd800 && c<=0xdfff) return 0; /* UTF-16 surrogates */
  if (c==0xfffe || c==0xffff) return 0; /* flagged invalid by Markus Kuhn's UTF-8 test files */

#ifdef ENABLE_CODE_POINTS_ABOVE_0x10ffff
  return c<=0x7fffffff;
#else
  return c<=0x10ffff;
#endif
}
32
/*
  Returns the number of bytes needed to encode code point c as UTF-8,
  or -1 if c is rejected by kate_is_valid_code_point.
*/
static inline int get_bytes_for_code_point(int c) __attribute__((const));
static inline int get_bytes_for_code_point(int c)
{
  /* largest[n] is the highest code point that fits in n+1 bytes */
  static const int largest[]={
    0x7f,
    0x7ff,
    0xffff,
#ifdef ENABLE_CODE_POINTS_ABOVE_0x10ffff
    0x001fffff,
    0x03ffffff,
    0x7fffffff
#else
    0x10ffff
#endif
  };
  const int count=(int)(sizeof(largest)/sizeof(largest[0]));
  int n;

  if (!kate_is_valid_code_point(c)) return -1;

  for (n=0;n<count;++n) {
    if (c<=largest[n]) return n+1;
  }
  return -1;
}
49
kate_text_utf8_read(const char * s,int * cp)50 static int kate_text_utf8_read(const char *s,int *cp)
51 {
52 int c;
53
54 if (!s) return KATE_E_INVALID_PARAMETER;
55
56 c=0;
57
58 if (((*s)&0x80)==0) {
59 /* 1 byte */
60 c=*s;
61
62 *cp=c;
63 return 1;
64 }
65 else if (((*s)&0xe0)==0xc0) {
66 /* 2 bytes */
67 c|=(((*s)&0x1f)<<6);
68 s++;
69 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
70 c|=((*s)&0x3f);
71 if (c<=0x7f) return KATE_E_TEXT;
72
73 *cp=c;
74 return 2;
75 }
76 else if (((*s)&0xf0)==0xe0) {
77 /* 3 bytes */
78 c|=(((*s)&0xf)<<12);
79 s++;
80 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
81 c|=(((*s)&0x3f)<<6);
82 s++;
83 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
84 c|=(((*s)&0x3f));
85 if (c<=0x7ff) return KATE_E_TEXT;
86
87 *cp=c;
88 return 3;
89 }
90 else if (((*s)&0xf8)==0xf0) {
91 /* 4 bytes */
92 c|=(((*s)&0x7)<<18);
93 s++;
94 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
95 c|=(((*s)&0x3f)<<12);
96 s++;
97 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
98 c|=(((*s)&0x3f)<<6);
99 s++;
100 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
101 c|=(((*s)&0x3f));
102 if (c<=0xffff) return KATE_E_TEXT;
103
104 *cp=c;
105 return 4;
106 }
107 #ifdef ENABLE_CODE_POINTS_ABOVE_0x10ffff
108 /* 5 and 6 bytes are for unicode code points not yet assigned */
109 else if (((*s)&0xfc)==0xf8) {
110 /* 5 bytes */
111 c|=(((*s)&0x3)<<24);
112 s++;
113 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
114 c|=(((*s)&0x3f)<<18);
115 s++;
116 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
117 c|=(((*s)&0x3f)<<12);
118 s++;
119 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
120 c|=(((*s)&0x3f)<<6);
121 s++;
122 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
123 c|=(((*s)&0x3f));
124 if (c<=0x001fffff) return KATE_E_TEXT;
125
126 *cp=c;
127 return 5;
128 }
129 else if (((*s)&0xfe)==0xfc) {
130 /* 6 bytes */
131 c|=(((*s)&0x1)<<30);
132 s++;
133 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
134 c|=(((*s)&0x3f)<<24);
135 s++;
136 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
137 c|=(((*s)&0x3f)<<18);
138 s++;
139 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
140 c|=(((*s)&0x3f)<<12);
141 s++;
142 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
143 c|=(((*s)&0x3f)<<6);
144 s++;
145 if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
146 c|=(((*s)&0x3f));
147 if (c<=0x03ffffff) return KATE_E_TEXT;
148
149 *cp=c;
150 return 6;
151 }
152 #endif
153
154 return KATE_E_TEXT;
155 }
156
kate_text_utf8_write(char * s,int cp)157 static int kate_text_utf8_write(char *s,int cp)
158 {
159 int bytes;
160
161 if (!s) return KATE_E_INVALID_PARAMETER;
162 if (!kate_is_valid_code_point(cp)) return KATE_E_INVALID_PARAMETER;
163
164 bytes=get_bytes_for_code_point(cp);
165 if (bytes<=0) return KATE_E_INVALID_PARAMETER;
166
167 switch (bytes) {
168 case 1:
169 *s++=cp;
170 break;
171 case 2:
172 *s++=0xc0 | (cp>>6);
173 *s++=0x80 | (cp&0x3f);
174 break;
175 case 3:
176 *s++=0xe0 | (cp>>12);
177 *s++=0x80 | ((cp>>6)&0x3f);
178 *s++=0x80 | (cp&0x3f);
179 break;
180 case 4:
181 *s++=0xf0 | (cp>>18);
182 *s++=0x80 | ((cp>>12)&0x3f);
183 *s++=0x80 | ((cp>>6)&0x3f);
184 *s++=0x80 | (cp&0x3f);
185 break;
186 #ifdef ENABLE_CODE_POINTS_ABOVE_0x10ffff
187 case 5:
188 *s++=0xf8 | (cp>>24);
189 *s++=0x80 | ((cp>>18)&0x3f);
190 *s++=0x80 | ((cp>>12)&0x3f);
191 *s++=0x80 | ((cp>>6)&0x3f);
192 *s++=0x80 | (cp&0x3f);
193 break;
194 case 6:
195 *s++=0xfc | (cp>>30);
196 *s++=0x80 | ((cp>>24)&0x3f);
197 *s++=0x80 | ((cp>>18)&0x3f);
198 *s++=0x80 | ((cp>>12)&0x3f);
199 *s++=0x80 | ((cp>>6)&0x3f);
200 *s++=0x80 | (cp&0x3f);
201 break;
202 #endif
203 default:
204 return KATE_E_INVALID_PARAMETER;
205 }
206 return bytes;
207 }
208
209 /**
210 \ingroup text
211 Reads a code point from the string, advancing the text pointer to the next one.
212 \param text_encoding the character encoding the text is coded in
213 \param text a pointer to the text pointer, to be advanced to the next character on success
214 \param len0 a pointer to the length of the buffer, including any terminating NUL, to be decreased by the number of bytes that text is advanced on success
215 \returns >=0 success, the unicode code point read
216 \returns KATE_E_* error
217 */
kate_text_get_character(kate_text_encoding text_encoding,const char ** const text,size_t * len0)218 int kate_text_get_character(kate_text_encoding text_encoding,const char ** const text,size_t *len0)
219 {
220 const char *new_text;
221 int c,ret;
222 size_t bytes;
223
224 if (!text || !len0) return KATE_E_INVALID_PARAMETER;
225
226 switch (text_encoding) {
227 case kate_utf8:
228 new_text=*text;
229 ret=kate_text_utf8_read(new_text,&c);
230 if (ret<0) return ret;
231 bytes=ret;
232 if (bytes>*len0) return KATE_E_TEXT;
233 *len0-=bytes;
234 *text+=bytes;
235 return c;
236 default:
237 return KATE_E_INVALID_PARAMETER;
238 }
239 }
240
241 /**
242 \ingroup text
243 Writes a code point to the given string, advancing the text pointer to the next byte.
244 \param text_encoding the character encoding the text is coded in
245 \param c the unicode code point to write to the string
246 \param text a pointer to the text pointer, to be advanced to the next character on success
247 \param len0 a pointer to the length of the buffer, including any terminating NUL, to be decreased by the number of bytes that text is advanced on success
248 \returns >=0 success, the number of bytes used to write this code point
249 \returns KATE_E_* error
250 */
kate_text_set_character(kate_text_encoding text_encoding,int c,char ** const text,size_t * len0)251 int kate_text_set_character(kate_text_encoding text_encoding,int c,char ** const text,size_t *len0)
252 {
253 char tmp[8]={0};
254 size_t bytes;
255 int ret;
256
257 if (!text || !len0) return KATE_E_INVALID_PARAMETER;
258
259 switch (text_encoding) {
260 case kate_utf8:
261 ret=kate_text_utf8_write(tmp,c);
262 if (ret<0) return ret;
263 bytes=ret;
264 if (bytes>*len0) return KATE_E_TEXT;
265 memcpy(*text,tmp,bytes);
266 *text+=bytes;
267 *len0-=bytes;
268 return bytes;
269 default:
270 return KATE_E_INVALID_PARAMETER;
271 }
272 }
273
274 /**
275 \ingroup text
276 Removes markup from the given text.
277 \param text_encoding the character encoding the text is coded in
278 \param text the text to remove markup from
279 \param len0 the length in bytes of the text, including any terminating NUL - will be set to the length of the text with markup removed, including any terminating NUL
280 \returns 0 success
281 \returns KATE_E_* error
282 */
kate_text_remove_markup(kate_text_encoding text_encoding,char * text,size_t * len0)283 int kate_text_remove_markup(kate_text_encoding text_encoding,char *text,size_t *len0)
284 {
285 char *r=text,*w=text;
286 int in_tag=0;
287 size_t n;
288
289 if (!text || !len0) return KATE_E_INVALID_PARAMETER;
290
291 switch (text_encoding) {
292 case kate_utf8:
293 while (*r && (size_t)(r-text)<*len0) {
294 int ret,c;
295 ret=kate_text_utf8_read(r,&c);
296 if (ret<0) return ret;
297 r+=ret;
298 if (r>text+*len0) {
299 /* we went over the limit, we probably read dummy data, discard */
300 break;
301 }
302 if (c=='<') {
303 in_tag++;
304 /* add a newline if we just removed a br */
305 if (*len0>=3 && !strncmp(r,"br>",3)) {
306 ret=kate_text_utf8_write(w,'\n');
307 if (ret<0) return ret;
308 w+=ret;
309 }
310 }
311 if (!in_tag) {
312 ret=kate_text_utf8_write(w,c);
313 if (ret<0) return ret;
314 w+=ret;
315 }
316 if (c=='>') {
317 in_tag--;
318 }
319 }
320 /* zero all bytes we removed */
321 for (n=0;n<*len0-(w-text);++n) w[n]=0;
322 /* adjust len0 to new size */
323 *len0=w-text;
324 break;
325 default:
326 return KATE_E_INVALID_PARAMETER;
327 }
328
329 return 0;
330 }
331
332 /**
333 \ingroup text
334 Validates text for the given character encoding, flagging partial sequences and invalid sequences.
335 \param text_encoding the character encoding the text is coded in
336 \param text the text to validate (may include embedded NULs)
337 \param len0 the length in bytes of the text, including any terminating NUL
338 \returns 0 success
339 \returns KATE_E_* error
340 */
kate_text_validate(kate_text_encoding text_encoding,const char * text,size_t len0)341 int kate_text_validate(kate_text_encoding text_encoding,const char *text,size_t len0)
342 {
343 if (!text) return KATE_E_INVALID_PARAMETER;
344
345 switch (text_encoding) {
346 case kate_utf8:
347 while (len0>0) {
348 int ret,c;
349 ret=kate_text_utf8_read(text,&c);
350 if (ret<0) return ret;
351 if (!kate_is_valid_code_point(c)) return KATE_E_TEXT;
352 if ((size_t)ret>len0) {
353 /* we went over the limit, we probably read dummy data, discard */
354 return KATE_E_TEXT;
355 }
356 text+=ret;
357 len0-=ret;
358 }
359 break;
360 default:
361 return KATE_E_INVALID_PARAMETER;
362 }
363
364 return 0;
365 }
366
367