1 /* Copyright (C) 2008 Vincent Penquerc'h.
2    This file is part of the Kate codec library.
3    Written by Vincent Penquerc'h.
4 
5    Use, distribution and reproduction of this library is governed
6    by a BSD style source license included with this source in the
7    file 'COPYING'. Please read these terms before distributing. */
8 
9 
10 #define KATE_INTERNAL
11 #include "kate_internal.h"
12 
13 #ifdef HAVE_STRING_H
14 #include <string.h>
15 #endif
16 #include "kate/kate.h"
17 
18 /* #define ENABLE_CODE_POINTS_ABOVE_0x10ffff */
19 
/* Tells whether c is a code point this library accepts.
   Rejected are the UTF-16 surrogates (0xd800-0xdfff), the values 0xfffe and
   0xffff, negative values, and anything above the largest supported code
   point (0x10ffff, or 0x7fffffff if ENABLE_CODE_POINTS_ABOVE_0x10ffff is
   defined).
   Deliberately not declared inline: from C99 onwards, a file scope function
   marked inline without extern is only an inline definition, and is not
   guaranteed to provide the external definition other files link against.
   Returns 1 if c is acceptable, 0 otherwise. */
int kate_is_valid_code_point(int c)
{
  if (c>=0xd800 && c<=0xdfff) return 0; /* UTF-16 surrogates */
  if (c>=0xfffe && c<=0xffff) return 0; /* Markus Kuhn's UTF-8 test files says these are invalid */

#ifdef ENABLE_CODE_POINTS_ABOVE_0x10ffff
  return c>=0 && c<=0x7fffffff;
#else
  return c>=0 && c<=0x10ffff;
#endif
}
32 
/* Gives the number of bytes the UTF-8 encoding of code point c takes,
   or -1 if kate_is_valid_code_point rejects c. */
static inline int get_bytes_for_code_point(int c) __attribute__((const));
static inline int get_bytes_for_code_point(int c)
{
  int bytes=-1;

  if (kate_is_valid_code_point(c)) {
    /* each step up in length adds room for more payload bits */
    if (c<0x80) bytes=1;
    else if (c<0x800) bytes=2;
    else if (c<0x10000) bytes=3;
#ifdef ENABLE_CODE_POINTS_ABOVE_0x10ffff
    else if (c<0x00200000) bytes=4;
    else if (c<0x04000000) bytes=5;
    else if (c<=0x7fffffff) bytes=6;
#else
    else if (c<0x110000) bytes=4;
#endif
  }

  return bytes;
}
49 
kate_text_utf8_read(const char * s,int * cp)50 static int kate_text_utf8_read(const char *s,int *cp)
51 {
52   int c;
53 
54   if (!s) return KATE_E_INVALID_PARAMETER;
55 
56   c=0;
57 
58   if (((*s)&0x80)==0) {
59     /* 1 byte */
60     c=*s;
61 
62     *cp=c;
63     return 1;
64   }
65   else if (((*s)&0xe0)==0xc0) {
66     /* 2 bytes */
67     c|=(((*s)&0x1f)<<6);
68     s++;
69     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
70     c|=((*s)&0x3f);
71     if (c<=0x7f) return KATE_E_TEXT;
72 
73     *cp=c;
74     return 2;
75   }
76   else if (((*s)&0xf0)==0xe0) {
77     /* 3 bytes */
78     c|=(((*s)&0xf)<<12);
79     s++;
80     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
81     c|=(((*s)&0x3f)<<6);
82     s++;
83     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
84     c|=(((*s)&0x3f));
85     if (c<=0x7ff) return KATE_E_TEXT;
86 
87     *cp=c;
88     return 3;
89   }
90   else if (((*s)&0xf8)==0xf0) {
91     /* 4 bytes */
92     c|=(((*s)&0x7)<<18);
93     s++;
94     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
95     c|=(((*s)&0x3f)<<12);
96     s++;
97     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
98     c|=(((*s)&0x3f)<<6);
99     s++;
100     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
101     c|=(((*s)&0x3f));
102     if (c<=0xffff) return KATE_E_TEXT;
103 
104     *cp=c;
105     return 4;
106   }
107 #ifdef ENABLE_CODE_POINTS_ABOVE_0x10ffff
108   /* 5 and 6 bytes are for unicode code points not yet assigned */
109   else if (((*s)&0xfc)==0xf8) {
110     /* 5 bytes */
111     c|=(((*s)&0x3)<<24);
112     s++;
113     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
114     c|=(((*s)&0x3f)<<18);
115     s++;
116     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
117     c|=(((*s)&0x3f)<<12);
118     s++;
119     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
120     c|=(((*s)&0x3f)<<6);
121     s++;
122     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
123     c|=(((*s)&0x3f));
124     if (c<=0x001fffff) return KATE_E_TEXT;
125 
126     *cp=c;
127     return 5;
128   }
129   else if (((*s)&0xfe)==0xfc) {
130     /* 6 bytes */
131     c|=(((*s)&0x1)<<30);
132     s++;
133     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
134     c|=(((*s)&0x3f)<<24);
135     s++;
136     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
137     c|=(((*s)&0x3f)<<18);
138     s++;
139     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
140     c|=(((*s)&0x3f)<<12);
141     s++;
142     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
143     c|=(((*s)&0x3f)<<6);
144     s++;
145     if (((*s)&0xc0)!=0x80) return KATE_E_TEXT;
146     c|=(((*s)&0x3f));
147     if (c<=0x03ffffff) return KATE_E_TEXT;
148 
149     *cp=c;
150     return 6;
151   }
152 #endif
153 
154   return KATE_E_TEXT;
155 }
156 
kate_text_utf8_write(char * s,int cp)157 static int kate_text_utf8_write(char *s,int cp)
158 {
159   int bytes;
160 
161   if (!s) return KATE_E_INVALID_PARAMETER;
162   if (!kate_is_valid_code_point(cp)) return KATE_E_INVALID_PARAMETER;
163 
164   bytes=get_bytes_for_code_point(cp);
165   if (bytes<=0) return KATE_E_INVALID_PARAMETER;
166 
167   switch (bytes) {
168   case 1:
169     *s++=cp;
170     break;
171   case 2:
172     *s++=0xc0 | (cp>>6);
173     *s++=0x80 | (cp&0x3f);
174     break;
175   case 3:
176     *s++=0xe0 | (cp>>12);
177     *s++=0x80 | ((cp>>6)&0x3f);
178     *s++=0x80 | (cp&0x3f);
179     break;
180   case 4:
181     *s++=0xf0 | (cp>>18);
182     *s++=0x80 | ((cp>>12)&0x3f);
183     *s++=0x80 | ((cp>>6)&0x3f);
184     *s++=0x80 | (cp&0x3f);
185     break;
186 #ifdef ENABLE_CODE_POINTS_ABOVE_0x10ffff
187   case 5:
188     *s++=0xf8 | (cp>>24);
189     *s++=0x80 | ((cp>>18)&0x3f);
190     *s++=0x80 | ((cp>>12)&0x3f);
191     *s++=0x80 | ((cp>>6)&0x3f);
192     *s++=0x80 | (cp&0x3f);
193     break;
194   case 6:
195     *s++=0xfc | (cp>>30);
196     *s++=0x80 | ((cp>>24)&0x3f);
197     *s++=0x80 | ((cp>>18)&0x3f);
198     *s++=0x80 | ((cp>>12)&0x3f);
199     *s++=0x80 | ((cp>>6)&0x3f);
200     *s++=0x80 | (cp&0x3f);
201     break;
202 #endif
203   default:
204     return KATE_E_INVALID_PARAMETER;
205   }
206   return bytes;
207 }
208 
209 /**
210   \ingroup text
211   Reads a code point from the string, advancing the text pointer to the next one.
212   \param text_encoding the character encoding the text is coded in
213   \param text a pointer to the text pointer, to be advanced to the next character on success
214   \param len0 a pointer to the length of the buffer, including any terminating NUL, to be decreased by the number of bytes that text is advanced on success
215   \returns >=0 success, the unicode code point read
216   \returns KATE_E_* error
217   */
kate_text_get_character(kate_text_encoding text_encoding,const char ** const text,size_t * len0)218 int kate_text_get_character(kate_text_encoding text_encoding,const char ** const text,size_t *len0)
219 {
220   const char *new_text;
221   int c,ret;
222   size_t bytes;
223 
224   if (!text || !len0) return KATE_E_INVALID_PARAMETER;
225 
226   switch (text_encoding) {
227     case kate_utf8:
228       new_text=*text;
229       ret=kate_text_utf8_read(new_text,&c);
230       if (ret<0) return ret;
231       bytes=ret;
232       if (bytes>*len0) return KATE_E_TEXT;
233       *len0-=bytes;
234       *text+=bytes;
235       return c;
236     default:
237       return KATE_E_INVALID_PARAMETER;
238   }
239 }
240 
241 /**
242   \ingroup text
243   Writes a code point to the given string, advancing the text pointer to the next byte.
244   \param text_encoding the character encoding the text is coded in
245   \param c the unicode code point to write to the string
246   \param text a pointer to the text pointer, to be advanced to the next character on success
247   \param len0 a pointer to the length of the buffer, including any terminating NUL, to be decreased by the number of bytes that text is advanced on success
248   \returns >=0 success, the number of bytes used to write this code point
249   \returns KATE_E_* error
250   */
kate_text_set_character(kate_text_encoding text_encoding,int c,char ** const text,size_t * len0)251 int kate_text_set_character(kate_text_encoding text_encoding,int c,char ** const text,size_t *len0)
252 {
253   char tmp[8]={0};
254   size_t bytes;
255   int ret;
256 
257   if (!text || !len0) return KATE_E_INVALID_PARAMETER;
258 
259   switch (text_encoding) {
260     case kate_utf8:
261       ret=kate_text_utf8_write(tmp,c);
262       if (ret<0) return ret;
263       bytes=ret;
264       if (bytes>*len0) return KATE_E_TEXT;
265       memcpy(*text,tmp,bytes);
266       *text+=bytes;
267       *len0-=bytes;
268       return bytes;
269     default:
270       return KATE_E_INVALID_PARAMETER;
271   }
272 }
273 
274 /**
275   \ingroup text
276   Removes markup from the given text.
277   \param text_encoding the character encoding the text is coded in
278   \param text the text to remove markup from
279   \param len0 the length in bytes of the text, including any terminating NUL - will be set to the length of the text with markup removed, including any terminating NUL
280   \returns 0 success
281   \returns KATE_E_* error
282   */
kate_text_remove_markup(kate_text_encoding text_encoding,char * text,size_t * len0)283 int kate_text_remove_markup(kate_text_encoding text_encoding,char *text,size_t *len0)
284 {
285   char *r=text,*w=text;
286   int in_tag=0;
287   size_t n;
288 
289   if (!text || !len0) return KATE_E_INVALID_PARAMETER;
290 
291   switch (text_encoding) {
292     case kate_utf8:
293       while (*r && (size_t)(r-text)<*len0) {
294         int ret,c;
295         ret=kate_text_utf8_read(r,&c);
296         if (ret<0) return ret;
297         r+=ret;
298         if (r>text+*len0) {
299           /* we went over the limit, we probably read dummy data, discard */
300           break;
301         }
302         if (c=='<') {
303           in_tag++;
304           /* add a newline if we just removed a br */
305           if (*len0>=3 && !strncmp(r,"br>",3)) {
306             ret=kate_text_utf8_write(w,'\n');
307             if (ret<0) return ret;
308             w+=ret;
309           }
310         }
311         if (!in_tag) {
312           ret=kate_text_utf8_write(w,c);
313           if (ret<0) return ret;
314           w+=ret;
315         }
316         if (c=='>') {
317           in_tag--;
318         }
319       }
320       /* zero all bytes we removed */
321       for (n=0;n<*len0-(w-text);++n) w[n]=0;
322       /* adjust len0 to new size */
323       *len0=w-text;
324       break;
325     default:
326       return KATE_E_INVALID_PARAMETER;
327   }
328 
329   return 0;
330 }
331 
332 /**
333   \ingroup text
334   Validates text for the given character encoding, flagging partial sequences and invalid sequences.
335   \param text_encoding the character encoding the text is coded in
336   \param text the text to validate (may include embedded NULs)
337   \param len0 the length in bytes of the text, including any terminating NUL
338   \returns 0 success
339   \returns KATE_E_* error
340   */
kate_text_validate(kate_text_encoding text_encoding,const char * text,size_t len0)341 int kate_text_validate(kate_text_encoding text_encoding,const char *text,size_t len0)
342 {
343   if (!text) return KATE_E_INVALID_PARAMETER;
344 
345   switch (text_encoding) {
346     case kate_utf8:
347       while (len0>0) {
348         int ret,c;
349         ret=kate_text_utf8_read(text,&c);
350         if (ret<0) return ret;
351         if (!kate_is_valid_code_point(c)) return KATE_E_TEXT;
352         if ((size_t)ret>len0) {
353           /* we went over the limit, we probably read dummy data, discard */
354           return KATE_E_TEXT;
355         }
356         text+=ret;
357         len0-=ret;
358       }
359       break;
360     default:
361       return KATE_E_INVALID_PARAMETER;
362   }
363 
364   return 0;
365 }
366 
367