1 /* $Id$
2
3 Part of SWI-Prolog
4
5 Author: Jan Wielemaker and Anjo Anjewierden
6 E-mail: jan@swi.psy.uva.nl
7 WWW: http://www.swi-prolog.org
8 Copyright (C): 1985-2002, University of Amsterdam
9
10 This library is free software; you can redistribute it and/or
11 modify it under the terms of the GNU Lesser General Public
12 License as published by the Free Software Foundation; either
13 version 2.1 of the License, or (at your option) any later version.
14
15 This library is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 Lesser General Public License for more details.
19
20 You should have received a copy of the GNU Lesser General Public
21 License along with this library; if not, write to the Free Software
22 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25 #include <string.h> /* get size_t */
26 #include "pl-utf8.h"
27
28 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
29 UTF-8 Decoding, based on http://www.cl.cam.ac.uk/~mgk25/unicode.html
30 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
31
#ifndef ISUTF8_CB
/* Fallback used only when no definition is already in scope: a UTF-8
   continuation byte has the bit pattern 10xxxxxx.
*/
#define ISUTF8_CB(c) (((c)&0xc0) == 0x80)
#endif

/* Both helpers refer to the parameter `in' of the function using them */
#define CONT(i)   ISUTF8_CB(in[i])        /* in[i] is a continuation byte */
#define VAL(i, s) ((in[i]&0x3f) << s)     /* low 6 bits of in[i], shifted by s */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
_PL__utf8_get_char(in, chr) decodes the sequence starting at `in', stores
the resulting code in *chr and returns a pointer to the first byte after
the decoded sequence.

Sequences of 2 up to 6 bytes are recognised from the lead byte. The
continuation bytes are tested left to right using &&, so no byte after
the first one that fails the test is read.

If in[0] is not a lead byte that is followed by the required number of
continuation bytes, the single byte *in is stored in *chr and in+1 is
returned. This conversion is a plain char to int conversion, so the
value is negative for bytes >= 0x80 where char is signed.

Overlong encodings are not rejected.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

char *
_PL__utf8_get_char(const char *in, int *chr)
{                                       /* 2-byte, 0x80-0x7ff */
  if ( (in[0]&0xe0) == 0xc0 && CONT(1) )
  { *chr = ((in[0]&0x1f) << 6)|VAL(1,0);
    return (char *)in+2;
  }
                                        /* 3-byte, 0x800-0xffff */
  if ( (in[0]&0xf0) == 0xe0 && CONT(1) && CONT(2) )
  { *chr = ((in[0]&0xf) << 12)|VAL(1,6)|VAL(2,0);
    return (char *)in+3;
  }
                                        /* 4-byte, 0x10000-0x1FFFFF */
  if ( (in[0]&0xf8) == 0xf0 && CONT(1) && CONT(2) && CONT(3) )
  { *chr = ((in[0]&0x7) << 18)|VAL(1,12)|VAL(2,6)|VAL(3,0);
    return (char *)in+4;
  }
                                        /* 5-byte, 0x200000-0x3FFFFFF */
  if ( (in[0]&0xfc) == 0xf8 && CONT(1) && CONT(2) && CONT(3) && CONT(4) )
  { *chr = ((in[0]&0x3) << 24)|VAL(1,18)|VAL(2,12)|VAL(3,6)|VAL(4,0);
    return (char *)in+5;
  }
                                        /* 6-byte, 0x4000000-0x7FFFFFFF */
  if ( (in[0]&0xfe) == 0xfc && CONT(1) && CONT(2) && CONT(3) && CONT(4) && CONT(5) )
  { *chr = ((in[0]&0x1) << 30)|VAL(1,24)|VAL(2,18)|VAL(3,12)|VAL(4,6)|VAL(5,0);
    return (char *)in+6;                /* skip all six bytes consumed */
  }

  *chr = *in;                           /* not a valid multi-byte sequence */

  return (char *)in+1;
}
67
68
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
_PL__utf8_put_char(out, chr) writes the encoding of chr at `out' and
returns a pointer just after the last byte written. Between 1 and 6
bytes are emitted; the caller must provide the room.

Any chr below 0x80, which includes negative values, is stored as one
byte holding chr converted to char.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

char *
_PL__utf8_put_char(char *out, int chr)
{ int trail;                            /* # continuation bytes to emit */
  int marker;                           /* length bits of the lead byte */

  if ( chr < 0x80 )
  { *out++ = chr;
    return out;
  }

  if ( chr < 0x800 )
  { trail  = 1;
    marker = 0xc0;
  } else if ( chr < 0x10000 )
  { trail  = 2;
    marker = 0xe0;
  } else if ( chr < 0x200000 )
  { trail  = 3;
    marker = 0xf0;
  } else if ( chr < 0x4000000 )
  { trail  = 4;
    marker = 0xf8;
  } else if ( (unsigned)chr < 0x80000000 )
  { trail  = 5;
    marker = 0xfc;
  } else                                /* out of range: emit nothing */
  { return out;
  }

  /* The lead byte carries the bits above the 6*trail payload bits; a
     lead byte announcing `trail' followers has 6-trail payload bits.
  */
  *out++ = marker | ((chr >> (6*trail)) & (0x3f >> trail));

  while ( trail-- > 0 )                 /* 6 bits each, high to low */
    *out++ = 0x80 | ((chr >> (6*trail)) & 0x3f);

  return out;
}
102
103
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
utf8_strlen(s, len) counts the characters in the len bytes starting at
s by stepping through them with utf8_get_char() and returns that count.

The end of the input is only tested before each step, so a sequence that
starts before s+len is counted as one character even if utf8_get_char()
advances beyond s+len.

NOTE(review): utf8_get_char() is not defined in this file; presumably it
comes from pl-utf8.h.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

size_t
utf8_strlen(const char *s, size_t len)
{ const char *e = &s[len];
  size_t l = 0;                         /* same type as the result, so
                                           the count is never narrowed */

  while(s<e)
  { int chr;                            /* decoded value is not used */

    s = utf8_get_char(s, &chr);
    l++;
  }

  return l;
}
118