1 /*  $Id$
2 
3     Part of SWI-Prolog
4 
5     Author:        Jan Wielemaker and Anjo Anjewierden
6     E-mail:        jan@swi.psy.uva.nl
7     WWW:           http://www.swi-prolog.org
8     Copyright (C): 1985-2002, University of Amsterdam
9 
10     This library is free software; you can redistribute it and/or
11     modify it under the terms of the GNU Lesser General Public
12     License as published by the Free Software Foundation; either
13     version 2.1 of the License, or (at your option) any later version.
14 
15     This library is distributed in the hope that it will be useful,
16     but WITHOUT ANY WARRANTY; without even the implied warranty of
17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18     Lesser General Public License for more details.
19 
20     You should have received a copy of the GNU Lesser General Public
21     License along with this library; if not, write to the Free Software
22     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23 */
24 
25 #include <string.h>			/* get size_t */
26 #include "pl-utf8.h"
27 
28 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
29 UTF-8 Decoding, based on http://www.cl.cam.ac.uk/~mgk25/unicode.html
30 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
31 
#ifndef ISUTF8_CB
/* Fallback used only when the included header does not supply the
   predicate: a UTF-8 continuation byte has the bit pattern 10xxxxxx.
*/
#define ISUTF8_CB(c) (((c)&0xc0) == 0x80)
#endif

#define CONT(i)   ISUTF8_CB(in[i])	/* in[i] is a continuation byte */
#define VAL(i, s) ((in[i]&0x3f) << s)	/* 6 payload bits of in[i], shifted */

/* Decode one character from the UTF-8 text at `in` and store its code
   in *chr.  Returns a pointer to the first byte after the bytes that
   were consumed.

   Sequences of 2 to 6 bytes are recognised by their lead byte followed
   by the required number of continuation bytes.  The continuation
   tests are evaluated left to right and stop at the first byte that is
   not a continuation byte, so a 0-byte terminator ends the scan.  No
   check is made for overlong encodings.

   If `in` does not start a well-formed multi-byte sequence, the single
   byte in[0] is stored as a value in the range 0..255 and one byte is
   consumed.
*/

char *
_PL__utf8_get_char(const char *in, int *chr)
{ 					/* 2-byte, 0x80-0x7ff */
  if ( (in[0]&0xe0) == 0xc0 && CONT(1) )
  { *chr = ((in[0]&0x1f) << 6)|VAL(1,0);
    return (char *)in+2;
  }
					/* 3-byte, 0x800-0xffff */
  if ( (in[0]&0xf0) == 0xe0 && CONT(1) && CONT(2) )
  { *chr = ((in[0]&0xf) << 12)|VAL(1,6)|VAL(2,0);
    return (char *)in+3;
  }
					/* 4-byte, 0x10000-0x1FFFFF */
  if ( (in[0]&0xf8) == 0xf0 && CONT(1) && CONT(2) && CONT(3) )
  { *chr = ((in[0]&0x7) << 18)|VAL(1,12)|VAL(2,6)|VAL(3,0);
    return (char *)in+4;
  }
					/* 5-byte, 0x200000-0x3FFFFFF */
  if ( (in[0]&0xfc) == 0xf8 && CONT(1) && CONT(2) && CONT(3) && CONT(4) )
  { *chr = ((in[0]&0x3) << 24)|VAL(1,18)|VAL(2,12)|VAL(3,6)|VAL(4,0);
    return (char *)in+5;
  }
					/* 6-byte, 0x4000000-0x7FFFFFFF */
  if ( (in[0]&0xfe) == 0xfc && CONT(1) && CONT(2) && CONT(3) && CONT(4) && CONT(5) )
  { *chr = ((in[0]&0x1) << 30)|VAL(1,24)|VAL(2,18)|VAL(3,12)|VAL(4,6)|VAL(5,0);
    return (char *)in+6;		/* all six bytes are consumed */
  }

  /* Not a valid sequence: hand back the raw byte.  The mask keeps the
     result non-negative when plain `char` is a signed type.
  */
  *chr = (*in)&0xff;

  return (char *)in+1;
}
67 
68 
/* Encode the code `chr` as UTF-8 into the buffer at `out` and return a
   pointer just past the last byte written.  Between 1 and 6 bytes are
   written; the caller must supply enough room.

   Any value below 0x80 is written as a single byte.  That includes
   negative values, of which only the low byte is stored.
*/

char *
_PL__utf8_put_char(char *out, int chr)
{ int tail;				/* # continuation bytes to emit */
  int lead;				/* length marker of the lead byte */
  int mask;				/* payload bits kept in the lead byte */

  if ( chr < 0x80 )
  { *out++ = chr;
    return out;
  }

  if ( chr < 0x800 )
  { tail = 1; lead = 0xc0; mask = 0x1f;
  } else if ( chr < 0x10000 )
  { tail = 2; lead = 0xe0; mask = 0x0f;
  } else if ( chr < 0x200000 )
  { tail = 3; lead = 0xf0; mask = 0x07;
  } else if ( chr < 0x4000000 )
  { tail = 4; lead = 0xf8; mask = 0x03;
  } else				/* remaining non-negative values */
  { tail = 5; lead = 0xfc; mask = 0x01;
  }

  /* The lead byte carries the topmost payload bits; each following
     byte carries the next 6 bits, most significant group first.
  */
  *out++ = lead|((chr>>(6*tail))&mask);
  while ( tail-- > 0 )
    *out++ = 0x80|((chr>>(6*tail))&0x3f);

  return out;
}
102 
103 
/* Count the characters in the `len` bytes of UTF-8 text starting at
   `s`.  The text is walked with utf8_get_char() (defined outside this
   block; presumably supplied by pl-utf8.h) and every step counts as one
   character, whatever number of bytes it consumed.  Scanning stops as
   soon as the position reaches or passes s+len.

   The counter has the same type as the return value, so the result
   does not wrap for inputs holding more than UINT_MAX characters.
*/

size_t
utf8_strlen(const char *s, size_t len)
{ const char *e = &s[len];
  size_t l = 0;

  while(s<e)
  { int chr;

    s = utf8_get_char(s, &chr);
    l++;
  }

  return l;
}
118