1 /* "$Id: $"
2  *
3  * Author: Jean-Marc Lienher ( http://oksid.ch )
4  * Copyright 2000-2003 by O'ksi'D.
5  *
6  * This library is free software. Distribution and use rights are outlined in
7  * the file "COPYING" which should have been included with this file.  If this
8  * file is missing or damaged, see the license at:
9  *
10  *     http://www.fltk.org/COPYING.php
11  *
12  * Please report all bugs and problems on the following page:
13  *
14  *     http://www.fltk.org/str.php
15  */
16 
17 /*
18  * Unicode to UTF-8 conversion functions.
19  */
20 
21 #if !defined(WIN32) && !defined(__APPLE__)
22 
23 #include "../Xutf8.h"
24 
25 /*** NOTE : all functions are LIMITED to 24 bits Unicode values !!! ***/
26 
/*
 * Converts the first char of the UTF-8 string to a Unicode value.
 *
 *   buf : UTF-8 bytes to decode
 *   len : number of bytes available in buf (buf is not read if len < 1)
 *   ucs : receives the decoded value, or '?' when decoding fails
 *
 * Returns the byte length (1 to 5) of the converted UTF-8 char.
 * Returns -1 if the UTF-8 string is not valid: empty or truncated input,
 * a trail byte that is not of the form 10xxxxxx, an overlong encoding,
 * a lead byte of 0x80-0xBF or 0xFC-0xFF, or a value of 0x01000000 or more.
 */
int
XConvertUtf8ToUcs(const unsigned char     *buf,
                  int                     len,
                  unsigned int            *ucs) {

  unsigned int lead;
  unsigned int val;

  if (len < 1) {
    /* nothing to decode: buf[0] must not be touched */
    *ucs = (unsigned int) '?';
    return -1;
  }

  lead = buf[0];

  if ((lead & 0x80) == 0) {
    /* 0x00000000 - 0x0000007F */
    *ucs = lead;
    return 1;
  } else if ((lead & 0xE0) == 0xC0) {
    /* 0x00000080 - 0x000007FF */
    if (len > 1 && (buf[1] & 0xC0) == 0x80) {
      val = ((lead & 0x1F) << 6) +
            (unsigned int) (buf[1] & 0x3F);
      /* a smaller value would be an overlong encoding */
      if (val > 0x0000007F) {
        *ucs = val;
        return 2;
      }
    }
  } else if ((lead & 0xF0) == 0xE0) {
    /* 0x00000800 - 0x0000FFFF */
    if (len > 2
        && (buf[1] & 0xC0) == 0x80
        && (buf[2] & 0xC0) == 0x80) {
      val = ((lead & 0x0F) << 12) +
            ((unsigned int) (buf[1] & 0x3F) << 6) +
            (unsigned int) (buf[2] & 0x3F);
      if (val > 0x000007FF) {
        *ucs = val;
        return 3;
      }
    }
  } else if ((lead & 0xF8) == 0xF0) {
    /* 0x00010000 - 0x001FFFFF */
    if (len > 3
        && (buf[1] & 0xC0) == 0x80
        && (buf[2] & 0xC0) == 0x80
        && (buf[3] & 0xC0) == 0x80) {
      val = ((lead & 0x07) << 18) +
            ((unsigned int) (buf[1] & 0x3F) << 12) +
            ((unsigned int) (buf[2] & 0x3F) << 6) +
            (unsigned int) (buf[3] & 0x3F);
      if (val > 0x0000FFFF) {
        *ucs = val;
        return 4;
      }
    }
  } else if ((lead & 0xFC) == 0xF8) {
    /* 0x00200000 - 0x03FFFFFF, of which only 24 bits are accepted */
    if (len > 4
        && (buf[1] & 0xC0) == 0x80
        && (buf[2] & 0xC0) == 0x80
        && (buf[3] & 0xC0) == 0x80
        && (buf[4] & 0xC0) == 0x80) {
      val = ((lead & 0x03) << 24) +
            ((unsigned int) (buf[1] & 0x3F) << 18) +
            ((unsigned int) (buf[2] & 0x3F) << 12) +
            ((unsigned int) (buf[3] & 0x3F) << 6) +
            (unsigned int) (buf[4] & 0x3F);
      if (val > 0x001FFFFF && val < 0x01000000) {
        *ucs = val;
        return 5;
      }
    }
  }

  /*
   * Everything else lands here: a trail byte used as lead (0x80 - 0xBF),
   * the 0x04000000 - 0x7FFFFFFF lead bytes (0xFC - 0xFF), and every
   * sequence that failed one of the checks above.
   */
  *ucs = (unsigned int) '?'; /* bad utf-8 string */
  return -1;
}
97 
/*
 * Encodes one Unicode value as an UTF-8 byte sequence.
 *
 *   ucs : value to encode (only values below 0x01000000 are supported)
 *   buf : output buffer, no terminating byte is appended
 *
 * Returns the number of bytes written (1 to 5).
 * Returns -1, after storing '?' in buf[0], if ucs is out of range.
 * NOTE : the buffer (buf) must be at least 5 bytes long !!!
 */
int
XConvertUcsToUtf8(unsigned int  ucs,
                  char          *buf) {

  /* marker bits of the lead byte, indexed by (sequence length - 1) */
  static const unsigned char lead_mark[5] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8 };
  unsigned int rest = ucs;
  int nbytes;
  int pos;

  if (ucs < 0x000080) {
    nbytes = 1;
  } else if (ucs < 0x000800) {
    nbytes = 2;
  } else if (ucs < 0x010000) {
    nbytes = 3;
  } else if (ucs < 0x00200000) {
    nbytes = 4;
  } else if (ucs < 0x01000000) {
    nbytes = 5;
  } else {
    buf[0] = '?';
    return -1;
  }

  /* trail bytes are filled back to front, 6 payload bits at a time */
  for (pos = nbytes - 1; pos > 0; pos--) {
    buf[pos] = 0x80 | (rest & 0x3F);
    rest >>= 6;
  }

  /* whatever is left of the value goes into the lead byte */
  buf[0] = lead_mark[nbytes - 1] | rest;
  return nbytes;
}
135 
/*
 * Returns the byte length of the first UTF-8 char of buf, where len is
 * the number of bytes available in buf.
 * Returns -1 if XConvertUtf8ToUcs() reports the sequence as not valid.
 */
int
XUtf8CharByteLen(const unsigned char     *buf,
                 int                     len) {
  unsigned int discarded; /* decoded value, only the length is of interest */
  const int nbytes = XConvertUtf8ToUcs(buf, len, &discarded);

  return nbytes;
}
146 
/*
 * Returns the quantity of Unicode chars in the first len bytes of the
 * UTF-8 string buf. A byte that does not start a valid sequence is
 * counted as one char and skipped on its own.
 */
int
XCountUtf8Char(const unsigned char      *buf,
               int                      len) {

  int count = 0;
  int pos;

  for (pos = 0; pos < len; count++) {
    const int step = XUtf8CharByteLen(buf + pos, len - pos);

    /* always advance by at least one byte, even on an invalid sequence */
    pos += (step < 1) ? 1 : step;
  }
  return count;
}
164 
/*
 * Same as XConvertUtf8ToUcs but no sanity check is done on the decoded
 * sequence: trail bytes are not required to be of the form 10xxxxxx and
 * the resulting value is not range checked. Only the lead byte and the
 * available length are looked at.
 *
 *   buf : UTF-8 bytes to decode
 *   len : number of bytes available in buf (buf is not read if len < 1)
 *   ucs : receives the decoded value, or '?' when decoding fails
 *
 * Returns the byte length (1 to 5) announced by the lead byte.
 * Returns -1 if the input is empty or shorter than the announced length,
 * or if the lead byte is 0x80-0xBF or 0xFC-0xFF.
 */
int
XFastConvertUtf8ToUcs(const unsigned char     *buf,
                      int                     len,
                      unsigned int            *ucs) {

  unsigned int lead;

  if (len < 1) {
    /* nothing to decode: buf[0] must not be touched */
    *ucs = (unsigned int) '?';
    return -1;
  }

  lead = buf[0];

  /*
   * Trail bytes only lose their top bit (0x7F mask) and the parts are
   * added, not or-ed, so a malformed trail byte carries into the field
   * above it instead of being rejected.
   */
  if ((lead & 0x80) == 0) {
    /* 0x00000000 - 0x0000007F */
    *ucs = lead;
    return 1;
  } else if ((lead & 0xE0) == 0xC0) {
    /* 0x00000080 - 0x000007FF */
    if (len > 1) {
      *ucs = ((lead & 0x1F) << 6) +
             (unsigned int) (buf[1] & 0x7F);
      return 2;
    }
  } else if ((lead & 0xF0) == 0xE0) {
    /* 0x00000800 - 0x0000FFFF */
    if (len > 2) {
      *ucs = ((lead & 0x0F) << 12) +
             ((unsigned int) (buf[1] & 0x7F) << 6) +
             (unsigned int) (buf[2] & 0x7F);
      return 3;
    }
  } else if ((lead & 0xF8) == 0xF0) {
    /* 0x00010000 - 0x001FFFFF */
    if (len > 3) {
      *ucs = ((lead & 0x07) << 18) +
             ((unsigned int) (buf[1] & 0x7F) << 12) +
             ((unsigned int) (buf[2] & 0x7F) << 6) +
             (unsigned int) (buf[3] & 0x7F);
      return 4;
    }
  } else if ((lead & 0xFC) == 0xF8) {
    /* 0x00200000 - 0x03FFFFFF */
    if (len > 4) {
      *ucs = ((lead & 0x03) << 24) +
             ((unsigned int) (buf[1] & 0x7F) << 18) +
             ((unsigned int) (buf[2] & 0x7F) << 12) +
             ((unsigned int) (buf[3] & 0x7F) << 6) +
             (unsigned int) (buf[4] & 0x7F);
      return 5;
    }
  }

  /*
   * Trail byte used as lead (0x80 - 0xBF), unsupported lead bytes
   * (0xFC - 0xFF) or a sequence longer than the available input.
   */
  *ucs = (unsigned int) '?'; /* bad utf-8 string */
  return -1;
}
224 
225 #endif /* X11 only */
226 
227 /*
228  * End of "$Id: $".
229  */
230