1 /* -*- Mode: C; indent-tabs-mode: nil; c-basic-offset: 8 -*- */
2 
3 /* libcroco - Library for parsing and applying CSS
4  * Copyright (C) 2006-2019 Free Software Foundation, Inc.
5  *
6  * This file is not part of the GNU gettext program, but is used with
7  * GNU gettext.
8  *
9  * The original copyright notice is as follows:
10  */
11 
12 /*
13  * This file is part of The Croco Library
14  *
15  * Copyright (C) 2003-2004 Dodji Seketeli.  All Rights Reserved.
16  *
17  * This program is free software; you can redistribute it and/or
18  * modify it under the terms of version 2.1 of the GNU Lesser General Public
19  * License as published by the Free Software Foundation.
20  *
21  * This program is distributed in the hope that it will be useful,
22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24  * GNU General Public License for more details.
25  *
26  * You should have received a copy of the GNU Lesser General Public License
27  * along with this program; if not, write to the Free Software
28  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
29  * USA
30  *
31  * Author: Dodji Seketeli
32  */
33 
34 #include <config.h>
35 #include "cr-utils.h"
36 #include "cr-string.h"
37 
/**
 *@file:
 *Some miscellaneous utility functions used
 *in libcroco.
 *Note that throughout this file I will
 *refer to the CSS SPECIFICATIONS DOCUMENTATION
 *written by the w3c guys. You can find that document
 *at http://www.w3.org/TR/REC-CSS2/ .
 */
47 
48 /****************************
49  *Encoding transformations and
50  *encoding helpers
51  ****************************/
52 
/*
 *Here is the correspondence between the ucs-4 character codes
 *and their matching utf-8 encoding pattern as described by RFC 2279:
 *
 *UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 *------------------    -----------------------------
 *0000 0000-0000 007F   0xxxxxxx
 *0000 0080-0000 07FF   110xxxxx 10xxxxxx
 *0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 *0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
 */
66 
67 /**
68  *Given an utf8 string buffer, calculates
69  *the length of this string if it was encoded
70  *in ucs4.
71  *@param a_in_start a pointer to the begining of
72  *the input utf8 string.
73  *@param a_in_end a pointre to the end of the input
74  *utf8 string (points to the last byte of the buffer)
75  *@param a_len out parameter the calculated length.
76  *@return CR_OK upon succesfull completion, an error code
77  *otherwise.
78  */
79 enum CRStatus
cr_utils_utf8_str_len_as_ucs4(const guchar * a_in_start,const guchar * a_in_end,gulong * a_len)80 cr_utils_utf8_str_len_as_ucs4 (const guchar * a_in_start,
81                                const guchar * a_in_end, gulong * a_len)
82 {
83         guchar *byte_ptr = NULL;
84         gint len = 0;
85 
86         /*
87          *to store the final decoded
88          *unicode char
89          */
90         guint c = 0;
91 
92         g_return_val_if_fail (a_in_start && a_in_end && a_len,
93                               CR_BAD_PARAM_ERROR);
94         *a_len = 0;
95 
96         for (byte_ptr = (guchar *) a_in_start;
97              byte_ptr <= a_in_end; byte_ptr++) {
98                 gint nb_bytes_2_decode = 0;
99 
100                 if (*byte_ptr <= 0x7F) {
101                         /*
102                          *7 bits long char
103                          *encoded over 1 byte:
104                          * 0xxx xxxx
105                          */
106                         c = *byte_ptr;
107                         nb_bytes_2_decode = 1;
108 
109                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
110                         /*
111                          *up to 11 bits long char.
112                          *encoded over 2 bytes:
113                          *110x xxxx  10xx xxxx
114                          */
115                         c = *byte_ptr & 0x1F;
116                         nb_bytes_2_decode = 2;
117 
118                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
119                         /*
120                          *up to 16 bit long char
121                          *encoded over 3 bytes:
122                          *1110 xxxx  10xx xxxx  10xx xxxx
123                          */
124                         c = *byte_ptr & 0x0F;
125                         nb_bytes_2_decode = 3;
126 
127                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
128                         /*
129                          *up to 21 bits long char
130                          *encoded over 4 bytes:
131                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
132                          */
133                         c = *byte_ptr & 0x7;
134                         nb_bytes_2_decode = 4;
135 
136                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
137                         /*
138                          *up to 26 bits long char
139                          *encoded over 5 bytes.
140                          *1111 10xx  10xx xxxx  10xx xxxx
141                          *10xx xxxx  10xx xxxx
142                          */
143                         c = *byte_ptr & 3;
144                         nb_bytes_2_decode = 5;
145 
146                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
147                         /*
148                          *up to 31 bits long char
149                          *encoded over 6 bytes:
150                          *1111 110x  10xx xxxx  10xx xxxx
151                          *10xx xxxx  10xx xxxx  10xx xxxx
152                          */
153                         c = *byte_ptr & 1;
154                         nb_bytes_2_decode = 6;
155 
156                 } else {
157                         /*
158                          *BAD ENCODING
159                          */
160                         return CR_ENCODING_ERROR;
161                 }
162 
163                 /*
164                  *Go and decode the remaining byte(s)
165                  *(if any) to get the current character.
166                  */
167                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
168                         /*decode the next byte */
169                         byte_ptr++;
170 
171                         /*byte pattern must be: 10xx xxxx */
172                         if ((*byte_ptr & 0xC0) != 0x80) {
173                                 return CR_ENCODING_ERROR;
174                         }
175 
176                         c = (c << 6) | (*byte_ptr & 0x3F);
177                 }
178 
179                 len++;
180         }
181 
182         *a_len = len;
183 
184         return CR_OK;
185 }
186 
187 /**
188  *Given an ucs4 string, this function
189  *returns the size (in bytes) this string
190  *would have occupied if it was encoded in utf-8.
191  *@param a_in_start a pointer to the beginning of the input
192  *buffer.
193  *@param a_in_end a pointer to the end of the input buffer.
194  *@param a_len out parameter. The computed length.
195  *@return CR_OK upon successfull completion, an error code otherwise.
196  */
197 enum CRStatus
cr_utils_ucs4_str_len_as_utf8(const guint32 * a_in_start,const guint32 * a_in_end,gulong * a_len)198 cr_utils_ucs4_str_len_as_utf8 (const guint32 * a_in_start,
199                                const guint32 * a_in_end, gulong * a_len)
200 {
201         gint len = 0;
202         guint32 *char_ptr = NULL;
203 
204         g_return_val_if_fail (a_in_start && a_in_end && a_len,
205                               CR_BAD_PARAM_ERROR);
206 
207         for (char_ptr = (guint32 *) a_in_start;
208              char_ptr <= a_in_end; char_ptr++) {
209                 if (*char_ptr <= 0x7F) {
210                         /*the utf-8 char would take 1 byte */
211                         len += 1;
212                 } else if (*char_ptr <= 0x7FF) {
213                         /*the utf-8 char would take 2 bytes */
214                         len += 2;
215                 } else if (*char_ptr <= 0xFFFF) {
216                         len += 3;
217                 } else if (*char_ptr <= 0x1FFFFF) {
218                         len += 4;
219                 } else if (*char_ptr <= 0x3FFFFFF) {
220                         len += 5;
221                 } else if (*char_ptr <= 0x7FFFFFFF) {
222                         len += 6;
223                 }
224         }
225 
226         *a_len = len;
227         return CR_OK;
228 }
229 
230 /**
231  *Given an ucsA string, this function
232  *returns the size (in bytes) this string
233  *would have occupied if it was encoded in utf-8.
234  *@param a_in_start a pointer to the beginning of the input
235  *buffer.
236  *@param a_in_end a pointer to the end of the input buffer.
237  *@param a_len out parameter. The computed length.
238  *@return CR_OK upon successfull completion, an error code otherwise.
239  */
240 enum CRStatus
cr_utils_ucs1_str_len_as_utf8(const guchar * a_in_start,const guchar * a_in_end,gulong * a_len)241 cr_utils_ucs1_str_len_as_utf8 (const guchar * a_in_start,
242                                const guchar * a_in_end, gulong * a_len)
243 {
244         gint len = 0;
245         guchar *char_ptr = NULL;
246 
247         g_return_val_if_fail (a_in_start && a_in_end && a_len,
248                               CR_BAD_PARAM_ERROR);
249 
250         for (char_ptr = (guchar *) a_in_start;
251              char_ptr <= a_in_end; char_ptr++) {
252                 if (*char_ptr <= 0x7F) {
253                         /*the utf-8 char would take 1 byte */
254                         len += 1;
255                 } else {
256                         /*the utf-8 char would take 2 bytes */
257                         len += 2;
258                 }
259         }
260 
261         *a_len = len;
262         return CR_OK;
263 }
264 
265 /**
266  *Converts an utf8 buffer into an ucs4 buffer.
267  *
268  *@param a_in the input utf8 buffer to convert.
269  *@param a_in_len in/out parameter. The size of the
270  *input buffer to convert. After return, this parameter contains
271  *the actual number of bytes consumed.
272  *@param a_out the output converted ucs4 buffer. Must be allocated by
273  *the caller.
274  *@param a_out_len in/out parameter. The size of the output buffer.
275  *If this size is actually smaller than the real needed size, the function
276  *just converts what it can and returns a success status. After return,
277  *this param points to the actual number of characters decoded.
278  *@return CR_OK upon successfull completion, an error code otherwise.
279  */
280 enum CRStatus
cr_utils_utf8_to_ucs4(const guchar * a_in,gulong * a_in_len,guint32 * a_out,gulong * a_out_len)281 cr_utils_utf8_to_ucs4 (const guchar * a_in,
282                        gulong * a_in_len, guint32 * a_out, gulong * a_out_len)
283 {
284         gulong in_len = 0,
285                 out_len = 0,
286                 in_index = 0,
287                 out_index = 0;
288         enum CRStatus status = CR_OK;
289 
290         /*
291          *to store the final decoded
292          *unicode char
293          */
294         guint c = 0;
295 
296         g_return_val_if_fail (a_in && a_in_len
297                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
298 
299         if (*a_in_len < 1) {
300                 status = CR_OK;
301                 goto end;
302         }
303 
304         in_len = *a_in_len;
305         out_len = *a_out_len;
306 
307         for (in_index = 0, out_index = 0;
308              (in_index < in_len) && (out_index < out_len);
309              in_index++, out_index++) {
310                 gint nb_bytes_2_decode = 0;
311 
312                 if (a_in[in_index] <= 0x7F) {
313                         /*
314                          *7 bits long char
315                          *encoded over 1 byte:
316                          * 0xxx xxxx
317                          */
318                         c = a_in[in_index];
319                         nb_bytes_2_decode = 1;
320 
321                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
322                         /*
323                          *up to 11 bits long char.
324                          *encoded over 2 bytes:
325                          *110x xxxx  10xx xxxx
326                          */
327                         c = a_in[in_index] & 0x1F;
328                         nb_bytes_2_decode = 2;
329 
330                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
331                         /*
332                          *up to 16 bit long char
333                          *encoded over 3 bytes:
334                          *1110 xxxx  10xx xxxx  10xx xxxx
335                          */
336                         c = a_in[in_index] & 0x0F;
337                         nb_bytes_2_decode = 3;
338 
339                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
340                         /*
341                          *up to 21 bits long char
342                          *encoded over 4 bytes:
343                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
344                          */
345                         c = a_in[in_index] & 0x7;
346                         nb_bytes_2_decode = 4;
347 
348                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
349                         /*
350                          *up to 26 bits long char
351                          *encoded over 5 bytes.
352                          *1111 10xx  10xx xxxx  10xx xxxx
353                          *10xx xxxx  10xx xxxx
354                          */
355                         c = a_in[in_index] & 3;
356                         nb_bytes_2_decode = 5;
357 
358                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
359                         /*
360                          *up to 31 bits long char
361                          *encoded over 6 bytes:
362                          *1111 110x  10xx xxxx  10xx xxxx
363                          *10xx xxxx  10xx xxxx  10xx xxxx
364                          */
365                         c = a_in[in_index] & 1;
366                         nb_bytes_2_decode = 6;
367 
368                 } else {
369                         /*BAD ENCODING */
370                         goto end;
371                 }
372 
373                 /*
374                  *Go and decode the remaining byte(s)
375                  *(if any) to get the current character.
376                  */
377                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
378                         /*decode the next byte */
379                         in_index++;
380 
381                         /*byte pattern must be: 10xx xxxx */
382                         if ((a_in[in_index] & 0xC0) != 0x80) {
383                                 goto end;
384                         }
385 
386                         c = (c << 6) | (a_in[in_index] & 0x3F);
387                 }
388 
389                 /*
390                  *The decoded ucs4 char is now
391                  *in c.
392                  */
393 
394                 /************************
395                  *Some security tests
396                  ***********************/
397 
398                 /*be sure c is a char */
399                 if (c == 0xFFFF || c == 0xFFFE)
400                         goto end;
401 
402                 /*be sure c is inferior to the max ucs4 char value */
403                 if (c > 0x10FFFF)
404                         goto end;
405 
406                 /*
407                  *c must be less than UTF16 "lower surrogate begin"
408                  *or higher than UTF16 "High surrogate end"
409                  */
410                 if (c >= 0xD800 && c <= 0xDFFF)
411                         goto end;
412 
413                 /*Avoid characters that equals zero */
414                 if (c == 0)
415                         goto end;
416 
417                 a_out[out_index] = c;
418         }
419 
420       end:
421         *a_out_len = out_index + 1;
422         *a_in_len = in_index + 1;
423 
424         return status;
425 }
426 
427 /**
428  *Reads a character from an utf8 buffer.
429  *Actually decode the next character code (unicode character code)
430  *and returns it.
431  *@param a_in the starting address of the utf8 buffer.
432  *@param a_in_len the length of the utf8 buffer.
433  *@param a_out output parameter. The resulting read char.
434  *@param a_consumed the number of the bytes consumed to
435  *decode the returned character code.
436  *@return CR_OK upon successfull completion, an error code otherwise.
437  */
438 enum CRStatus
cr_utils_read_char_from_utf8_buf(const guchar * a_in,gulong a_in_len,guint32 * a_out,gulong * a_consumed)439 cr_utils_read_char_from_utf8_buf (const guchar * a_in,
440                                   gulong a_in_len,
441                                   guint32 * a_out, gulong * a_consumed)
442 {
443         gulong in_index = 0,
444                nb_bytes_2_decode = 0;
445         enum CRStatus status = CR_OK;
446 
447         /*
448          *to store the final decoded
449          *unicode char
450          */
451         guint32 c = 0;
452 
453         g_return_val_if_fail (a_in && a_out && a_out
454                               && a_consumed, CR_BAD_PARAM_ERROR);
455 
456         if (a_in_len < 1) {
457                 status = CR_OK;
458                 goto end;
459         }
460 
461         if (*a_in <= 0x7F) {
462                 /*
463                  *7 bits long char
464                  *encoded over 1 byte:
465                  * 0xxx xxxx
466                  */
467                 c = *a_in;
468                 nb_bytes_2_decode = 1;
469 
470         } else if ((*a_in & 0xE0) == 0xC0) {
471                 /*
472                  *up to 11 bits long char.
473                  *encoded over 2 bytes:
474                  *110x xxxx  10xx xxxx
475                  */
476                 c = *a_in & 0x1F;
477                 nb_bytes_2_decode = 2;
478 
479         } else if ((*a_in & 0xF0) == 0xE0) {
480                 /*
481                  *up to 16 bit long char
482                  *encoded over 3 bytes:
483                  *1110 xxxx  10xx xxxx  10xx xxxx
484                  */
485                 c = *a_in & 0x0F;
486                 nb_bytes_2_decode = 3;
487 
488         } else if ((*a_in & 0xF8) == 0xF0) {
489                 /*
490                  *up to 21 bits long char
491                  *encoded over 4 bytes:
492                  *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
493                  */
494                 c = *a_in & 0x7;
495                 nb_bytes_2_decode = 4;
496 
497         } else if ((*a_in & 0xFC) == 0xF8) {
498                 /*
499                  *up to 26 bits long char
500                  *encoded over 5 bytes.
501                  *1111 10xx  10xx xxxx  10xx xxxx
502                  *10xx xxxx  10xx xxxx
503                  */
504                 c = *a_in & 3;
505                 nb_bytes_2_decode = 5;
506 
507         } else if ((*a_in & 0xFE) == 0xFC) {
508                 /*
509                  *up to 31 bits long char
510                  *encoded over 6 bytes:
511                  *1111 110x  10xx xxxx  10xx xxxx
512                  *10xx xxxx  10xx xxxx  10xx xxxx
513                  */
514                 c = *a_in & 1;
515                 nb_bytes_2_decode = 6;
516 
517         } else {
518                 /*BAD ENCODING */
519                 goto end;
520         }
521 
522         if (nb_bytes_2_decode > a_in_len) {
523                 status = CR_END_OF_INPUT_ERROR;
524                 goto end;
525         }
526 
527         /*
528          *Go and decode the remaining byte(s)
529          *(if any) to get the current character.
530          */
531         for (in_index = 1; in_index < nb_bytes_2_decode; in_index++) {
532                 /*byte pattern must be: 10xx xxxx */
533                 if ((a_in[in_index] & 0xC0) != 0x80) {
534                         goto end;
535                 }
536 
537                 c = (c << 6) | (a_in[in_index] & 0x3F);
538         }
539 
540         /*
541          *The decoded ucs4 char is now
542          *in c.
543          */
544 
545     /************************
546      *Some security tests
547      ***********************/
548 
549         /*be sure c is a char */
550         if (c == 0xFFFF || c == 0xFFFE)
551                 goto end;
552 
553         /*be sure c is inferior to the max ucs4 char value */
554         if (c > 0x10FFFF)
555                 goto end;
556 
557         /*
558          *c must be less than UTF16 "lower surrogate begin"
559          *or higher than UTF16 "High surrogate end"
560          */
561         if (c >= 0xD800 && c <= 0xDFFF)
562                 goto end;
563 
564         /*Avoid characters that equals zero */
565         if (c == 0)
566                 goto end;
567 
568         *a_out = c;
569 
570       end:
571         *a_consumed = nb_bytes_2_decode;
572 
573         return status;
574 }
575 
576 /**
577  *
578  */
579 enum CRStatus
cr_utils_utf8_str_len_as_ucs1(const guchar * a_in_start,const guchar * a_in_end,gulong * a_len)580 cr_utils_utf8_str_len_as_ucs1 (const guchar * a_in_start,
581                                const guchar * a_in_end, gulong * a_len)
582 {
583         /*
584          *Note: this function can be made shorter
585          *but it considers all the cases of the utf8 encoding
586          *to ease further extensions ...
587          */
588 
589         guchar *byte_ptr = NULL;
590         gint len = 0;
591 
592         /*
593          *to store the final decoded
594          *unicode char
595          */
596         guint c = 0;
597 
598         g_return_val_if_fail (a_in_start && a_in_end && a_len,
599                               CR_BAD_PARAM_ERROR);
600         *a_len = 0;
601 
602         for (byte_ptr = (guchar *) a_in_start;
603              byte_ptr <= a_in_end; byte_ptr++) {
604                 gint nb_bytes_2_decode = 0;
605 
606                 if (*byte_ptr <= 0x7F) {
607                         /*
608                          *7 bits long char
609                          *encoded over 1 byte:
610                          * 0xxx xxxx
611                          */
612                         c = *byte_ptr;
613                         nb_bytes_2_decode = 1;
614 
615                 } else if ((*byte_ptr & 0xE0) == 0xC0) {
616                         /*
617                          *up to 11 bits long char.
618                          *encoded over 2 bytes:
619                          *110x xxxx  10xx xxxx
620                          */
621                         c = *byte_ptr & 0x1F;
622                         nb_bytes_2_decode = 2;
623 
624                 } else if ((*byte_ptr & 0xF0) == 0xE0) {
625                         /*
626                          *up to 16 bit long char
627                          *encoded over 3 bytes:
628                          *1110 xxxx  10xx xxxx  10xx xxxx
629                          */
630                         c = *byte_ptr & 0x0F;
631                         nb_bytes_2_decode = 3;
632 
633                 } else if ((*byte_ptr & 0xF8) == 0xF0) {
634                         /*
635                          *up to 21 bits long char
636                          *encoded over 4 bytes:
637                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
638                          */
639                         c = *byte_ptr & 0x7;
640                         nb_bytes_2_decode = 4;
641 
642                 } else if ((*byte_ptr & 0xFC) == 0xF8) {
643                         /*
644                          *up to 26 bits long char
645                          *encoded over 5 bytes.
646                          *1111 10xx  10xx xxxx  10xx xxxx
647                          *10xx xxxx  10xx xxxx
648                          */
649                         c = *byte_ptr & 3;
650                         nb_bytes_2_decode = 5;
651 
652                 } else if ((*byte_ptr & 0xFE) == 0xFC) {
653                         /*
654                          *up to 31 bits long char
655                          *encoded over 6 bytes:
656                          *1111 110x  10xx xxxx  10xx xxxx
657                          *10xx xxxx  10xx xxxx  10xx xxxx
658                          */
659                         c = *byte_ptr & 1;
660                         nb_bytes_2_decode = 6;
661 
662                 } else {
663                         /*
664                          *BAD ENCODING
665                          */
666                         return CR_ENCODING_ERROR;
667                 }
668 
669                 /*
670                  *Go and decode the remaining byte(s)
671                  *(if any) to get the current character.
672                  */
673                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
674                         /*decode the next byte */
675                         byte_ptr++;
676 
677                         /*byte pattern must be: 10xx xxxx */
678                         if ((*byte_ptr & 0xC0) != 0x80) {
679                                 return CR_ENCODING_ERROR;
680                         }
681 
682                         c = (c << 6) | (*byte_ptr & 0x3F);
683                 }
684 
685                 /*
686                  *The decoded ucs4 char is now
687                  *in c.
688                  */
689 
690                 if (c <= 0xFF) { /*Add other conditions to support
691                                   *other char sets (ucs2, ucs3, ucs4).
692                                   */
693                         len++;
694                 } else {
695                         /*the char is too long to fit
696                          *into the supposed charset len.
697                          */
698                         return CR_ENCODING_ERROR;
699                 }
700         }
701 
702         *a_len = len;
703 
704         return CR_OK;
705 }
706 
707 /**
708  *Converts an utf8 string into an ucs4 string.
709  *@param a_in the input string to convert.
710  *@param a_in_len in/out parameter. The length of the input
711  *string. After return, points to the actual number of bytes
712  *consumed. This can be usefull to debug the input stream in case
713  *of encoding error.
714  *@param a_out out parameter. Points to the output string. It is allocated
715  *by this function and must be freed by the caller.
716  *@param a_out_len out parameter. The length of the output string.
717  *@return CR_OK upon successfull completion, an error code otherwise.
718  *
719  */
720 enum CRStatus
cr_utils_utf8_str_to_ucs4(const guchar * a_in,gulong * a_in_len,guint32 ** a_out,gulong * a_out_len)721 cr_utils_utf8_str_to_ucs4 (const guchar * a_in,
722                            gulong * a_in_len,
723                            guint32 ** a_out, gulong * a_out_len)
724 {
725         enum CRStatus status = CR_OK;
726 
727         g_return_val_if_fail (a_in && a_in_len
728                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
729 
730         status = cr_utils_utf8_str_len_as_ucs4 (a_in,
731                                                 &a_in[*a_in_len - 1],
732                                                 a_out_len);
733 
734         g_return_val_if_fail (status == CR_OK, status);
735 
736         *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
737 
738         status = cr_utils_utf8_to_ucs4 (a_in, a_in_len, *a_out, a_out_len);
739 
740         return status;
741 }
742 
743 /**
744  *Converts an ucs4 buffer into an utf8 buffer.
745  *
746  *@param a_in the input ucs4 buffer to convert.
747  *@param a_in_len in/out parameter. The size of the
748  *input buffer to convert. After return, this parameter contains
749  *the actual number of characters consumed.
750  *@param a_out the output converted utf8 buffer. Must be allocated by
751  *the caller.
752  *@param a_out_len in/out parameter. The size of the output buffer.
753  *If this size is actually smaller than the real needed size, the function
754  *just converts what it can and returns a success status. After return,
755  *this param points to the actual number of bytes in the buffer.
756  *@return CR_OK upon successfull completion, an error code otherwise.
757  */
758 enum CRStatus
cr_utils_ucs4_to_utf8(const guint32 * a_in,gulong * a_in_len,guchar * a_out,gulong * a_out_len)759 cr_utils_ucs4_to_utf8 (const guint32 * a_in,
760                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
761 {
762         gulong in_len = 0,
763                 in_index = 0,
764                 out_index = 0;
765         enum CRStatus status = CR_OK;
766 
767         g_return_val_if_fail (a_in && a_in_len && a_out && a_out_len,
768                               CR_BAD_PARAM_ERROR);
769 
770         if (*a_in_len < 1) {
771                 status = CR_OK;
772                 goto end;
773         }
774 
775         in_len = *a_in_len;
776 
777         for (in_index = 0; in_index < in_len; in_index++) {
778                 /*
779                  *FIXME: return whenever we encounter forbidden char values.
780                  */
781 
782                 if (a_in[in_index] <= 0x7F) {
783                         a_out[out_index] = a_in[in_index];
784                         out_index++;
785                 } else if (a_in[in_index] <= 0x7FF) {
786                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
787                         a_out[out_index + 1] =
788                                 (0x80 | (a_in[in_index] & 0x3F));
789                         out_index += 2;
790                 } else if (a_in[in_index] <= 0xFFFF) {
791                         a_out[out_index] = (0xE0 | (a_in[in_index] >> 12));
792                         a_out[out_index + 1] =
793                                 (0x80 | ((a_in[in_index] >> 6) & 0x3F));
794                         a_out[out_index + 2] =
795                                 (0x80 | (a_in[in_index] & 0x3F));
796                         out_index += 3;
797                 } else if (a_in[in_index] <= 0x1FFFFF) {
798                         a_out[out_index] = (0xF0 | (a_in[in_index] >> 18));
799                         a_out[out_index + 1]
800                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
801                         a_out[out_index + 2]
802                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
803                         a_out[out_index + 3]
804                                 = (0x80 | (a_in[in_index] & 0x3F));
805                         out_index += 4;
806                 } else if (a_in[in_index] <= 0x3FFFFFF) {
807                         a_out[out_index] = (0xF8 | (a_in[in_index] >> 24));
808                         a_out[out_index + 1] =
809                                 (0x80 | (a_in[in_index] >> 18));
810                         a_out[out_index + 2]
811                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
812                         a_out[out_index + 3]
813                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
814                         a_out[out_index + 4]
815                                 = (0x80 | (a_in[in_index] & 0x3F));
816                         out_index += 5;
817                 } else if (a_in[in_index] <= 0x7FFFFFFF) {
818                         a_out[out_index] = (0xFC | (a_in[in_index] >> 30));
819                         a_out[out_index + 1] =
820                                 (0x80 | (a_in[in_index] >> 24));
821                         a_out[out_index + 2]
822                                 = (0x80 | ((a_in[in_index] >> 18) & 0x3F));
823                         a_out[out_index + 3]
824                                 = (0x80 | ((a_in[in_index] >> 12) & 0x3F));
825                         a_out[out_index + 4]
826                                 = (0x80 | ((a_in[in_index] >> 6) & 0x3F));
827                         a_out[out_index + 4]
828                                 = (0x80 | (a_in[in_index] & 0x3F));
829                         out_index += 6;
830                 } else {
831                         status = CR_ENCODING_ERROR;
832                         goto end;
833                 }
834         }                       /*end for */
835 
836       end:
837         *a_in_len = in_index + 1;
838         *a_out_len = out_index + 1;
839 
840         return status;
841 }
842 
843 /**
844  *Converts an ucs4 string into an utf8 string.
845  *@param a_in the input string to convert.
846  *@param a_in_len in/out parameter. The length of the input
847  *string. After return, points to the actual number of characters
848  *consumed. This can be usefull to debug the input string in case
849  *of encoding error.
850  *@param a_out out parameter. Points to the output string. It is allocated
851  *by this function and must be freed by the caller.
852  *@param a_out_len out parameter. The length (in bytes) of the output string.
853  *@return CR_OK upon successfull completion, an error code otherwise.
854  */
855 enum CRStatus
cr_utils_ucs4_str_to_utf8(const guint32 * a_in,gulong * a_in_len,guchar ** a_out,gulong * a_out_len)856 cr_utils_ucs4_str_to_utf8 (const guint32 * a_in,
857                            gulong * a_in_len,
858                            guchar ** a_out, gulong * a_out_len)
859 {
860         enum CRStatus status = CR_OK;
861 
862         g_return_val_if_fail (a_in && a_in_len && a_out
863                               && a_out_len, CR_BAD_PARAM_ERROR);
864 
865         status = cr_utils_ucs4_str_len_as_utf8 (a_in,
866                                                 &a_in[*a_out_len - 1],
867                                                 a_out_len);
868 
869         g_return_val_if_fail (status == CR_OK, status);
870 
871         status = cr_utils_ucs4_to_utf8 (a_in, a_in_len, *a_out, a_out_len);
872 
873         return status;
874 }
875 
876 /**
877  *Converts an ucs1 buffer into an utf8 buffer.
878  *The caller must know the size of the resulting buffer and
879  *allocate it prior to calling this function.
880  *
881  *@param a_in the input ucs1 buffer.
882  *
883  *@param a_in_len in/out parameter. The length of the input buffer.
884  *After return, points to the number of bytes actually consumed even
885  *in case of encoding error.
886  *
887  *@param a_out out parameter. The output utf8 converted buffer.
888  *
889  *@param a_out_len in/out parameter. The size of the output buffer.
890  *If the output buffer size is shorter than the actual needed size,
891  *this function just convert what it can.
892  *
893  *@return CR_OK upon successfull completion, an error code otherwise.
894  *
895  */
896 enum CRStatus
cr_utils_ucs1_to_utf8(const guchar * a_in,gulong * a_in_len,guchar * a_out,gulong * a_out_len)897 cr_utils_ucs1_to_utf8 (const guchar * a_in,
898                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
899 {
900         gulong out_index = 0,
901                 in_index = 0,
902                 in_len = 0,
903                 out_len = 0;
904         enum CRStatus status = CR_OK;
905 
906         g_return_val_if_fail (a_in && a_in_len
907                               && a_out_len,
908                               CR_BAD_PARAM_ERROR);
909 
910         if (*a_in_len == 0) {
911                 *a_out_len = 0 ;
912                 return status;
913         }
914         g_return_val_if_fail (a_out, CR_BAD_PARAM_ERROR) ;
915 
916         in_len = *a_in_len;
917         out_len = *a_out_len;
918 
919         for (in_index = 0, out_index = 0;
920              (in_index < in_len) && (out_index < out_len); in_index++) {
921                 /*
922                  *FIXME: return whenever we encounter forbidden char values.
923                  */
924 
925                 if (a_in[in_index] <= 0x7F) {
926                         a_out[out_index] = a_in[in_index];
927                         out_index++;
928                 } else {
929                         a_out[out_index] = (0xC0 | (a_in[in_index] >> 6));
930                         a_out[out_index + 1] =
931                                 (0x80 | (a_in[in_index] & 0x3F));
932                         out_index += 2;
933                 }
934         }                       /*end for */
935 
936         *a_in_len = in_index;
937         *a_out_len = out_index;
938 
939         return status;
940 }
941 
942 /**
943  *Converts an ucs1 string into an utf8 string.
944  *@param a_in_start the beginning of the input string to convert.
945  *@param a_in_end the end of the input string to convert.
946  *@param a_out out parameter. The converted string.
947  *@param a_out out parameter. The length of the converted string.
948  *@return CR_OK upon successfull completion, an error code otherwise.
949  *
950  */
951 enum CRStatus
cr_utils_ucs1_str_to_utf8(const guchar * a_in,gulong * a_in_len,guchar ** a_out,gulong * a_out_len)952 cr_utils_ucs1_str_to_utf8 (const guchar * a_in,
953                            gulong * a_in_len,
954                            guchar ** a_out, gulong * a_out_len)
955 {
956         gulong out_len = 0;
957         enum CRStatus status = CR_OK;
958 
959         g_return_val_if_fail (a_in && a_in_len && a_out
960                               && a_out_len, CR_BAD_PARAM_ERROR);
961 
962         if (*a_in_len < 1) {
963                 *a_out_len = 0;
964                 *a_out = NULL;
965                 return CR_OK;
966         }
967 
968         status = cr_utils_ucs1_str_len_as_utf8 (a_in, &a_in[*a_in_len - 1],
969                                                 &out_len);
970 
971         g_return_val_if_fail (status == CR_OK, status);
972 
973         *a_out = g_malloc0 (out_len);
974 
975         status = cr_utils_ucs1_to_utf8 (a_in, a_in_len, *a_out, &out_len);
976 
977         *a_out_len = out_len;
978 
979         return status;
980 }
981 
982 /**
983  *Converts an utf8 buffer into an ucs1 buffer.
984  *The caller must know the size of the resulting
985  *converted buffer, and allocated it prior to calling this
986  *function.
987  *
988  *@param a_in the input utf8 buffer to convert.
989  *
990  *@param a_in_len in/out parameter. The size of the input utf8 buffer.
991  *After return, points to the number of bytes consumed
992  *by the function even in case of encoding error.
993  *
994  *@param a_out out parameter. Points to the resulting buffer.
995  *Must be allocated by the caller. If the size of a_out is shorter
996  *than its required size, this function converts what it can and return
997  *a successfull status.
998  *
999  *@param a_out_len in/out parameter. The size of the output buffer.
1000  *After return, points to the number of bytes consumed even in case of
1001  *encoding error.
1002  *
1003  *@return CR_OK upon successfull completion, an error code otherwise.
1004  */
1005 enum CRStatus
cr_utils_utf8_to_ucs1(const guchar * a_in,gulong * a_in_len,guchar * a_out,gulong * a_out_len)1006 cr_utils_utf8_to_ucs1 (const guchar * a_in,
1007                        gulong * a_in_len, guchar * a_out, gulong * a_out_len)
1008 {
1009         gulong in_index = 0,
1010                 out_index = 0,
1011                 in_len = 0,
1012                 out_len = 0;
1013         enum CRStatus status = CR_OK;
1014 
1015         /*
1016          *to store the final decoded
1017          *unicode char
1018          */
1019         guint32 c = 0;
1020 
1021         g_return_val_if_fail (a_in && a_in_len
1022                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1023 
1024         if (*a_in_len < 1) {
1025                 goto end;
1026         }
1027 
1028         in_len = *a_in_len;
1029         out_len = *a_out_len;
1030 
1031         for (in_index = 0, out_index = 0;
1032              (in_index < in_len) && (out_index < out_len);
1033              in_index++, out_index++) {
1034                 gint nb_bytes_2_decode = 0;
1035 
1036                 if (a_in[in_index] <= 0x7F) {
1037                         /*
1038                          *7 bits long char
1039                          *encoded over 1 byte:
1040                          * 0xxx xxxx
1041                          */
1042                         c = a_in[in_index];
1043                         nb_bytes_2_decode = 1;
1044 
1045                 } else if ((a_in[in_index] & 0xE0) == 0xC0) {
1046                         /*
1047                          *up to 11 bits long char.
1048                          *encoded over 2 bytes:
1049                          *110x xxxx  10xx xxxx
1050                          */
1051                         c = a_in[in_index] & 0x1F;
1052                         nb_bytes_2_decode = 2;
1053 
1054                 } else if ((a_in[in_index] & 0xF0) == 0xE0) {
1055                         /*
1056                          *up to 16 bit long char
1057                          *encoded over 3 bytes:
1058                          *1110 xxxx  10xx xxxx  10xx xxxx
1059                          */
1060                         c = a_in[in_index] & 0x0F;
1061                         nb_bytes_2_decode = 3;
1062 
1063                 } else if ((a_in[in_index] & 0xF8) == 0xF0) {
1064                         /*
1065                          *up to 21 bits long char
1066                          *encoded over 4 bytes:
1067                          *1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
1068                          */
1069                         c = a_in[in_index] & 0x7;
1070                         nb_bytes_2_decode = 4;
1071 
1072                 } else if ((a_in[in_index] & 0xFC) == 0xF8) {
1073                         /*
1074                          *up to 26 bits long char
1075                          *encoded over 5 bytes.
1076                          *1111 10xx  10xx xxxx  10xx xxxx
1077                          *10xx xxxx  10xx xxxx
1078                          */
1079                         c = a_in[in_index] & 3;
1080                         nb_bytes_2_decode = 5;
1081 
1082                 } else if ((a_in[in_index] & 0xFE) == 0xFC) {
1083                         /*
1084                          *up to 31 bits long char
1085                          *encoded over 6 bytes:
1086                          *1111 110x  10xx xxxx  10xx xxxx
1087                          *10xx xxxx  10xx xxxx  10xx xxxx
1088                          */
1089                         c = a_in[in_index] & 1;
1090                         nb_bytes_2_decode = 6;
1091 
1092                 } else {
1093                         /*BAD ENCODING */
1094                         status = CR_ENCODING_ERROR;
1095                         goto end;
1096                 }
1097 
1098                 /*
1099                  *Go and decode the remaining byte(s)
1100                  *(if any) to get the current character.
1101                  */
1102                 if (in_index + nb_bytes_2_decode - 1 >= in_len) {
1103                         goto end;
1104                 }
1105 
1106                 for (; nb_bytes_2_decode > 1; nb_bytes_2_decode--) {
1107                         /*decode the next byte */
1108                         in_index++;
1109 
1110                         /*byte pattern must be: 10xx xxxx */
1111                         if ((a_in[in_index] & 0xC0) != 0x80) {
1112                                 status = CR_ENCODING_ERROR;
1113                                 goto end;
1114                         }
1115 
1116                         c = (c << 6) | (a_in[in_index] & 0x3F);
1117                 }
1118 
1119                 /*
1120                  *The decoded ucs4 char is now
1121                  *in c.
1122                  */
1123 
1124                 if (c > 0xFF) {
1125                         status = CR_ENCODING_ERROR;
1126                         goto end;
1127                 }
1128 
1129                 a_out[out_index] = c;
1130         }
1131 
1132       end:
1133         *a_out_len = out_index;
1134         *a_in_len = in_index;
1135 
1136         return status;
1137 }
1138 
1139 /**
1140  *Converts an utf8 buffer into an
1141  *ucs1 buffer.
1142  *@param a_in_start the start of the input buffer.
1143  *@param a_in_end the end of the input buffer.
1144  *@param a_out out parameter. The resulting converted ucs4 buffer.
1145  *Must be freed by the caller.
1146  *@param a_out_len out parameter. The length of the converted buffer.
1147  *@return CR_OK upon successfull completion, an error code otherwise.
1148  *Note that out parameters are valid if and only if this function
1149  *returns CR_OK.
1150  */
1151 enum CRStatus
cr_utils_utf8_str_to_ucs1(const guchar * a_in,gulong * a_in_len,guchar ** a_out,gulong * a_out_len)1152 cr_utils_utf8_str_to_ucs1 (const guchar * a_in,
1153                            gulong * a_in_len,
1154                            guchar ** a_out, gulong * a_out_len)
1155 {
1156         enum CRStatus status = CR_OK;
1157 
1158         g_return_val_if_fail (a_in && a_in_len
1159                               && a_out && a_out_len, CR_BAD_PARAM_ERROR);
1160 
1161         if (*a_in_len < 1) {
1162                 *a_out_len = 0;
1163                 *a_out = NULL;
1164                 return CR_OK;
1165         }
1166 
1167         status = cr_utils_utf8_str_len_as_ucs4 (a_in, &a_in[*a_in_len - 1],
1168                                                 a_out_len);
1169 
1170         g_return_val_if_fail (status == CR_OK, status);
1171 
1172         *a_out = g_malloc0 (*a_out_len * sizeof (guint32));
1173 
1174         status = cr_utils_utf8_to_ucs1 (a_in, a_in_len, *a_out, a_out_len);
1175         return status;
1176 }
1177 
1178 /*****************************************
1179  *CSS basic types identification utilities
1180  *****************************************/
1181 
1182 /**
1183  *Returns TRUE if a_char is a white space as
1184  *defined in the css spec in chap 4.1.1.
1185  *
1186  *white-space ::= ' '| \t|\r|\n|\f
1187  *
1188  *@param a_char the character to test.
1189  *return TRUE if is a white space, false otherwise.
1190  */
1191 gboolean
cr_utils_is_white_space(guint32 a_char)1192 cr_utils_is_white_space (guint32 a_char)
1193 {
1194         switch (a_char) {
1195         case ' ':
1196         case '\t':
1197         case '\r':
1198         case '\n':
1199         case '\f':
1200                 return TRUE;
1201                 break;
1202         default:
1203                 return FALSE;
1204         }
1205 }
1206 
1207 /**
1208  *Returns true if the character is a newline
1209  *as defined in the css spec in the chap 4.1.1.
1210  *
1211  *nl ::= \n|\r\n|\r|\f
1212  *
1213  *@param a_char the character to test.
1214  *@return TRUE if the character is a newline, FALSE otherwise.
1215  */
1216 gboolean
cr_utils_is_newline(guint32 a_char)1217 cr_utils_is_newline (guint32 a_char)
1218 {
1219         switch (a_char) {
1220         case '\n':
1221         case '\r':
1222         case '\f':
1223                 return TRUE;
1224                 break;
1225         default:
1226                 return FALSE;
1227         }
1228 }
1229 
1230 /**
1231  *returns TRUE if the char is part of an hexa num char:
1232  *i.e hexa_char ::= [0-9A-F]
1233  */
1234 gboolean
cr_utils_is_hexa_char(guint32 a_char)1235 cr_utils_is_hexa_char (guint32 a_char)
1236 {
1237         if ((a_char >= '0' && a_char <= '9')
1238             || (a_char >= 'A' && a_char <= 'F')) {
1239                 return TRUE;
1240         }
1241         return FALSE;
1242 }
1243 
1244 /**
1245  *Returns true if the character is a nonascii
1246  *character (as defined in the css spec chap 4.1.1):
1247  *
1248  *nonascii ::= [^\0-\177]
1249  *
1250  *@param a_char the character to test.
1251  *@return TRUE if the character is a nonascii char,
1252  *FALSE otherwise.
1253  */
1254 gboolean
cr_utils_is_nonascii(guint32 a_char)1255 cr_utils_is_nonascii (guint32 a_char)
1256 {
1257         if (a_char <= 177) {
1258                 return FALSE;
1259         }
1260 
1261         return TRUE;
1262 }
1263 
1264 /**
1265  *Dumps a character a_nb times on a file.
1266  *@param a_char the char to dump
1267  *@param a_fp the destination file pointer
1268  *@param a_nb the number of times a_char is to be dumped.
1269  */
1270 void
cr_utils_dump_n_chars(guchar a_char,FILE * a_fp,glong a_nb)1271 cr_utils_dump_n_chars (guchar a_char, FILE * a_fp, glong a_nb)
1272 {
1273         glong i = 0;
1274 
1275         for (i = 0; i < a_nb; i++) {
1276                 fprintf (a_fp, "%c", a_char);
1277         }
1278 }
1279 
1280 void
cr_utils_dump_n_chars2(guchar a_char,GString * a_string,glong a_nb)1281 cr_utils_dump_n_chars2 (guchar a_char, GString * a_string, glong a_nb)
1282 {
1283         glong i = 0;
1284 
1285         g_return_if_fail (a_string);
1286 
1287         for (i = 0; i < a_nb; i++) {
1288                 g_string_append_printf (a_string, "%c", a_char);
1289         }
1290 }
1291 
1292 /**
1293  *Duplicates a list of GString instances.
1294  *@return the duplicated list of GString instances or NULL if
1295  *something bad happened.
1296  *@param a_list_of_strings the list of strings to be duplicated.
1297  */
1298 GList *
cr_utils_dup_glist_of_string(GList const * a_list_of_strings)1299 cr_utils_dup_glist_of_string (GList const * a_list_of_strings)
1300 {
1301         GList const *cur = NULL;
1302         GList *result = NULL;
1303 
1304         g_return_val_if_fail (a_list_of_strings, NULL);
1305 
1306         for (cur = a_list_of_strings; cur; cur = cur->next) {
1307                 GString *str = NULL;
1308 
1309                 str = g_string_new_len (((GString *) cur->data)->str,
1310                                         ((GString *) cur->data)->len);
1311                 if (str)
1312                         result = g_list_append (result, str);
1313         }
1314 
1315         return result;
1316 }
1317 
1318 /**
1319  *Duplicate a GList where the GList::data is a CRString.
1320  *@param a_list_of_strings the list to duplicate
1321  *@return the duplicated list, or NULL if something bad
1322  *happened.
1323  */
1324 GList *
cr_utils_dup_glist_of_cr_string(GList const * a_list_of_strings)1325 cr_utils_dup_glist_of_cr_string (GList const * a_list_of_strings)
1326 {
1327         GList const *cur = NULL;
1328         GList *result = NULL;
1329 
1330         g_return_val_if_fail (a_list_of_strings, NULL);
1331 
1332         for (cur = a_list_of_strings; cur; cur = cur->next) {
1333                 CRString *str = NULL;
1334 
1335                 str = cr_string_dup ((CRString const *) cur->data) ;
1336                 if (str)
1337                         result = g_list_append (result, str);
1338         }
1339 
1340         return result;
1341 }
1342