1 /* Copyright (C) 2011-2011 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #include "udm_config.h"
19 
20 #include <stdio.h>
21 #include <string.h>
22 #include "udm_uniconv.h"
23 #include "udm_unidata.h"
24 #include "udm_sgml.h"
25 
26 
27 size_t
UdmWellFormedLengthGeneric(UDM_CHARSET * cs,const char * src,size_t srclen,int flags)28 UdmWellFormedLengthGeneric(UDM_CHARSET *cs,
29                            const char *src, size_t srclen, int flags)
30 {
31   const char *src0= src, *srcend= src + srclen;
32   while (src < srcend)
33   {
34     udm_wc_t wc;
35     int  length= cs->cset->mb_wc(NULL, cs, &wc,
36                                  (const unsigned char *) src,
37                                  (const unsigned char *) srcend,
38                                  flags);
39     if (length <= 0)
40       break;
41      src+= length;
42   }
43   return src - src0;
44 }
45 
46 
47 static inline int
scan_one_char(UDM_UNIDATA * unidata,UDM_CHARSET * cs,const char * str,const char * strend,int * ctype,int flags)48 scan_one_char(UDM_UNIDATA *unidata, UDM_CHARSET *cs,
49               const char *str, const char *strend, int *ctype, int flags)
50 {
51   int wclen;
52   if (*str == '&' && (flags & UDM_RECODE_HTML_IN))
53   {
54     udm_wc_t wc;
55     wclen= UdmSGMLScan(&wc, (const unsigned char*) str, (const unsigned char*) strend);
56     *ctype= UdmUniCType(unidata, wc);
57   }
58   else if (*str > 0)
59   {
60     wclen= 1;
61     *ctype= udm_charset_usascii.ctype[(unsigned char) *str];
62   }
63   else
64   {
65     udm_wc_t wc;
66     wclen= cs->cset->mb_wc(NULL, cs, &wc, (const unsigned char *) str, (const unsigned char *) strend, flags);
67     if (wclen <= 0)
68     {
69       *ctype= UDM_UNI_SEPAR;
70       return 1;
71     }
72     *ctype= UdmUniCType(unidata, wc);
73   }
74 
75   if (*ctype == UDM_UNI_DIGIT || *ctype == UDM_UNI_CJK)
76     *ctype= UDM_UNI_LETTER;
77   return wclen;
78 }
79 
80 
81 const char *
UdmStrGetSepTokenMB(UDM_UNIDATA * unidata,UDM_CHARSET * cs,const char * str,const char * strend,const char ** last,int * ctype0,int flags)82 UdmStrGetSepTokenMB(UDM_UNIDATA *unidata, UDM_CHARSET *cs,
83                       const char *str, const char *strend,
84                       const char **last, int *ctype0, int flags)
85 {
86   const char *beg;
87   int ctype, wclen;
88 
89   if(str == NULL && (str= *last) == NULL)
90     return NULL;
91 
92   if ((beg= str) >= strend)
93     return NULL;
94 
95   wclen= scan_one_char(unidata, cs, str, strend, ctype0, flags);
96   if (wclen <= 0)
97     return NULL;
98   str+= wclen;
99   for ( ; str < strend; )
100   {
101     wclen= scan_one_char(unidata, cs, str, strend, &ctype, flags);
102     if (wclen <= 0 || *ctype0 != ctype)
103       break;
104     str+= wclen;
105   }
106 
107   *last= str;
108   return beg;
109 }
110