1 /* Copyright (C) 2011-2011 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "udm_config.h"
19
20 #include <stdio.h>
21 #include <string.h>
22 #include "udm_uniconv.h"
23 #include "udm_unidata.h"
24 #include "udm_sgml.h"
25
26
27 size_t
UdmWellFormedLengthGeneric(UDM_CHARSET * cs,const char * src,size_t srclen,int flags)28 UdmWellFormedLengthGeneric(UDM_CHARSET *cs,
29 const char *src, size_t srclen, int flags)
30 {
31 const char *src0= src, *srcend= src + srclen;
32 while (src < srcend)
33 {
34 udm_wc_t wc;
35 int length= cs->cset->mb_wc(NULL, cs, &wc,
36 (const unsigned char *) src,
37 (const unsigned char *) srcend,
38 flags);
39 if (length <= 0)
40 break;
41 src+= length;
42 }
43 return src - src0;
44 }
45
46
47 static inline int
scan_one_char(UDM_UNIDATA * unidata,UDM_CHARSET * cs,const char * str,const char * strend,int * ctype,int flags)48 scan_one_char(UDM_UNIDATA *unidata, UDM_CHARSET *cs,
49 const char *str, const char *strend, int *ctype, int flags)
50 {
51 int wclen;
52 if (*str == '&' && (flags & UDM_RECODE_HTML_IN))
53 {
54 udm_wc_t wc;
55 wclen= UdmSGMLScan(&wc, (const unsigned char*) str, (const unsigned char*) strend);
56 *ctype= UdmUniCType(unidata, wc);
57 }
58 else if (*str > 0)
59 {
60 wclen= 1;
61 *ctype= udm_charset_usascii.ctype[(unsigned char) *str];
62 }
63 else
64 {
65 udm_wc_t wc;
66 wclen= cs->cset->mb_wc(NULL, cs, &wc, (const unsigned char *) str, (const unsigned char *) strend, flags);
67 if (wclen <= 0)
68 {
69 *ctype= UDM_UNI_SEPAR;
70 return 1;
71 }
72 *ctype= UdmUniCType(unidata, wc);
73 }
74
75 if (*ctype == UDM_UNI_DIGIT || *ctype == UDM_UNI_CJK)
76 *ctype= UDM_UNI_LETTER;
77 return wclen;
78 }
79
80
81 const char *
UdmStrGetSepTokenMB(UDM_UNIDATA * unidata,UDM_CHARSET * cs,const char * str,const char * strend,const char ** last,int * ctype0,int flags)82 UdmStrGetSepTokenMB(UDM_UNIDATA *unidata, UDM_CHARSET *cs,
83 const char *str, const char *strend,
84 const char **last, int *ctype0, int flags)
85 {
86 const char *beg;
87 int ctype, wclen;
88
89 if(str == NULL && (str= *last) == NULL)
90 return NULL;
91
92 if ((beg= str) >= strend)
93 return NULL;
94
95 wclen= scan_one_char(unidata, cs, str, strend, ctype0, flags);
96 if (wclen <= 0)
97 return NULL;
98 str+= wclen;
99 for ( ; str < strend; )
100 {
101 wclen= scan_one_char(unidata, cs, str, strend, &ctype, flags);
102 if (wclen <= 0 || *ctype0 != ctype)
103 break;
104 str+= wclen;
105 }
106
107 *last= str;
108 return beg;
109 }
110