1 /***************************************************************************
2 ccasefolder.cpp - Get a case folded version of a string
3 -------------------
4 begin : Thu Jul 17 2008
5 copyright : (C) 2008 by Edward Sheldrake
6 email : ejs1920@yahoo.co.uk
7 ***************************************************************************/
8
9 /***************************************************************************
10 * *
11 * This program is free software; you can redistribute it and/or modify *
12 * it under the terms of the GNU General Public License as published by *
13 * the Free Software Foundation; either version 2 of the License, or *
14 * (at your option) any later version. *
15 * *
16 ***************************************************************************/
17
18 #include "ccasefolder.h"
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #ifndef ICONV_CONST
25 #define ICONV_CONST const
26 #endif
27
28 /*
29 * Using iconv directly to convert to/from local and unicode.
30 * CIconv outputs a CString and that isn't really appropriate.
31 * There is mbstowcs() but it doesn't seem to work.
32 */
33 #include <iconv.h>
34
35 /* For local encoding */
36 #include "cconfig.h"
37
38 /* calloc */
39 #include <stdlib.h>
40
41 /* printf */
42 #include <stdio.h>
43
44 /* errno */
45 #include <errno.h>
46
47 /* int32_t */
48 #ifdef HAVE_STDINT_H
49 #include <stdint.h>
50 #else
51 #define int32_t int
52 #endif
53
54 #ifdef WORDS_BIGENDIAN
55 #define UCS4_HOST "UCS-4BE"
56 #else
57 #define UCS4_HOST "UCS-4LE"
58 #endif
59
60 /** */
CCaseFolder()61 CCaseFolder::CCaseFolder()
62 {
63 CString localenc;
64 if ( CConfig::Instance() )
65 {
66 localenc = CConfig::Instance()->GetLocalEncoding();
67 }
68 else
69 {
70 localenc = "UTF-8";
71 }
72
73 to_ucs4 = iconv_open( UCS4_HOST, localenc.Data() );
74 from_ucs4 = iconv_open( localenc.Data(), UCS4_HOST );
75 }
76
77 /** */
~CCaseFolder()78 CCaseFolder::~CCaseFolder()
79 {
80 if ( to_ucs4 != (iconv_t)-1 )
81 {
82 iconv_close( to_ucs4 );
83 }
84
85 if ( from_ucs4 != (iconv_t)-1 )
86 {
87 iconv_close( from_ucs4 );
88 }
89 }
90
91 /** */
Fold(const CString & input,CString & output)92 bool CCaseFolder::Fold( const CString & input, CString & output )
93 {
94 if ( (to_ucs4 == (iconv_t) -1) || (from_ucs4 == (iconv_t) -1) )
95 {
96 return false;
97 }
98
99 /* wchar_t was only 2 bytes on Windows, int32_t is always 4 bytes */
100 const int int32_t_size = sizeof(int32_t);
101 int errors = 0;
102
103 char * inbuf = input.Data();
104 size_t inleft = input.Length();
105
106 size_t outleft = inleft * int32_t_size + 10;
107 int32_t * unfolded = (int32_t*) calloc( 1, outleft );
108
109 if ( !unfolded )
110 {
111 return false;
112 }
113
114 char * outbuf = (char*) unfolded;
115 char * outstart = outbuf;
116
117 size_t res = (size_t) -1;
118
119 while ( res == (size_t) -1 )
120 {
121 res = iconv( to_ucs4, (ICONV_CONST char **)&inbuf, &inleft, &outbuf, &outleft );
122
123 if ( res == (size_t) -1 )
124 {
125 if ( errno == EILSEQ )
126 {
127 outstart[outbuf - outstart] = '_';
128
129 inbuf++;
130 inleft--;
131
132 outbuf++;
133 outleft--;
134
135 errors++;
136 }
137 else
138 {
139 printf("CCaseFolder::Fold: iconv() to unicode failed %d\n",errno);
140 free(unfolded);
141 return false;
142 }
143 }
144 }
145
146 /* output length in bytes */
147 size_t len = (input.Length() * int32_t_size + 10) - outleft;
148
149 /* output length in wchars */
150
151 if ( len%int32_t_size != 0 )
152 {
153 printf("CCaseFolder::Fold: unexpected output size\n");
154 free( unfolded );
155 return false;
156 }
157
158 size_t wlen = len / int32_t_size;
159
160 /* greatest change is 1 character to 3 */
161 size_t foldedlen = len * 3 + 10;
162
163 int32_t * folded = (int32_t *) calloc( 1, foldedlen );
164
165 if ( !folded )
166 {
167 free(unfolded);
168 return false;
169 }
170
171 size_t outpos = 0;
172 for ( size_t i = 0; i < wlen; ++i )
173 {
174 switch ( unfolded[i] )
175 {
176 #include "ccasefolder-generated-code.cpp"
177 default:
178 folded[outpos] = unfolded[i];
179 ++outpos;
180 break;
181 }
182 }
183
184 /* finished with unfolded unicode */
185 free( unfolded );
186
187 inbuf = (char*) folded;
188 inleft = outpos * int32_t_size;
189
190 outleft = (outpos * int32_t_size) + 10;
191 char * resultdata = (char*) calloc( 1, outleft );
192 outbuf = resultdata;
193
194 res = (size_t) -1;
195
196 while ( res == (size_t) -1 )
197 {
198 res = iconv( from_ucs4, (ICONV_CONST char**)&inbuf, &inleft, &outbuf, &outleft );
199
200 if ( res == (size_t) -1 )
201 {
202 if ( errno == EILSEQ )
203 {
204 resultdata[outbuf - resultdata] = '_';
205
206 inbuf++;
207 inleft--;
208
209 outbuf++;
210 outleft--;
211
212 errors++;
213 }
214 else
215 {
216 printf("CCaseFolder::Fold: iconv() from unicode failed %d\n",errno);
217 free( folded );
218 free( resultdata );
219 return false;
220 }
221 }
222 }
223
224 /* finished with folded unicode */
225 free( folded );
226
227 /*
228 * copy result data to output variable - CString computes whatever the length is,
229 * the data is null terminated
230 */
231 output = resultdata;
232
233 /* free memory */
234 free( resultdata );
235
236 if ( errors > 0 )
237 {
238 printf("CCaseFolder::Fold: total %d EILSEQs encountered\n",errors);
239 }
240
241 return true;
242 }
243