1 /***************************************************************************
2          ccasefolder.cpp  -  Get a case folded version of a string
3                              -------------------
4     begin                : Thu Jul 17 2008
5     copyright            : (C) 2008 by Edward Sheldrake
6     email                : ejs1920@yahoo.co.uk
7  ***************************************************************************/
8 
9 /***************************************************************************
10  *                                                                         *
11  *   This program is free software; you can redistribute it and/or modify  *
12  *   it under the terms of the GNU General Public License as published by  *
13  *   the Free Software Foundation; either version 2 of the License, or     *
14  *   (at your option) any later version.                                   *
15  *                                                                         *
16  ***************************************************************************/
17 
18 #include "ccasefolder.h"
19 
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23 
24 #ifndef ICONV_CONST
25 #define ICONV_CONST const
26 #endif
27 
28 /*
29  * Using iconv directly to convert to/from local and unicode.
30  * CIconv outputs a CString and that isn't really appropriate.
31  * There is mbstowcs() but it doesn't seem to work.
32  */
33 #include <iconv.h>
34 
35 /* For local encoding */
36 #include "cconfig.h"
37 
38 /* calloc */
39 #include <stdlib.h>
40 
41 /* printf */
42 #include <stdio.h>
43 
44 /* errno */
45 #include <errno.h>
46 
47 /* int32_t */
48 #ifdef HAVE_STDINT_H
49 #include <stdint.h>
50 #else
51 #define int32_t int
52 #endif
53 
54 #ifdef WORDS_BIGENDIAN
55 #define UCS4_HOST "UCS-4BE"
56 #else
57 #define UCS4_HOST "UCS-4LE"
58 #endif
59 
60 /** */
CCaseFolder()61 CCaseFolder::CCaseFolder()
62 {
63 	CString localenc;
64 	if ( CConfig::Instance() )
65 	{
66 		localenc = CConfig::Instance()->GetLocalEncoding();
67 	}
68 	else
69 	{
70 		localenc = "UTF-8";
71 	}
72 
73 	to_ucs4 = iconv_open( UCS4_HOST, localenc.Data() );
74 	from_ucs4 = iconv_open( localenc.Data(), UCS4_HOST );
75 }
76 
77 /** */
~CCaseFolder()78 CCaseFolder::~CCaseFolder()
79 {
80 	if ( to_ucs4 != (iconv_t)-1 )
81 	{
82 		iconv_close( to_ucs4 );
83 	}
84 
85 	if ( from_ucs4 != (iconv_t)-1 )
86 	{
87 		iconv_close( from_ucs4 );
88 	}
89 }
90 
91 /** */
Fold(const CString & input,CString & output)92 bool CCaseFolder::Fold( const CString & input, CString & output )
93 {
94 	if ( (to_ucs4 == (iconv_t) -1) || (from_ucs4 == (iconv_t) -1) )
95 	{
96 		return false;
97 	}
98 
99 	/* wchar_t was only 2 bytes on Windows, int32_t is always 4 bytes */
100 	const int int32_t_size = sizeof(int32_t);
101 	int errors = 0;
102 
103 	char * inbuf = input.Data();
104 	size_t inleft = input.Length();
105 
106 	size_t outleft = inleft * int32_t_size + 10;
107 	int32_t * unfolded = (int32_t*) calloc( 1, outleft );
108 
109 	if ( !unfolded )
110 	{
111 		return false;
112 	}
113 
114 	char * outbuf = (char*) unfolded;
115 	char * outstart = outbuf;
116 
117 	size_t res = (size_t) -1;
118 
119 	while ( res == (size_t) -1 )
120 	{
121 		res = iconv( to_ucs4, (ICONV_CONST char **)&inbuf, &inleft, &outbuf, &outleft );
122 
123 		if ( res == (size_t) -1 )
124 		{
125 			if ( errno == EILSEQ )
126 			{
127 				outstart[outbuf - outstart] = '_';
128 
129 				inbuf++;
130 				inleft--;
131 
132 				outbuf++;
133 				outleft--;
134 
135 				errors++;
136 			}
137 			else
138 			{
139 				printf("CCaseFolder::Fold: iconv() to unicode failed %d\n",errno);
140 				free(unfolded);
141 				return false;
142 			}
143 		}
144 	}
145 
146 	/* output length in bytes */
147 	size_t len = (input.Length() * int32_t_size + 10) - outleft;
148 
149 	/* output length in wchars */
150 
151 	if ( len%int32_t_size != 0 )
152 	{
153 		printf("CCaseFolder::Fold: unexpected output size\n");
154 		free( unfolded );
155 		return false;
156 	}
157 
158 	size_t wlen = len / int32_t_size;
159 
160 	/* greatest change is 1 character to 3 */
161 	size_t foldedlen = len * 3 + 10;
162 
163 	int32_t * folded = (int32_t *) calloc( 1, foldedlen );
164 
165 	if ( !folded )
166 	{
167 		free(unfolded);
168 		return false;
169 	}
170 
171 	size_t outpos = 0;
172 	for ( size_t i = 0; i < wlen; ++i )
173 	{
174 		switch ( unfolded[i] )
175 		{
176 #include "ccasefolder-generated-code.cpp"
177 			default:
178 				folded[outpos] = unfolded[i];
179 				++outpos;
180 				break;
181 		}
182 	}
183 
184 	/* finished with unfolded unicode */
185 	free( unfolded );
186 
187 	inbuf = (char*) folded;
188 	inleft = outpos * int32_t_size;
189 
190 	outleft = (outpos * int32_t_size) + 10;
191 	char * resultdata = (char*) calloc( 1, outleft );
192 	outbuf = resultdata;
193 
194 	res = (size_t) -1;
195 
196 	while ( res == (size_t) -1 )
197 	{
198 		res = iconv( from_ucs4, (ICONV_CONST char**)&inbuf, &inleft, &outbuf, &outleft );
199 
200 		if ( res == (size_t) -1 )
201 		{
202 			if ( errno == EILSEQ )
203 			{
204 				resultdata[outbuf - resultdata] = '_';
205 
206 				inbuf++;
207 				inleft--;
208 
209 				outbuf++;
210 				outleft--;
211 
212 				errors++;
213 			}
214 			else
215 			{
216 				printf("CCaseFolder::Fold: iconv() from unicode failed %d\n",errno);
217 				free( folded );
218 				free( resultdata );
219 				return false;
220 			}
221 		}
222 	}
223 
224 	/* finished with folded unicode */
225 	free( folded );
226 
227 	/*
228 	 * copy result data to output variable - CString computes whatever the length is,
229 	 * the data is null terminated
230 	 */
231 	output = resultdata;
232 
233 	/* free memory */
234 	free( resultdata );
235 
236 	if ( errors > 0 )
237 	{
238 		printf("CCaseFolder::Fold: total %d EILSEQs encountered\n",errors);
239 	}
240 
241 	return true;
242 }
243