1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2014, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  unistr_cnv.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:2
12 *
13 *   created on: 2004aug19
14 *   created by: Markus W. Scherer
15 *
16 *   Character conversion functions moved here from unistr.cpp
17 */
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_CONVERSION
22 
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/ucnv.h"
29 #include "ucnv_imp.h"
30 #include "putilimp.h"
31 #include "ustr_cnv.h"
32 #include "ustr_imp.h"
33 
34 U_NAMESPACE_BEGIN
35 
36 //========================================
37 // Constructors
38 //========================================
39 
40 #if !U_CHARSET_IS_UTF8
41 
UnicodeString(const char * codepageData)42 UnicodeString::UnicodeString(const char *codepageData) {
43     fUnion.fFields.fLengthAndFlags = kShortString;
44     if(codepageData != 0) {
45         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
46     }
47 }
48 
UnicodeString(const char * codepageData,int32_t dataLength)49 UnicodeString::UnicodeString(const char *codepageData,
50                              int32_t dataLength) {
51     fUnion.fFields.fLengthAndFlags = kShortString;
52     if(codepageData != 0) {
53         doCodepageCreate(codepageData, dataLength, 0);
54     }
55 }
56 
57 // else see unistr.cpp
58 #endif
59 
UnicodeString(const char * codepageData,const char * codepage)60 UnicodeString::UnicodeString(const char *codepageData,
61                              const char *codepage) {
62     fUnion.fFields.fLengthAndFlags = kShortString;
63     if(codepageData != 0) {
64         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
65     }
66 }
67 
UnicodeString(const char * codepageData,int32_t dataLength,const char * codepage)68 UnicodeString::UnicodeString(const char *codepageData,
69                              int32_t dataLength,
70                              const char *codepage) {
71     fUnion.fFields.fLengthAndFlags = kShortString;
72     if(codepageData != 0) {
73         doCodepageCreate(codepageData, dataLength, codepage);
74     }
75 }
76 
UnicodeString(const char * src,int32_t srcLength,UConverter * cnv,UErrorCode & errorCode)77 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
78                              UConverter *cnv,
79                              UErrorCode &errorCode) {
80     fUnion.fFields.fLengthAndFlags = kShortString;
81     if(U_SUCCESS(errorCode)) {
82         // check arguments
83         if(src==NULL) {
84             // treat as an empty string, do nothing more
85         } else if(srcLength<-1) {
86             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
87         } else {
88             // get input length
89             if(srcLength==-1) {
90                 srcLength=(int32_t)uprv_strlen(src);
91             }
92             if(srcLength>0) {
93                 if(cnv!=0) {
94                     // use the provided converter
95                     ucnv_resetToUnicode(cnv);
96                     doCodepageCreate(src, srcLength, cnv, errorCode);
97                 } else {
98                     // use the default converter
99                     cnv=u_getDefaultConverter(&errorCode);
100                     doCodepageCreate(src, srcLength, cnv, errorCode);
101                     u_releaseDefaultConverter(cnv);
102                 }
103             }
104         }
105 
106         if(U_FAILURE(errorCode)) {
107             setToBogus();
108         }
109     }
110 }
111 
112 //========================================
113 // Codeset conversion
114 //========================================
115 
116 #if !U_CHARSET_IS_UTF8
117 
118 int32_t
extract(int32_t start,int32_t length,char * target,uint32_t dstSize) const119 UnicodeString::extract(int32_t start,
120                        int32_t length,
121                        char *target,
122                        uint32_t dstSize) const {
123     return extract(start, length, target, dstSize, 0);
124 }
125 
126 // else see unistr.cpp
127 #endif
128 
129 int32_t
extract(int32_t start,int32_t length,char * target,uint32_t dstSize,const char * codepage) const130 UnicodeString::extract(int32_t start,
131                        int32_t length,
132                        char *target,
133                        uint32_t dstSize,
134                        const char *codepage) const
135 {
136     // if the arguments are illegal, then do nothing
137     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
138         return 0;
139     }
140 
141     // pin the indices to legal values
142     pinIndices(start, length);
143 
144     // We need to cast dstSize to int32_t for all subsequent code.
145     // I don't know why the API was defined with uint32_t but we are stuck with it.
146     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
147     // as a limit in some functions, it may wrap around and yield a pointer
148     // that compares less-than target.
149     int32_t capacity;
150     if(dstSize < 0x7fffffff) {
151         // Assume that the capacity is real and a limit pointer won't wrap around.
152         capacity = (int32_t)dstSize;
153     } else {
154         // Pin the capacity so that a limit pointer does not wrap around.
155         char *targetLimit = (char *)U_MAX_PTR(target);
156         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
157         // greater than target and does not wrap around the top of the address space.
158         capacity = (int32_t)(targetLimit - target);
159     }
160 
161     // create the converter
162     UConverter *converter;
163     UErrorCode status = U_ZERO_ERROR;
164 
165     // just write the NUL if the string length is 0
166     if(length == 0) {
167         return u_terminateChars(target, capacity, 0, &status);
168     }
169 
170     // if the codepage is the default, use our cache
171     // if it is an empty string, then use the "invariant character" conversion
172     if (codepage == 0) {
173         const char *defaultName = ucnv_getDefaultName();
174         if(UCNV_FAST_IS_UTF8(defaultName)) {
175             return toUTF8(start, length, target, capacity);
176         }
177         converter = u_getDefaultConverter(&status);
178     } else if (*codepage == 0) {
179         // use the "invariant characters" conversion
180         int32_t destLength;
181         if(length <= capacity) {
182             destLength = length;
183         } else {
184             destLength = capacity;
185         }
186         u_UCharsToChars(getArrayStart() + start, target, destLength);
187         return u_terminateChars(target, capacity, length, &status);
188     } else {
189         converter = ucnv_open(codepage, &status);
190     }
191 
192     length = doExtract(start, length, target, capacity, converter, status);
193 
194     // close the converter
195     if (codepage == 0) {
196         u_releaseDefaultConverter(converter);
197     } else {
198         ucnv_close(converter);
199     }
200 
201     return length;
202 }
203 
204 int32_t
extract(char * dest,int32_t destCapacity,UConverter * cnv,UErrorCode & errorCode) const205 UnicodeString::extract(char *dest, int32_t destCapacity,
206                        UConverter *cnv,
207                        UErrorCode &errorCode) const
208 {
209     if(U_FAILURE(errorCode)) {
210         return 0;
211     }
212 
213     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
214         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
215         return 0;
216     }
217 
218     // nothing to do?
219     if(isEmpty()) {
220         return u_terminateChars(dest, destCapacity, 0, &errorCode);
221     }
222 
223     // get the converter
224     UBool isDefaultConverter;
225     if(cnv==0) {
226         isDefaultConverter=TRUE;
227         cnv=u_getDefaultConverter(&errorCode);
228         if(U_FAILURE(errorCode)) {
229             return 0;
230         }
231     } else {
232         isDefaultConverter=FALSE;
233         ucnv_resetFromUnicode(cnv);
234     }
235 
236     // convert
237     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
238 
239     // release the converter
240     if(isDefaultConverter) {
241         u_releaseDefaultConverter(cnv);
242     }
243 
244     return len;
245 }
246 
247 int32_t
doExtract(int32_t start,int32_t length,char * dest,int32_t destCapacity,UConverter * cnv,UErrorCode & errorCode) const248 UnicodeString::doExtract(int32_t start, int32_t length,
249                          char *dest, int32_t destCapacity,
250                          UConverter *cnv,
251                          UErrorCode &errorCode) const
252 {
253     if(U_FAILURE(errorCode)) {
254         if(destCapacity!=0) {
255             *dest=0;
256         }
257         return 0;
258     }
259 
260     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
261     char *originalDest=dest;
262     const char *destLimit;
263 
264     if(destCapacity==0) {
265         destLimit=dest=0;
266     } else if(destCapacity==-1) {
267         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
268         destLimit=(char*)U_MAX_PTR(dest);
269         // for NUL-termination, translate into highest int32_t
270         destCapacity=0x7fffffff;
271     } else {
272         destLimit=dest+destCapacity;
273     }
274 
275     // perform the conversion
276     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
277     length=(int32_t)(dest-originalDest);
278 
279     // if an overflow occurs, then get the preflighting length
280     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
281         char buffer[1024];
282 
283         destLimit=buffer+sizeof(buffer);
284         do {
285             dest=buffer;
286             errorCode=U_ZERO_ERROR;
287             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
288             length+=(int32_t)(dest-buffer);
289         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
290     }
291 
292     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
293 }
294 
295 void
doCodepageCreate(const char * codepageData,int32_t dataLength,const char * codepage)296 UnicodeString::doCodepageCreate(const char *codepageData,
297                                 int32_t dataLength,
298                                 const char *codepage)
299 {
300     // if there's nothing to convert, do nothing
301     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
302         return;
303     }
304     if(dataLength == -1) {
305         dataLength = (int32_t)uprv_strlen(codepageData);
306     }
307 
308     UErrorCode status = U_ZERO_ERROR;
309 
310     // create the converter
311     // if the codepage is the default, use our cache
312     // if it is an empty string, then use the "invariant character" conversion
313     UConverter *converter;
314     if (codepage == 0) {
315         const char *defaultName = ucnv_getDefaultName();
316         if(UCNV_FAST_IS_UTF8(defaultName)) {
317             setToUTF8(StringPiece(codepageData, dataLength));
318             return;
319         }
320         converter = u_getDefaultConverter(&status);
321     } else if(*codepage == 0) {
322         // use the "invariant characters" conversion
323         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
324             u_charsToUChars(codepageData, getArrayStart(), dataLength);
325             setLength(dataLength);
326         } else {
327             setToBogus();
328         }
329         return;
330     } else {
331         converter = ucnv_open(codepage, &status);
332     }
333 
334     // if we failed, set the appropriate flags and return
335     if(U_FAILURE(status)) {
336         setToBogus();
337         return;
338     }
339 
340     // perform the conversion
341     doCodepageCreate(codepageData, dataLength, converter, status);
342     if(U_FAILURE(status)) {
343         setToBogus();
344     }
345 
346     // close the converter
347     if(codepage == 0) {
348         u_releaseDefaultConverter(converter);
349     } else {
350         ucnv_close(converter);
351     }
352 }
353 
354 void
doCodepageCreate(const char * codepageData,int32_t dataLength,UConverter * converter,UErrorCode & status)355 UnicodeString::doCodepageCreate(const char *codepageData,
356                                 int32_t dataLength,
357                                 UConverter *converter,
358                                 UErrorCode &status)
359 {
360     if(U_FAILURE(status)) {
361         return;
362     }
363 
364     // set up the conversion parameters
365     const char *mySource     = codepageData;
366     const char *mySourceEnd  = mySource + dataLength;
367     UChar *array, *myTarget;
368 
369     // estimate the size needed:
370     int32_t arraySize;
371     if(dataLength <= US_STACKBUF_SIZE) {
372         // try to use the stack buffer
373         arraySize = US_STACKBUF_SIZE;
374     } else {
375         // 1.25 UChar's per source byte should cover most cases
376         arraySize = dataLength + (dataLength >> 2);
377     }
378 
379     // we do not care about the current contents
380     UBool doCopyArray = FALSE;
381     for(;;) {
382         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
383             setToBogus();
384             break;
385         }
386 
387         // perform the conversion
388         array = getArrayStart();
389         myTarget = array + length();
390         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
391             &mySource, mySourceEnd, 0, TRUE, &status);
392 
393         // update the conversion parameters
394         setLength((int32_t)(myTarget - array));
395 
396         // allocate more space and copy data, if needed
397         if(status == U_BUFFER_OVERFLOW_ERROR) {
398             // reset the error code
399             status = U_ZERO_ERROR;
400 
401             // keep the previous conversion results
402             doCopyArray = TRUE;
403 
404             // estimate the new size needed, larger than before
405             // try 2 UChar's per remaining source byte
406             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
407         } else {
408             break;
409         }
410     }
411 }
412 
413 U_NAMESPACE_END
414 
415 #endif
416