1 /*
2  *
3  *  Copyright (C) 2011-2017, OFFIS e.V.
4  *  All rights reserved.  See COPYRIGHT file for details.
5  *
6  *  This software and supporting documentation were developed by
7  *
8  *    OFFIS e.V.
9  *    R&D Division Health
10  *    Escherweg 2
11  *    D-26121 Oldenburg, Germany
12  *
13  *
14  *  Module:  dcmdata
15  *
16  *  Author:  Joerg Riesmeier
17  *
18  *  Purpose: Class for supporting the Specific Character Set attribute
19  *
20  */
21 
22 
23 #include "dcmtk/config/osconfig.h"    /* make sure OS specific configuration is included first */
24 
25 #include "dcmtk/dcmdata/dcspchrs.h"
26 #include "dcmtk/dcmdata/dcitem.h"
27 #include "dcmtk/dcmdata/dcbytstr.h"
28 #include "dcmtk/dcmdata/dcdeftag.h"
29 #include "dcmtk/ofstd/ofstream.h"
30 #include "dcmtk/ofstd/ofstd.h"
31 
32 
33 #define MAX_OUTPUT_STRING_LENGTH 60
34 
35 
36 /*------------------*
37  *  implementation  *
38  *------------------*/
39 
DcmSpecificCharacterSet()40 DcmSpecificCharacterSet::DcmSpecificCharacterSet()
41   : SourceCharacterSet(),
42     DestinationCharacterSet(),
43     DestinationEncoding(),
44     DefaultEncodingConverter(),
45     EncodingConverters()
46 {
47 }
48 
49 
~DcmSpecificCharacterSet()50 DcmSpecificCharacterSet::~DcmSpecificCharacterSet()
51 {
52     clear();
53 }
54 
55 
clear()56 void DcmSpecificCharacterSet::clear()
57 {
58     DefaultEncodingConverter.clear();
59     EncodingConverters.clear();
60     SourceCharacterSet.clear();
61     DestinationCharacterSet.clear();
62     DestinationEncoding.clear();
63 }
64 
65 
operator OFBool() const66 DcmSpecificCharacterSet::operator OFBool() const
67 {
68     return OFstatic_cast(OFBool, DefaultEncodingConverter);
69 }
70 
71 
operator !() const72 OFBool DcmSpecificCharacterSet::operator!() const
73 {
74     return !DefaultEncodingConverter;
75 }
76 
77 
getSourceCharacterSet() const78 const OFString &DcmSpecificCharacterSet::getSourceCharacterSet() const
79 {
80     return SourceCharacterSet;
81 }
82 
83 
getDestinationCharacterSet() const84 const OFString &DcmSpecificCharacterSet::getDestinationCharacterSet() const
85 {
86     return DestinationCharacterSet;
87 }
88 
89 
getDestinationEncoding() const90 const OFString &DcmSpecificCharacterSet::getDestinationEncoding() const
91 {
92     return DestinationEncoding;
93 }
94 
95 
getConversionFlags() const96 unsigned DcmSpecificCharacterSet::getConversionFlags() const
97 {
98     return DefaultEncodingConverter.getConversionFlags();
99 }
100 
101 
setConversionFlags(const unsigned flags)102 OFCondition DcmSpecificCharacterSet::setConversionFlags(const unsigned flags)
103 {
104     if (!EncodingConverters.empty())
105     {
106         /* pass conversion flags to all "encoding converters" */
107         for (T_EncodingConvertersMap::iterator it = EncodingConverters.begin();
108             it != EncodingConverters.end(); ++it)
109         {
110             OFCondition status = it->second.setConversionFlags(flags);
111             if (status.bad())
112                 return status;
113         }
114         return EC_Normal;
115     } else return DefaultEncodingConverter.setConversionFlags(flags);
116 }
117 
118 
selectCharacterSet(const OFString & fromCharset,const OFString & toCharset)119 OFCondition DcmSpecificCharacterSet::selectCharacterSet(const OFString &fromCharset,
120                                                         const OFString &toCharset)
121 {
122     // first, make sure that all converters are cleared
123     clear();
124     // determine the destination encoding (and check whether it is supported at all)
125     OFCondition status = determineDestinationEncoding(toCharset);
126     if (status.good())
127     {
128         // normalize the given string (original VR is "CS" with VM "1-n")
129         SourceCharacterSet = fromCharset;
130         normalizeString(SourceCharacterSet, MULTIPART, DELETE_LEADING, DELETE_TRAILING);
131         // check whether it is multi-valued
132         const unsigned long sourceVM = DcmElement::determineVM(SourceCharacterSet.c_str(), SourceCharacterSet.length());
133         if (sourceVM == 0)
134         {
135             // no character set specified, use ASCII
136             status = DefaultEncodingConverter.selectEncoding("ASCII", DestinationEncoding);
137             // output some useful debug information
138             if (status.good())
139             {
140                 DCMDATA_DEBUG("DcmSpecificCharacterSet: Selected character set '' (ASCII) "
141                     << "for the conversion to " << DestinationEncoding);
142             }
143         }
144         else if (sourceVM == 1)
145         {
146             // a single character set specified (no code extensions)
147             status = selectCharacterSetWithoutCodeExtensions();
148         } else {
149             // multiple character sets specified (code extensions used)
150             status = selectCharacterSetWithCodeExtensions(sourceVM);
151         }
152     }
153     return status;
154 }
155 
156 
selectCharacterSet(DcmItem & dataset,const OFString & toCharset)157 OFCondition DcmSpecificCharacterSet::selectCharacterSet(DcmItem &dataset,
158                                                         const OFString &toCharset)
159 {
160     OFString fromCharset;
161     // check whether Specific Character Set (0008,0005) is present in the given item/dataset
162     dataset.findAndGetOFStringArray(DCM_SpecificCharacterSet, fromCharset, OFFalse /*searchIntoSub*/);
163     // if missing or empty, the default character set (ASCII) will be used
164     return selectCharacterSet(fromCharset, toCharset);
165 }
166 
167 
determineDestinationEncoding(const OFString & toCharset)168 OFCondition DcmSpecificCharacterSet::determineDestinationEncoding(const OFString &toCharset)
169 {
170     OFCondition status = EC_Normal;
171     // normalize the given string (original VR is "CS" with VM "1-n", but we only support VM "1")
172     DestinationCharacterSet = toCharset;
173     normalizeString(DestinationCharacterSet, !MULTIPART, DELETE_LEADING, DELETE_TRAILING);
174     // there should only be a single character set specified (no code extensions)
175     if (DestinationCharacterSet.empty())                // ASCII (no value)
176         DestinationEncoding = "ASCII";
177     else if (DestinationCharacterSet == "ISO_IR 6")     // ASCII
178     {
179         DCMDATA_WARN("DcmSpecificCharacterSet: 'ISO_IR 6' is not a defined term in DICOM, "
180             << "will be treated as an empty value (ASCII)");
181         DestinationCharacterSet.clear();
182         DestinationEncoding = "ASCII";
183     }
184     else if (DestinationCharacterSet == "ISO_IR 100")   // Latin alphabet No. 1
185         DestinationEncoding = "ISO-8859-1";
186     else if (DestinationCharacterSet == "ISO_IR 101")   // Latin alphabet No. 2
187         DestinationEncoding = "ISO-8859-2";
188     else if (DestinationCharacterSet == "ISO_IR 109")   // Latin alphabet No. 3
189         DestinationEncoding = "ISO-8859-3";
190     else if (DestinationCharacterSet == "ISO_IR 110")   // Latin alphabet No. 4
191         DestinationEncoding = "ISO-8859-4";
192     else if (DestinationCharacterSet == "ISO_IR 144")   // Cyrillic
193         DestinationEncoding = "ISO-8859-5";
194     else if (DestinationCharacterSet == "ISO_IR 127")   // Arabic
195         DestinationEncoding = "ISO-8859-6";
196     else if (DestinationCharacterSet == "ISO_IR 126")   // Greek
197         DestinationEncoding = "ISO-8859-7";
198     else if (DestinationCharacterSet == "ISO_IR 138")   // Hebrew
199         DestinationEncoding = "ISO-8859-8";
200     else if (DestinationCharacterSet == "ISO_IR 148")   // Latin alphabet No. 5
201         DestinationEncoding = "ISO-8859-9";
202     else if (DestinationCharacterSet == "ISO_IR 13")    // Japanese
203 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICONV
204         DestinationEncoding = "JIS_X0201";              // - the name "ISO-IR-13" is not supported by libiconv
205 #else
206         DestinationEncoding = "Shift_JIS";              // - ICU and stdlibc iconv only know "Shift_JIS" (is this mapping correct?)
207 #endif
208     else if (DestinationCharacterSet == "ISO_IR 166")   // Thai
209 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICU
210         DestinationEncoding = "TIS-620";                // - the name "ISO-IR-166" is not supported by ICU
211 #else
212         DestinationEncoding = "ISO-IR-166";
213 #endif
214     else if (DestinationCharacterSet == "ISO_IR 192")   // Unicode in UTF-8 (multi-byte)
215         DestinationEncoding = "UTF-8";
216     else if (DestinationCharacterSet == "GB18030")      // Chinese (multi-byte)
217         DestinationEncoding = "GB18030";
218     else if (DestinationCharacterSet == "GBK")          // Chinese (multi-byte, subset of "GB 18030")
219         DestinationEncoding = "GBK";
220     else {
221         DestinationEncoding.clear();
222         // create an appropriate error code
223         OFOStringStream stream;
224         stream << "Cannot select destination character set: SpecificCharacterSet (0008,0005) value '"
225             << DestinationCharacterSet << "' not supported" << OFStringStream_ends;
226         OFSTRINGSTREAM_GETOFSTRING(stream, message)
227         status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotSelectCharacterSet, OF_error, message.c_str());
228     }
229     return status;
230 }
231 
232 
selectCharacterSetWithoutCodeExtensions()233 OFCondition DcmSpecificCharacterSet::selectCharacterSetWithoutCodeExtensions()
234 {
235     OFCondition status = EC_Normal;
236     // a single character set specified (no code extensions)
237     OFString fromEncoding;
238     if (SourceCharacterSet == "ISO_IR 6")           // ASCII
239     {
240         DCMDATA_WARN("DcmSpecificCharacterSet: 'ISO_IR 6' is not a defined term in DICOM, "
241             << "will be treated as an empty value (ASCII)");
242         SourceCharacterSet.clear();
243         fromEncoding = "ASCII";
244     }
245     else if (SourceCharacterSet == "ISO_IR 100")    // Latin alphabet No. 1
246         fromEncoding = "ISO-8859-1";
247     else if (SourceCharacterSet == "ISO_IR 101")    // Latin alphabet No. 2
248         fromEncoding = "ISO-8859-2";
249     else if (SourceCharacterSet == "ISO_IR 109")    // Latin alphabet No. 3
250         fromEncoding = "ISO-8859-3";
251     else if (SourceCharacterSet == "ISO_IR 110")    // Latin alphabet No. 4
252         fromEncoding = "ISO-8859-4";
253     else if (SourceCharacterSet == "ISO_IR 144")    // Cyrillic
254         fromEncoding = "ISO-8859-5";
255     else if (SourceCharacterSet == "ISO_IR 127")    // Arabic
256         fromEncoding = "ISO-8859-6";
257     else if (SourceCharacterSet == "ISO_IR 126")    // Greek
258         fromEncoding = "ISO-8859-7";
259     else if (SourceCharacterSet == "ISO_IR 138")    // Hebrew
260         fromEncoding = "ISO-8859-8";
261     else if (SourceCharacterSet == "ISO_IR 148")    // Latin alphabet No. 5
262         fromEncoding = "ISO-8859-9";
263     else if (SourceCharacterSet == "ISO_IR 13")     // Japanese
264 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICONV
265         fromEncoding = "JIS_X0201";                 // - the name "ISO-IR-13" is not supported by libiconv
266 #else
267         fromEncoding = "Shift_JIS";                 // - ICU and stdlibc iconv only know "Shift_JIS" (is this mapping correct?)
268 #endif
269     else if (SourceCharacterSet == "ISO_IR 166")    // Thai
270 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICU
271         fromEncoding = "TIS-620";                   // - the name "ISO-IR-166" is not supported by ICU
272 #else
273         fromEncoding = "ISO-IR-166";
274 #endif
275     else if (SourceCharacterSet == "ISO_IR 192")    // Unicode in UTF-8 (multi-byte)
276         fromEncoding = "UTF-8";
277     else if (SourceCharacterSet == "GB18030")       // Chinese (multi-byte)
278         fromEncoding = "GB18030";
279     else if (SourceCharacterSet == "GBK")           // Chinese (multi-byte, subset of "GB 18030")
280         fromEncoding = "GBK";
281     else {
282         // create an appropriate error code
283         OFOStringStream stream;
284         stream << "Cannot select source character set: SpecificCharacterSet (0008,0005) value '"
285             << SourceCharacterSet << "' not supported" << OFStringStream_ends;
286         OFSTRINGSTREAM_GETOFSTRING(stream, message)
287         status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotSelectCharacterSet, OF_error, message.c_str());
288     }
289     // check whether an appropriate character encoding has been found
290     if (!fromEncoding.empty())
291     {
292         status = DefaultEncodingConverter.selectEncoding(fromEncoding, DestinationEncoding);
293         // output some useful debug information
294         if (status.good())
295         {
296             DCMDATA_DEBUG("DcmSpecificCharacterSet: Selected character set '" << SourceCharacterSet
297                 << "' (" << fromEncoding << ") for the conversion to " << DestinationEncoding);
298         }
299     }
300     return status;
301 }
302 
303 
selectCharacterSetWithCodeExtensions(const unsigned long sourceVM)304 OFCondition DcmSpecificCharacterSet::selectCharacterSetWithCodeExtensions(const unsigned long sourceVM)
305 {
306     // first, check whether multiple character sets are specified (i.e. code extensions used)
307     if (sourceVM <= 1)
308         return EC_IllegalCall;
309     // then proceed with the real work
310     OFCondition status = EC_Normal;
311     size_t pos = 0;
312     OFBool needsASCII = OFFalse;
313     OFBool notFirstValue = OFFalse;
314     OFString definedTerm;
315     unsigned long i = 0;
316     while ((i < sourceVM) && status.good())
317     {
318         // extract single value from string (separated by a backslash)
319         pos = DcmElement::getValueFromString(SourceCharacterSet.c_str(), pos, SourceCharacterSet.length(), definedTerm);
320         if (definedTerm.empty() && (i == 0))            // assuming ASCII (according to DICOM PS 3.5)
321             definedTerm = "ISO 2022 IR 6";
322         // determine character encoding from DICOM defined term
323         OFString encodingName;
324         if (definedTerm == "ISO 2022 IR 6")             // ASCII
325             encodingName = "ASCII";
326         else if (definedTerm == "ISO 2022 IR 100")      // Latin alphabet No. 1
327         {
328             encodingName = "ISO-8859-1";
329             needsASCII = OFTrue;
330         }
331         else if (definedTerm == "ISO 2022 IR 101")      // Latin alphabet No. 2
332         {
333             encodingName = "ISO-8859-2";
334             needsASCII = OFTrue;
335         }
336         else if (definedTerm == "ISO 2022 IR 109")      // Latin alphabet No. 3
337         {
338             encodingName = "ISO-8859-3";
339             needsASCII = OFTrue;
340         }
341         else if (definedTerm == "ISO 2022 IR 110")      // Latin alphabet No. 4
342         {
343             encodingName = "ISO-8859-4";
344             needsASCII = OFTrue;
345         }
346         else if (definedTerm == "ISO 2022 IR 144")      // Cyrillic
347         {
348             encodingName = "ISO-8859-5";
349             needsASCII = OFTrue;
350         }
351         else if (definedTerm == "ISO 2022 IR 127")      // Arabic
352         {
353             encodingName = "ISO-8859-6";
354             needsASCII = OFTrue;
355         }
356         else if (definedTerm == "ISO 2022 IR 126")      // Greek
357         {
358             encodingName = "ISO-8859-7";
359             needsASCII = OFTrue;
360         }
361         else if (definedTerm == "ISO 2022 IR 138")      // Hebrew
362         {
363             encodingName = "ISO-8859-8";
364             needsASCII = OFTrue;
365         }
366         else if (definedTerm == "ISO 2022 IR 148")      // Latin alphabet No. 5
367         {
368             encodingName = "ISO-8859-9";
369             needsASCII = OFTrue;
370         }
371         else if (definedTerm == "ISO 2022 IR 13")       // Japanese
372         {
373 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICONV
374             encodingName = "JIS_X0201";                 // - the name "ISO-IR-13" is not supported by libiconv
375 #else
376             encodingName = "Shift_JIS";                 // - ICU and stdlibc iconv only know "Shift_JIS" (is this mapping correct?)
377 #endif
378         }
379         else if (definedTerm == "ISO 2022 IR 166")      // Thai
380         {
381 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICU
382             encodingName = "TIS-620";                   // - "ISO-IR-166" is not supported by ICU
383 #else
384             encodingName = "ISO-IR-166";
385 #endif
386             needsASCII = OFTrue;
387         }
388         else if (definedTerm == "ISO 2022 IR 87")       // Japanese (multi-byte)
389         {
390             encodingName = "ISO-IR-87";                 // - this might generate an error since "ISO-IR-87" is not supported by ICU and stdlibc iconv
391             notFirstValue = OFTrue;
392         }
393         else if (definedTerm == "ISO 2022 IR 159")      // Japanese (multi-byte)
394         {
395             encodingName = "ISO-IR-159";                // - this might generate an error since "ISO-IR-159" is not supported by ICU and stdlibc iconv
396             notFirstValue = OFTrue;
397         }
398         else if (definedTerm == "ISO 2022 IR 149")      // Korean (multi-byte)
399         {
400             encodingName = "EUC-KR";                    // - is this mapping really correct?
401             notFirstValue = OFTrue;                     //   "ISO-IR-149" does not work with the sample from DICOM PS 3.5
402         }
403         else if (definedTerm == "ISO 2022 IR 58")       // Simplified Chinese (multi-byte)
404         {
405             encodingName = "GB2312";                    // - should work, but not tested yet!
406             notFirstValue = OFTrue;
407         }
408         else {
409             // create an appropriate error code
410             OFOStringStream stream;
411             stream << "Cannot select source character set: SpecificCharacterSet (0008,0005) value " << (i + 1)
412                 << " of " << sourceVM << " '" << definedTerm << "' not supported" << OFStringStream_ends;
413             OFSTRINGSTREAM_GETOFSTRING(stream, message)
414             status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotSelectCharacterSet, OF_error, message.c_str());
415         }
416         // check whether character set is allowed as the default (first value)
417         if ((i == 0) && notFirstValue)
418         {
419             OFOStringStream stream;
420             stream << "Cannot select source character set: '" << definedTerm << "' is not a allowed "
421                 << "as the first value in SpecificCharacterSet (0008,0005)" << OFStringStream_ends;
422             OFSTRINGSTREAM_GETOFSTRING(stream, message)
423             status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotSelectCharacterSet, OF_error, message.c_str());
424         }
425         // add descriptor to the map using the defined term as a key
426         if (status.good() && !encodingName.empty())
427         {
428             OFPair<T_EncodingConvertersMap::iterator,OFBool> conv = EncodingConverters.insert(
429                 OFMake_pair(definedTerm, OFCharacterEncoding()));
430             // but first check whether this encoding has already been added before
431             if (conv.second)
432             {
433                 status = conv.first->second.selectEncoding(encodingName, DestinationEncoding);
434                 if (status.good())
435                 {
436                     // output some useful debug information
437                     DCMDATA_DEBUG("DcmSpecificCharacterSet: Added character set '" << definedTerm
438                         << "' (" << encodingName << ") for the conversion to " << DestinationEncoding);
439                     // also remember the default descriptor, which refers to the first character set
440                     if (i == 0)
441                     {
442                         DefaultEncodingConverter = conv.first->second;
443                         DCMDATA_TRACE("DcmSpecificCharacterSet: Also selected this character set "
444                             << "(i.e. '" << definedTerm << "') as the default one");
445                     }
446                 } else {
447                     DCMDATA_ERROR("DcmSpecificCharacterSet: '" << definedTerm <<
448                         "' is not supported by the utilized character set conversion library '"
449                         << OFCharacterEncoding::getLibraryVersionString() << '\'');
450                     EncodingConverters.erase(conv.first);
451                 }
452             } else {
453                 DCMDATA_WARN("DcmSpecificCharacterSet: '" << definedTerm << "' is defined more than once "
454                     << "in SpecificCharacterSet (0008,0005), ignoring the duplicate definition");
455             }
456         }
457         ++i;
458     }
459     // add ASCII to the map if needed but not already there
460     if (status.good() && needsASCII)
461     {
462         OFPair<T_EncodingConvertersMap::iterator,OFBool> conv = EncodingConverters.insert(
463             OFMake_pair(OFString("ISO 2022 IR 6"), OFCharacterEncoding()));
464         if (conv.second)
465         {
466             status = conv.first->second.selectEncoding("ASCII", DestinationEncoding);
467             if (status.good())
468             {
469                 // output some useful debug information
470                 DCMDATA_DEBUG("DcmSpecificCharacterSet: Added character set 'ISO 2022 IR 6' (ASCII) "
471                     << "for the conversion to " << DestinationEncoding
472                     << " (because it is needed for one or more of the previously added character sets)");
473             } else {
474                 DCMDATA_ERROR("DcmSpecificCharacterSet: 'ISO 2022 IR 6' is not supported by"
475                     << " the utilized character set conversion library '"
476                     << OFCharacterEncoding::getLibraryVersionString() << '\'');
477                 EncodingConverters.erase(conv.first);
478             }
479         }
480     }
481     return status;
482 }
483 
484 
convertString(const OFString & fromString,OFString & toString,const OFString & delimiters)485 OFCondition DcmSpecificCharacterSet::convertString(const OFString &fromString,
486                                                    OFString &toString,
487                                                    const OFString &delimiters)
488 {
489     // call the real method converting the given string
490     return convertString(fromString.c_str(), fromString.length(), toString, delimiters);
491 }
492 
493 
convertString(const char * fromString,const size_t fromLength,OFString & toString,const OFString & delimiters)494 OFCondition DcmSpecificCharacterSet::convertString(const char *fromString,
495                                                    const size_t fromLength,
496                                                    OFString &toString,
497                                                    const OFString &delimiters)
498 {
499     OFCondition status = EC_Normal;
500     // check whether there are any code extensions at all
501     if (EncodingConverters.empty() || !checkForEscapeCharacter(fromString, fromLength))
502     {
503         DCMDATA_DEBUG("DcmSpecificCharacterSet: Converting '"
504             << convertToLengthLimitedOctalString(fromString, fromLength) << "'");
505         // no code extensions according to ISO 2022 used - this is the simple case
506         status = DefaultEncodingConverter.convertString(fromString, fromLength, toString, OFTrue /*clearMode*/);
507     } else {
508         if (delimiters.empty())
509         {
510             DCMDATA_DEBUG("DcmSpecificCharacterSet: Converting '"
511                 << convertToLengthLimitedOctalString(fromString, fromLength)
512                 << "' (with code extensions)");
513         } else {
514             DCMDATA_DEBUG("DcmSpecificCharacterSet: Converting '"
515                 << convertToLengthLimitedOctalString(fromString, fromLength)
516                 << "' (with code extensions and delimiters '" << delimiters << "')");
517         }
518         // code extensions according to ISO 2022 used, so we need to check for
519         // particular escape sequences in order to switch between character sets
520         toString.clear();
521         size_t pos = 0;
522         // some (extended) character sets use more than 1 byte per character
523         // (however, the default character set always uses a single byte)
524         unsigned char bytesPerChar = 1;
525         // check whether '=' is a delimiter, as it is used in PN values
526         OFBool isFirstGroup = (delimiters.find('=') != OFString_npos);
527         // by default, we expect that delimiters can be checked by their corresponding ASCII codes
528         // (this implies that the default character set is not "ISO 2022 IR 87" or "ISO 2022 IR 159")
529         OFBool checkDelimiters = OFTrue;
530         const char *firstChar = fromString;
531         const char *currentChar = fromString;
532         // initially, use the default descriptor
533         OFCharacterEncoding converter = DefaultEncodingConverter;
534         DCMDATA_TRACE("  Starting with the default character set");
535         // iterate over all characters of the string (as long as there is no error)
536         while ((pos < fromLength) && status.good())
537         {
538             const char c0 = *currentChar++;
539             // check for characters ESC, HT, LF, FF, CR or any other specified delimiter
540             const OFBool isEscape = (c0 == '\033');
541             const OFBool isDelimiter = checkDelimiters &&
542                 ((c0 == '\011') || (c0 == '\012') || (c0 == '\014') || (c0 == '\015') || (delimiters.find(c0) != OFString_npos));
543             if (isEscape || isDelimiter)
544             {
545                 // convert the sub-string (before the delimiter) with the current character set
546                 const size_t convertLength = currentChar - firstChar - 1;
547                 if (convertLength > 0)
548                 {
549                     // output some debug information
550                     DCMDATA_TRACE("    Converting sub-string '"
551                         << convertToLengthLimitedOctalString(firstChar, convertLength) << "'");
552                     status = converter.convertString(firstChar, convertLength, toString, OFFalse /*clearMode*/);
553                     if (status.bad())
554                         DCMDATA_TRACE("    -> ERROR: " << status.text());
555                 }
556                 // check whether this was the first component group of a PN value
557                 if (isDelimiter && (c0 == '='))
558                     isFirstGroup = OFFalse;
559             }
560             // the ESC character is used to explicitly switch between character sets
561             if (isEscape)
562             {
563                 // report a warning as this is a violation of DICOM PS 3.5 Section 6.2.1
564                 if (isFirstGroup)
565                 {
566                     DCMDATA_WARN("DcmSpecificCharacterSet: Escape sequences shall not be used "
567                         << "in the first component group of a Person Name (PN), using them anyway");
568                 }
569                 // we need at least two more characters to determine the new character set
570                 size_t escLength = 2;
571                 if (pos + escLength < fromLength)
572                 {
573                     OFString key;
574                     const char c1 = *currentChar++;
575                     const char c2 = *currentChar++;
576                     char c3 = '\0';
577                     if ((c1 == 0x28) && (c2 == 0x42))       // ASCII
578                         key = "ISO 2022 IR 6";
579                     else if ((c1 == 0x2d) && (c2 == 0x41))  // Latin alphabet No. 1
580                         key = "ISO 2022 IR 100";
581                     else if ((c1 == 0x2d) && (c2 == 0x42))  // Latin alphabet No. 2
582                         key = "ISO 2022 IR 101";
583                     else if ((c1 == 0x2d) && (c2 == 0x43))  // Latin alphabet No. 3
584                         key = "ISO 2022 IR 109";
585                     else if ((c1 == 0x2d) && (c2 == 0x44))  // Latin alphabet No. 4
586                         key = "ISO 2022 IR 110";
587                     else if ((c1 == 0x2d) && (c2 == 0x4c))  // Cyrillic
588                         key = "ISO 2022 IR 144";
589                     else if ((c1 == 0x2d) && (c2 == 0x47))  // Arabic
590                         key = "ISO 2022 IR 127";
591                     else if ((c1 == 0x2d) && (c2 == 0x46))  // Greek
592                         key = "ISO 2022 IR 126";
593                     else if ((c1 == 0x2d) && (c2 == 0x48))  // Hebrew
594                         key = "ISO 2022 IR 138";
595                     else if ((c1 == 0x2d) && (c2 == 0x4d))  // Latin alphabet No. 5
596                         key = "ISO 2022 IR 148";
597                     else if ((c1 == 0x29) && (c2 == 0x49))  // Japanese
598                         key = "ISO 2022 IR 13";
599                     else if ((c1 == 0x28) && (c2 == 0x4a))  // Japanese - is this really correct?
600                         key = "ISO 2022 IR 13";
601                     else if ((c1 == 0x2d) && (c2 == 0x54))  // Thai
602                         key = "ISO 2022 IR 166";
603                     else if ((c1 == 0x24) && (c2 == 0x42))  // Japanese (multi-byte)
604                         key = "ISO 2022 IR 87";
605                     else if ((c1 == 0x24) && (c2 == 0x28))  // Japanese (multi-byte)
606                     {
607                         escLength = 3;
608                         // do we still have another character in the string?
609                         if (pos + escLength < fromLength)
610                         {
611                             c3 = *currentChar++;
612                             if (c3 == 0x44)
613                                 key = "ISO 2022 IR 159";
614                         }
615                     }
616                     else if ((c1 == 0x24) && (c2 == 0x29)) // might be Korean or Chinese
617                     {
618                         escLength = 3;
619                         // do we still have another character in the string?
620                         if (pos + escLength < fromLength)
621                         {
622                             c3 = *currentChar++;
623                             if (c3 == 0x43)                // Korean (single- and multi-byte)
624                                 key = "ISO 2022 IR 149";
625                             else if (c3 == 0x41)           // Simplified Chinese (multi-byte)
626                                 key = "ISO 2022 IR 58";
627                         }
628                     }
629                     // check whether a valid escape sequence has been found
630                     if (key.empty())
631                     {
632                         OFOStringStream stream;
633                         stream << "Cannot convert character set: Illegal escape sequence 'ESC "
634                             << STD_NAMESPACE dec << STD_NAMESPACE setfill('0')
635                             << STD_NAMESPACE setw(2) << OFstatic_cast(int, c1 >> 4) << "/"
636                             << STD_NAMESPACE setw(2) << OFstatic_cast(int, c1 & 0x0f) << " "
637                             << STD_NAMESPACE setw(2) << OFstatic_cast(int, c2 >> 4) << "/"
638                             << STD_NAMESPACE setw(2) << OFstatic_cast(int, c2 & 0x0f);
639                         if (escLength == 3)
640                         {
641                             stream << " " << STD_NAMESPACE setw(2) << OFstatic_cast(int, c3 >> 4) << "/"
642                                 << STD_NAMESPACE setw(2) << OFstatic_cast(int, c3 & 0x0f);
643                         }
644                         stream  << "' found" << OFStringStream_ends;
645                         OFSTRINGSTREAM_GETOFSTRING(stream, message)
646                         status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotConvertCharacterSet, OF_error, message.c_str());
647                     }
648                     if (status.good())
649                     {
650                         DCMDATA_TRACE("  Switching to character set '" << key << "'");
651                         T_EncodingConvertersMap::const_iterator it = EncodingConverters.find(key);
652                         // check whether the descriptor was found in the map, i.e. properly declared in (0008,0005)
653                         if (it != EncodingConverters.end())
654                         {
655                             converter = it->second;
656                             // special case: these Japanese character sets replace the ASCII part (G0 code area),
657                             // so according to DICOM PS 3.5 Section 6.2.1.2 an explicit switch to the default is required
658                             checkDelimiters = (key != "ISO 2022 IR 87") && (key != "ISO 2022 IR 159");
659                             // determine number of bytes per character (used by the selected character set)
660                             if ((key == "ISO 2022 IR 87") || (key == "ISO 2022 IR 159") || (key == "ISO 2022 IR 58"))
661                             {
662                                 DCMDATA_TRACE("    Now using 2 bytes per character");
663                                 bytesPerChar = 2;
664                             }
665                             else if (key == "ISO 2022 IR 149")
666                             {
667                                 DCMDATA_TRACE("    Now using 1 or 2 bytes per character");
668                                 bytesPerChar = 0;      // special handling for single- and multi-byte
669                             } else {
670                                 DCMDATA_TRACE("    Now using 1 byte per character");
671                                 bytesPerChar = 1;
672                             }
673                         } else {
674                             OFOStringStream stream;
675                             stream << "Cannot convert character set: Escape sequence refers to character set '" << key << "' that "
676                                 "was not declared in SpecificCharacterSet (0008,0005)" << OFStringStream_ends;
677                             OFSTRINGSTREAM_GETOFSTRING(stream, message)
678                             status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotConvertCharacterSet, OF_error, message.c_str());
679                         }
680                     }
681                     pos += escLength;
682                 }
683                 // check whether the escape sequence was complete
684                 if (status.good() && (pos >= fromLength))
685                 {
686                     OFOStringStream stream;
687                     stream << "Cannot convert character set: Incomplete escape sequence (" << (escLength + 1)
688                         << " bytes expected) at the end of the string to be converted" << OFStringStream_ends;
689                     OFSTRINGSTREAM_GETOFSTRING(stream, message)
690                     status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotConvertCharacterSet, OF_error, message.c_str());
691                 }
692                 // do not copy the escape sequence to the output
693                 firstChar = currentChar;
694             }
695             // the HT, LF, FF, CR character or other delimiters (depending on the VR) also cause a switch
696             else if (isDelimiter)
697             {
698                 // output some debug information
699                 DCMDATA_TRACE("    Appending delimiter '"
700                     << convertToLengthLimitedOctalString(currentChar - 1 /* identical to c0 */, 1)
701                     << "' to the output");
702                 // don't forget to append the delimiter
703                 toString += c0;
704                 // use the default descriptor again (see DICOM PS 3.5)
705                 if (converter != DefaultEncodingConverter)
706                 {
707                     DCMDATA_TRACE("  Switching back to the default character set (because a delimiter was found)");
708                     converter = DefaultEncodingConverter;
709                     checkDelimiters = OFTrue;
710                 }
711                 // start new sub-string after delimiter
712                 firstChar = currentChar;
713             }
714             // skip remaining bytes of current character (if any)
715             else if (bytesPerChar != 1)
716             {
717                 const size_t skipBytes = (bytesPerChar > 0) ? (bytesPerChar - 1) : ((c0 & 0x80) ? 1 : 0);
718                 if (pos + skipBytes < fromLength)
719                     currentChar += skipBytes;
720                 pos += skipBytes;
721             }
722             ++pos;
723         }
724         if (status.good())
725         {
726             // convert any remaining characters from the input string
727             const size_t convertLength = currentChar - firstChar;
728             if (convertLength > 0)
729             {
730                 // output some debug information
731                 DCMDATA_TRACE("    Converting remaining sub-string '"
732                     << convertToLengthLimitedOctalString(firstChar, convertLength) << "'");
733                 status = converter.convertString(firstChar, convertLength, toString, OFFalse /*clearMode*/);
734                 if (status.bad())
735                     DCMDATA_TRACE("    -> ERROR: " << status.text());
736             }
737         }
738     }
739     if (status.good())
740     {
741         // finally, output some debug information
742         if (DestinationEncoding == "UTF-8")
743         {
744             // output code points only in case of UTF-8 output
745             DCMDATA_TRACE("Converted result in " << DestinationEncoding << " is '"
746                 << convertToLengthLimitedOctalString(toString.c_str(), toString.length()) << "' ("
747                 << countCharactersInUTF8String(toString) << " code points)");
748         } else {
749             DCMDATA_TRACE("Converted result in " << DestinationEncoding << " is '"
750                 << convertToLengthLimitedOctalString(toString.c_str(), toString.length()) << "'");
751         }
752     }
753     return status;
754 }
755 
756 
isConversionAvailable()757 OFBool DcmSpecificCharacterSet::isConversionAvailable()
758 {
759     // just call the appropriate function from the underlying class
760     return OFCharacterEncoding::isLibraryAvailable();
761 }
762 
763 
countCharactersInUTF8String(const OFString & utf8String)764 size_t DcmSpecificCharacterSet::countCharactersInUTF8String(const OFString &utf8String)
765 {
766     // just call the appropriate function from the underlying class
767     return OFCharacterEncoding::countCharactersInUTF8String(utf8String);
768 }
769 
770 
checkForEscapeCharacter(const char * strValue,const size_t strLength) const771 OFBool DcmSpecificCharacterSet::checkForEscapeCharacter(const char *strValue,
772                                                         const size_t strLength) const
773 {
774     OFBool result = OFFalse;
775     // iterate over the string of characters
776     for (size_t pos = 0; pos < strLength; ++pos)
777     {
778         // and search for the first ESC character
779         if (*strValue++ == '\033')
780         {
781             // then return with "true"
782             result = OFTrue;
783             break;
784         }
785     }
786     return result;
787 }
788 
789 
convertToLengthLimitedOctalString(const char * strValue,const size_t strLength) const790 OFString DcmSpecificCharacterSet::convertToLengthLimitedOctalString(const char *strValue,
791                                                                     const size_t strLength) const
792 {
793     OFString octalString;
794     // convert given string to octal representation, allow one character more than the maximum ...
795     OFStandard::convertToOctalString(OFString(strValue, strLength), octalString, MAX_OUTPUT_STRING_LENGTH + 1);
796     // ... in order to determine whether trailing dots should be added, i.e. the string was cropped
797     if (octalString.length() > MAX_OUTPUT_STRING_LENGTH)
798     {
799         octalString.erase(MAX_OUTPUT_STRING_LENGTH);
800         octalString.append("...");
801     }
802     // return string by-value (in order to avoid another parameter)
803     return octalString;
804 }
805