1 /*
2 *
3 * Copyright (C) 2011-2017, OFFIS e.V.
4 * All rights reserved. See COPYRIGHT file for details.
5 *
6 * This software and supporting documentation were developed by
7 *
8 * OFFIS e.V.
9 * R&D Division Health
10 * Escherweg 2
11 * D-26121 Oldenburg, Germany
12 *
13 *
14 * Module: dcmdata
15 *
16 * Author: Joerg Riesmeier
17 *
18 * Purpose: Class for supporting the Specific Character Set attribute
19 *
20 */
21
22
23 #include "dcmtk/config/osconfig.h" /* make sure OS specific configuration is included first */
24
25 #include "dcmtk/dcmdata/dcspchrs.h"
26 #include "dcmtk/dcmdata/dcitem.h"
27 #include "dcmtk/dcmdata/dcbytstr.h"
28 #include "dcmtk/dcmdata/dcdeftag.h"
29 #include "dcmtk/ofstd/ofstream.h"
30 #include "dcmtk/ofstd/ofstd.h"
31
32
33 #define MAX_OUTPUT_STRING_LENGTH 60
34
35
36 /*------------------*
37 * implementation *
38 *------------------*/
39
DcmSpecificCharacterSet()40 DcmSpecificCharacterSet::DcmSpecificCharacterSet()
41 : SourceCharacterSet(),
42 DestinationCharacterSet(),
43 DestinationEncoding(),
44 DefaultEncodingConverter(),
45 EncodingConverters()
46 {
47 }
48
49
~DcmSpecificCharacterSet()50 DcmSpecificCharacterSet::~DcmSpecificCharacterSet()
51 {
52 clear();
53 }
54
55
clear()56 void DcmSpecificCharacterSet::clear()
57 {
58 DefaultEncodingConverter.clear();
59 EncodingConverters.clear();
60 SourceCharacterSet.clear();
61 DestinationCharacterSet.clear();
62 DestinationEncoding.clear();
63 }
64
65
operator OFBool() const66 DcmSpecificCharacterSet::operator OFBool() const
67 {
68 return OFstatic_cast(OFBool, DefaultEncodingConverter);
69 }
70
71
operator !() const72 OFBool DcmSpecificCharacterSet::operator!() const
73 {
74 return !DefaultEncodingConverter;
75 }
76
77
getSourceCharacterSet() const78 const OFString &DcmSpecificCharacterSet::getSourceCharacterSet() const
79 {
80 return SourceCharacterSet;
81 }
82
83
getDestinationCharacterSet() const84 const OFString &DcmSpecificCharacterSet::getDestinationCharacterSet() const
85 {
86 return DestinationCharacterSet;
87 }
88
89
getDestinationEncoding() const90 const OFString &DcmSpecificCharacterSet::getDestinationEncoding() const
91 {
92 return DestinationEncoding;
93 }
94
95
getConversionFlags() const96 unsigned DcmSpecificCharacterSet::getConversionFlags() const
97 {
98 return DefaultEncodingConverter.getConversionFlags();
99 }
100
101
setConversionFlags(const unsigned flags)102 OFCondition DcmSpecificCharacterSet::setConversionFlags(const unsigned flags)
103 {
104 if (!EncodingConverters.empty())
105 {
106 /* pass conversion flags to all "encoding converters" */
107 for (T_EncodingConvertersMap::iterator it = EncodingConverters.begin();
108 it != EncodingConverters.end(); ++it)
109 {
110 OFCondition status = it->second.setConversionFlags(flags);
111 if (status.bad())
112 return status;
113 }
114 return EC_Normal;
115 } else return DefaultEncodingConverter.setConversionFlags(flags);
116 }
117
118
selectCharacterSet(const OFString & fromCharset,const OFString & toCharset)119 OFCondition DcmSpecificCharacterSet::selectCharacterSet(const OFString &fromCharset,
120 const OFString &toCharset)
121 {
122 // first, make sure that all converters are cleared
123 clear();
124 // determine the destination encoding (and check whether it is supported at all)
125 OFCondition status = determineDestinationEncoding(toCharset);
126 if (status.good())
127 {
128 // normalize the given string (original VR is "CS" with VM "1-n")
129 SourceCharacterSet = fromCharset;
130 normalizeString(SourceCharacterSet, MULTIPART, DELETE_LEADING, DELETE_TRAILING);
131 // check whether it is multi-valued
132 const unsigned long sourceVM = DcmElement::determineVM(SourceCharacterSet.c_str(), SourceCharacterSet.length());
133 if (sourceVM == 0)
134 {
135 // no character set specified, use ASCII
136 status = DefaultEncodingConverter.selectEncoding("ASCII", DestinationEncoding);
137 // output some useful debug information
138 if (status.good())
139 {
140 DCMDATA_DEBUG("DcmSpecificCharacterSet: Selected character set '' (ASCII) "
141 << "for the conversion to " << DestinationEncoding);
142 }
143 }
144 else if (sourceVM == 1)
145 {
146 // a single character set specified (no code extensions)
147 status = selectCharacterSetWithoutCodeExtensions();
148 } else {
149 // multiple character sets specified (code extensions used)
150 status = selectCharacterSetWithCodeExtensions(sourceVM);
151 }
152 }
153 return status;
154 }
155
156
selectCharacterSet(DcmItem & dataset,const OFString & toCharset)157 OFCondition DcmSpecificCharacterSet::selectCharacterSet(DcmItem &dataset,
158 const OFString &toCharset)
159 {
160 OFString fromCharset;
161 // check whether Specific Character Set (0008,0005) is present in the given item/dataset
162 dataset.findAndGetOFStringArray(DCM_SpecificCharacterSet, fromCharset, OFFalse /*searchIntoSub*/);
163 // if missing or empty, the default character set (ASCII) will be used
164 return selectCharacterSet(fromCharset, toCharset);
165 }
166
167
determineDestinationEncoding(const OFString & toCharset)168 OFCondition DcmSpecificCharacterSet::determineDestinationEncoding(const OFString &toCharset)
169 {
170 OFCondition status = EC_Normal;
171 // normalize the given string (original VR is "CS" with VM "1-n", but we only support VM "1")
172 DestinationCharacterSet = toCharset;
173 normalizeString(DestinationCharacterSet, !MULTIPART, DELETE_LEADING, DELETE_TRAILING);
174 // there should only be a single character set specified (no code extensions)
175 if (DestinationCharacterSet.empty()) // ASCII (no value)
176 DestinationEncoding = "ASCII";
177 else if (DestinationCharacterSet == "ISO_IR 6") // ASCII
178 {
179 DCMDATA_WARN("DcmSpecificCharacterSet: 'ISO_IR 6' is not a defined term in DICOM, "
180 << "will be treated as an empty value (ASCII)");
181 DestinationCharacterSet.clear();
182 DestinationEncoding = "ASCII";
183 }
184 else if (DestinationCharacterSet == "ISO_IR 100") // Latin alphabet No. 1
185 DestinationEncoding = "ISO-8859-1";
186 else if (DestinationCharacterSet == "ISO_IR 101") // Latin alphabet No. 2
187 DestinationEncoding = "ISO-8859-2";
188 else if (DestinationCharacterSet == "ISO_IR 109") // Latin alphabet No. 3
189 DestinationEncoding = "ISO-8859-3";
190 else if (DestinationCharacterSet == "ISO_IR 110") // Latin alphabet No. 4
191 DestinationEncoding = "ISO-8859-4";
192 else if (DestinationCharacterSet == "ISO_IR 144") // Cyrillic
193 DestinationEncoding = "ISO-8859-5";
194 else if (DestinationCharacterSet == "ISO_IR 127") // Arabic
195 DestinationEncoding = "ISO-8859-6";
196 else if (DestinationCharacterSet == "ISO_IR 126") // Greek
197 DestinationEncoding = "ISO-8859-7";
198 else if (DestinationCharacterSet == "ISO_IR 138") // Hebrew
199 DestinationEncoding = "ISO-8859-8";
200 else if (DestinationCharacterSet == "ISO_IR 148") // Latin alphabet No. 5
201 DestinationEncoding = "ISO-8859-9";
202 else if (DestinationCharacterSet == "ISO_IR 13") // Japanese
203 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICONV
204 DestinationEncoding = "JIS_X0201"; // - the name "ISO-IR-13" is not supported by libiconv
205 #else
206 DestinationEncoding = "Shift_JIS"; // - ICU and stdlibc iconv only know "Shift_JIS" (is this mapping correct?)
207 #endif
208 else if (DestinationCharacterSet == "ISO_IR 166") // Thai
209 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICU
210 DestinationEncoding = "TIS-620"; // - the name "ISO-IR-166" is not supported by ICU
211 #else
212 DestinationEncoding = "ISO-IR-166";
213 #endif
214 else if (DestinationCharacterSet == "ISO_IR 192") // Unicode in UTF-8 (multi-byte)
215 DestinationEncoding = "UTF-8";
216 else if (DestinationCharacterSet == "GB18030") // Chinese (multi-byte)
217 DestinationEncoding = "GB18030";
218 else if (DestinationCharacterSet == "GBK") // Chinese (multi-byte, subset of "GB 18030")
219 DestinationEncoding = "GBK";
220 else {
221 DestinationEncoding.clear();
222 // create an appropriate error code
223 OFOStringStream stream;
224 stream << "Cannot select destination character set: SpecificCharacterSet (0008,0005) value '"
225 << DestinationCharacterSet << "' not supported" << OFStringStream_ends;
226 OFSTRINGSTREAM_GETOFSTRING(stream, message)
227 status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotSelectCharacterSet, OF_error, message.c_str());
228 }
229 return status;
230 }
231
232
selectCharacterSetWithoutCodeExtensions()233 OFCondition DcmSpecificCharacterSet::selectCharacterSetWithoutCodeExtensions()
234 {
235 OFCondition status = EC_Normal;
236 // a single character set specified (no code extensions)
237 OFString fromEncoding;
238 if (SourceCharacterSet == "ISO_IR 6") // ASCII
239 {
240 DCMDATA_WARN("DcmSpecificCharacterSet: 'ISO_IR 6' is not a defined term in DICOM, "
241 << "will be treated as an empty value (ASCII)");
242 SourceCharacterSet.clear();
243 fromEncoding = "ASCII";
244 }
245 else if (SourceCharacterSet == "ISO_IR 100") // Latin alphabet No. 1
246 fromEncoding = "ISO-8859-1";
247 else if (SourceCharacterSet == "ISO_IR 101") // Latin alphabet No. 2
248 fromEncoding = "ISO-8859-2";
249 else if (SourceCharacterSet == "ISO_IR 109") // Latin alphabet No. 3
250 fromEncoding = "ISO-8859-3";
251 else if (SourceCharacterSet == "ISO_IR 110") // Latin alphabet No. 4
252 fromEncoding = "ISO-8859-4";
253 else if (SourceCharacterSet == "ISO_IR 144") // Cyrillic
254 fromEncoding = "ISO-8859-5";
255 else if (SourceCharacterSet == "ISO_IR 127") // Arabic
256 fromEncoding = "ISO-8859-6";
257 else if (SourceCharacterSet == "ISO_IR 126") // Greek
258 fromEncoding = "ISO-8859-7";
259 else if (SourceCharacterSet == "ISO_IR 138") // Hebrew
260 fromEncoding = "ISO-8859-8";
261 else if (SourceCharacterSet == "ISO_IR 148") // Latin alphabet No. 5
262 fromEncoding = "ISO-8859-9";
263 else if (SourceCharacterSet == "ISO_IR 13") // Japanese
264 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICONV
265 fromEncoding = "JIS_X0201"; // - the name "ISO-IR-13" is not supported by libiconv
266 #else
267 fromEncoding = "Shift_JIS"; // - ICU and stdlibc iconv only know "Shift_JIS" (is this mapping correct?)
268 #endif
269 else if (SourceCharacterSet == "ISO_IR 166") // Thai
270 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICU
271 fromEncoding = "TIS-620"; // - the name "ISO-IR-166" is not supported by ICU
272 #else
273 fromEncoding = "ISO-IR-166";
274 #endif
275 else if (SourceCharacterSet == "ISO_IR 192") // Unicode in UTF-8 (multi-byte)
276 fromEncoding = "UTF-8";
277 else if (SourceCharacterSet == "GB18030") // Chinese (multi-byte)
278 fromEncoding = "GB18030";
279 else if (SourceCharacterSet == "GBK") // Chinese (multi-byte, subset of "GB 18030")
280 fromEncoding = "GBK";
281 else {
282 // create an appropriate error code
283 OFOStringStream stream;
284 stream << "Cannot select source character set: SpecificCharacterSet (0008,0005) value '"
285 << SourceCharacterSet << "' not supported" << OFStringStream_ends;
286 OFSTRINGSTREAM_GETOFSTRING(stream, message)
287 status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotSelectCharacterSet, OF_error, message.c_str());
288 }
289 // check whether an appropriate character encoding has been found
290 if (!fromEncoding.empty())
291 {
292 status = DefaultEncodingConverter.selectEncoding(fromEncoding, DestinationEncoding);
293 // output some useful debug information
294 if (status.good())
295 {
296 DCMDATA_DEBUG("DcmSpecificCharacterSet: Selected character set '" << SourceCharacterSet
297 << "' (" << fromEncoding << ") for the conversion to " << DestinationEncoding);
298 }
299 }
300 return status;
301 }
302
303
selectCharacterSetWithCodeExtensions(const unsigned long sourceVM)304 OFCondition DcmSpecificCharacterSet::selectCharacterSetWithCodeExtensions(const unsigned long sourceVM)
305 {
306 // first, check whether multiple character sets are specified (i.e. code extensions used)
307 if (sourceVM <= 1)
308 return EC_IllegalCall;
309 // then proceed with the real work
310 OFCondition status = EC_Normal;
311 size_t pos = 0;
312 OFBool needsASCII = OFFalse;
313 OFBool notFirstValue = OFFalse;
314 OFString definedTerm;
315 unsigned long i = 0;
316 while ((i < sourceVM) && status.good())
317 {
318 // extract single value from string (separated by a backslash)
319 pos = DcmElement::getValueFromString(SourceCharacterSet.c_str(), pos, SourceCharacterSet.length(), definedTerm);
320 if (definedTerm.empty() && (i == 0)) // assuming ASCII (according to DICOM PS 3.5)
321 definedTerm = "ISO 2022 IR 6";
322 // determine character encoding from DICOM defined term
323 OFString encodingName;
324 if (definedTerm == "ISO 2022 IR 6") // ASCII
325 encodingName = "ASCII";
326 else if (definedTerm == "ISO 2022 IR 100") // Latin alphabet No. 1
327 {
328 encodingName = "ISO-8859-1";
329 needsASCII = OFTrue;
330 }
331 else if (definedTerm == "ISO 2022 IR 101") // Latin alphabet No. 2
332 {
333 encodingName = "ISO-8859-2";
334 needsASCII = OFTrue;
335 }
336 else if (definedTerm == "ISO 2022 IR 109") // Latin alphabet No. 3
337 {
338 encodingName = "ISO-8859-3";
339 needsASCII = OFTrue;
340 }
341 else if (definedTerm == "ISO 2022 IR 110") // Latin alphabet No. 4
342 {
343 encodingName = "ISO-8859-4";
344 needsASCII = OFTrue;
345 }
346 else if (definedTerm == "ISO 2022 IR 144") // Cyrillic
347 {
348 encodingName = "ISO-8859-5";
349 needsASCII = OFTrue;
350 }
351 else if (definedTerm == "ISO 2022 IR 127") // Arabic
352 {
353 encodingName = "ISO-8859-6";
354 needsASCII = OFTrue;
355 }
356 else if (definedTerm == "ISO 2022 IR 126") // Greek
357 {
358 encodingName = "ISO-8859-7";
359 needsASCII = OFTrue;
360 }
361 else if (definedTerm == "ISO 2022 IR 138") // Hebrew
362 {
363 encodingName = "ISO-8859-8";
364 needsASCII = OFTrue;
365 }
366 else if (definedTerm == "ISO 2022 IR 148") // Latin alphabet No. 5
367 {
368 encodingName = "ISO-8859-9";
369 needsASCII = OFTrue;
370 }
371 else if (definedTerm == "ISO 2022 IR 13") // Japanese
372 {
373 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICONV
374 encodingName = "JIS_X0201"; // - the name "ISO-IR-13" is not supported by libiconv
375 #else
376 encodingName = "Shift_JIS"; // - ICU and stdlibc iconv only know "Shift_JIS" (is this mapping correct?)
377 #endif
378 }
379 else if (definedTerm == "ISO 2022 IR 166") // Thai
380 {
381 #if DCMTK_ENABLE_CHARSET_CONVERSION == DCMTK_CHARSET_CONVERSION_ICU
382 encodingName = "TIS-620"; // - "ISO-IR-166" is not supported by ICU
383 #else
384 encodingName = "ISO-IR-166";
385 #endif
386 needsASCII = OFTrue;
387 }
388 else if (definedTerm == "ISO 2022 IR 87") // Japanese (multi-byte)
389 {
390 encodingName = "ISO-IR-87"; // - this might generate an error since "ISO-IR-87" is not supported by ICU and stdlibc iconv
391 notFirstValue = OFTrue;
392 }
393 else if (definedTerm == "ISO 2022 IR 159") // Japanese (multi-byte)
394 {
395 encodingName = "ISO-IR-159"; // - this might generate an error since "ISO-IR-159" is not supported by ICU and stdlibc iconv
396 notFirstValue = OFTrue;
397 }
398 else if (definedTerm == "ISO 2022 IR 149") // Korean (multi-byte)
399 {
400 encodingName = "EUC-KR"; // - is this mapping really correct?
401 notFirstValue = OFTrue; // "ISO-IR-149" does not work with the sample from DICOM PS 3.5
402 }
403 else if (definedTerm == "ISO 2022 IR 58") // Simplified Chinese (multi-byte)
404 {
405 encodingName = "GB2312"; // - should work, but not tested yet!
406 notFirstValue = OFTrue;
407 }
408 else {
409 // create an appropriate error code
410 OFOStringStream stream;
411 stream << "Cannot select source character set: SpecificCharacterSet (0008,0005) value " << (i + 1)
412 << " of " << sourceVM << " '" << definedTerm << "' not supported" << OFStringStream_ends;
413 OFSTRINGSTREAM_GETOFSTRING(stream, message)
414 status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotSelectCharacterSet, OF_error, message.c_str());
415 }
416 // check whether character set is allowed as the default (first value)
417 if ((i == 0) && notFirstValue)
418 {
419 OFOStringStream stream;
420 stream << "Cannot select source character set: '" << definedTerm << "' is not a allowed "
421 << "as the first value in SpecificCharacterSet (0008,0005)" << OFStringStream_ends;
422 OFSTRINGSTREAM_GETOFSTRING(stream, message)
423 status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotSelectCharacterSet, OF_error, message.c_str());
424 }
425 // add descriptor to the map using the defined term as a key
426 if (status.good() && !encodingName.empty())
427 {
428 OFPair<T_EncodingConvertersMap::iterator,OFBool> conv = EncodingConverters.insert(
429 OFMake_pair(definedTerm, OFCharacterEncoding()));
430 // but first check whether this encoding has already been added before
431 if (conv.second)
432 {
433 status = conv.first->second.selectEncoding(encodingName, DestinationEncoding);
434 if (status.good())
435 {
436 // output some useful debug information
437 DCMDATA_DEBUG("DcmSpecificCharacterSet: Added character set '" << definedTerm
438 << "' (" << encodingName << ") for the conversion to " << DestinationEncoding);
439 // also remember the default descriptor, which refers to the first character set
440 if (i == 0)
441 {
442 DefaultEncodingConverter = conv.first->second;
443 DCMDATA_TRACE("DcmSpecificCharacterSet: Also selected this character set "
444 << "(i.e. '" << definedTerm << "') as the default one");
445 }
446 } else {
447 DCMDATA_ERROR("DcmSpecificCharacterSet: '" << definedTerm <<
448 "' is not supported by the utilized character set conversion library '"
449 << OFCharacterEncoding::getLibraryVersionString() << '\'');
450 EncodingConverters.erase(conv.first);
451 }
452 } else {
453 DCMDATA_WARN("DcmSpecificCharacterSet: '" << definedTerm << "' is defined more than once "
454 << "in SpecificCharacterSet (0008,0005), ignoring the duplicate definition");
455 }
456 }
457 ++i;
458 }
459 // add ASCII to the map if needed but not already there
460 if (status.good() && needsASCII)
461 {
462 OFPair<T_EncodingConvertersMap::iterator,OFBool> conv = EncodingConverters.insert(
463 OFMake_pair(OFString("ISO 2022 IR 6"), OFCharacterEncoding()));
464 if (conv.second)
465 {
466 status = conv.first->second.selectEncoding("ASCII", DestinationEncoding);
467 if (status.good())
468 {
469 // output some useful debug information
470 DCMDATA_DEBUG("DcmSpecificCharacterSet: Added character set 'ISO 2022 IR 6' (ASCII) "
471 << "for the conversion to " << DestinationEncoding
472 << " (because it is needed for one or more of the previously added character sets)");
473 } else {
474 DCMDATA_ERROR("DcmSpecificCharacterSet: 'ISO 2022 IR 6' is not supported by"
475 << " the utilized character set conversion library '"
476 << OFCharacterEncoding::getLibraryVersionString() << '\'');
477 EncodingConverters.erase(conv.first);
478 }
479 }
480 }
481 return status;
482 }
483
484
convertString(const OFString & fromString,OFString & toString,const OFString & delimiters)485 OFCondition DcmSpecificCharacterSet::convertString(const OFString &fromString,
486 OFString &toString,
487 const OFString &delimiters)
488 {
489 // call the real method converting the given string
490 return convertString(fromString.c_str(), fromString.length(), toString, delimiters);
491 }
492
493
convertString(const char * fromString,const size_t fromLength,OFString & toString,const OFString & delimiters)494 OFCondition DcmSpecificCharacterSet::convertString(const char *fromString,
495 const size_t fromLength,
496 OFString &toString,
497 const OFString &delimiters)
498 {
499 OFCondition status = EC_Normal;
500 // check whether there are any code extensions at all
501 if (EncodingConverters.empty() || !checkForEscapeCharacter(fromString, fromLength))
502 {
503 DCMDATA_DEBUG("DcmSpecificCharacterSet: Converting '"
504 << convertToLengthLimitedOctalString(fromString, fromLength) << "'");
505 // no code extensions according to ISO 2022 used - this is the simple case
506 status = DefaultEncodingConverter.convertString(fromString, fromLength, toString, OFTrue /*clearMode*/);
507 } else {
508 if (delimiters.empty())
509 {
510 DCMDATA_DEBUG("DcmSpecificCharacterSet: Converting '"
511 << convertToLengthLimitedOctalString(fromString, fromLength)
512 << "' (with code extensions)");
513 } else {
514 DCMDATA_DEBUG("DcmSpecificCharacterSet: Converting '"
515 << convertToLengthLimitedOctalString(fromString, fromLength)
516 << "' (with code extensions and delimiters '" << delimiters << "')");
517 }
518 // code extensions according to ISO 2022 used, so we need to check for
519 // particular escape sequences in order to switch between character sets
520 toString.clear();
521 size_t pos = 0;
522 // some (extended) character sets use more than 1 byte per character
523 // (however, the default character set always uses a single byte)
524 unsigned char bytesPerChar = 1;
525 // check whether '=' is a delimiter, as it is used in PN values
526 OFBool isFirstGroup = (delimiters.find('=') != OFString_npos);
527 // by default, we expect that delimiters can be checked by their corresponding ASCII codes
528 // (this implies that the default character set is not "ISO 2022 IR 87" or "ISO 2022 IR 159")
529 OFBool checkDelimiters = OFTrue;
530 const char *firstChar = fromString;
531 const char *currentChar = fromString;
532 // initially, use the default descriptor
533 OFCharacterEncoding converter = DefaultEncodingConverter;
534 DCMDATA_TRACE(" Starting with the default character set");
535 // iterate over all characters of the string (as long as there is no error)
536 while ((pos < fromLength) && status.good())
537 {
538 const char c0 = *currentChar++;
539 // check for characters ESC, HT, LF, FF, CR or any other specified delimiter
540 const OFBool isEscape = (c0 == '\033');
541 const OFBool isDelimiter = checkDelimiters &&
542 ((c0 == '\011') || (c0 == '\012') || (c0 == '\014') || (c0 == '\015') || (delimiters.find(c0) != OFString_npos));
543 if (isEscape || isDelimiter)
544 {
545 // convert the sub-string (before the delimiter) with the current character set
546 const size_t convertLength = currentChar - firstChar - 1;
547 if (convertLength > 0)
548 {
549 // output some debug information
550 DCMDATA_TRACE(" Converting sub-string '"
551 << convertToLengthLimitedOctalString(firstChar, convertLength) << "'");
552 status = converter.convertString(firstChar, convertLength, toString, OFFalse /*clearMode*/);
553 if (status.bad())
554 DCMDATA_TRACE(" -> ERROR: " << status.text());
555 }
556 // check whether this was the first component group of a PN value
557 if (isDelimiter && (c0 == '='))
558 isFirstGroup = OFFalse;
559 }
560 // the ESC character is used to explicitly switch between character sets
561 if (isEscape)
562 {
563 // report a warning as this is a violation of DICOM PS 3.5 Section 6.2.1
564 if (isFirstGroup)
565 {
566 DCMDATA_WARN("DcmSpecificCharacterSet: Escape sequences shall not be used "
567 << "in the first component group of a Person Name (PN), using them anyway");
568 }
569 // we need at least two more characters to determine the new character set
570 size_t escLength = 2;
571 if (pos + escLength < fromLength)
572 {
573 OFString key;
574 const char c1 = *currentChar++;
575 const char c2 = *currentChar++;
576 char c3 = '\0';
577 if ((c1 == 0x28) && (c2 == 0x42)) // ASCII
578 key = "ISO 2022 IR 6";
579 else if ((c1 == 0x2d) && (c2 == 0x41)) // Latin alphabet No. 1
580 key = "ISO 2022 IR 100";
581 else if ((c1 == 0x2d) && (c2 == 0x42)) // Latin alphabet No. 2
582 key = "ISO 2022 IR 101";
583 else if ((c1 == 0x2d) && (c2 == 0x43)) // Latin alphabet No. 3
584 key = "ISO 2022 IR 109";
585 else if ((c1 == 0x2d) && (c2 == 0x44)) // Latin alphabet No. 4
586 key = "ISO 2022 IR 110";
587 else if ((c1 == 0x2d) && (c2 == 0x4c)) // Cyrillic
588 key = "ISO 2022 IR 144";
589 else if ((c1 == 0x2d) && (c2 == 0x47)) // Arabic
590 key = "ISO 2022 IR 127";
591 else if ((c1 == 0x2d) && (c2 == 0x46)) // Greek
592 key = "ISO 2022 IR 126";
593 else if ((c1 == 0x2d) && (c2 == 0x48)) // Hebrew
594 key = "ISO 2022 IR 138";
595 else if ((c1 == 0x2d) && (c2 == 0x4d)) // Latin alphabet No. 5
596 key = "ISO 2022 IR 148";
597 else if ((c1 == 0x29) && (c2 == 0x49)) // Japanese
598 key = "ISO 2022 IR 13";
599 else if ((c1 == 0x28) && (c2 == 0x4a)) // Japanese - is this really correct?
600 key = "ISO 2022 IR 13";
601 else if ((c1 == 0x2d) && (c2 == 0x54)) // Thai
602 key = "ISO 2022 IR 166";
603 else if ((c1 == 0x24) && (c2 == 0x42)) // Japanese (multi-byte)
604 key = "ISO 2022 IR 87";
605 else if ((c1 == 0x24) && (c2 == 0x28)) // Japanese (multi-byte)
606 {
607 escLength = 3;
608 // do we still have another character in the string?
609 if (pos + escLength < fromLength)
610 {
611 c3 = *currentChar++;
612 if (c3 == 0x44)
613 key = "ISO 2022 IR 159";
614 }
615 }
616 else if ((c1 == 0x24) && (c2 == 0x29)) // might be Korean or Chinese
617 {
618 escLength = 3;
619 // do we still have another character in the string?
620 if (pos + escLength < fromLength)
621 {
622 c3 = *currentChar++;
623 if (c3 == 0x43) // Korean (single- and multi-byte)
624 key = "ISO 2022 IR 149";
625 else if (c3 == 0x41) // Simplified Chinese (multi-byte)
626 key = "ISO 2022 IR 58";
627 }
628 }
629 // check whether a valid escape sequence has been found
630 if (key.empty())
631 {
632 OFOStringStream stream;
633 stream << "Cannot convert character set: Illegal escape sequence 'ESC "
634 << STD_NAMESPACE dec << STD_NAMESPACE setfill('0')
635 << STD_NAMESPACE setw(2) << OFstatic_cast(int, c1 >> 4) << "/"
636 << STD_NAMESPACE setw(2) << OFstatic_cast(int, c1 & 0x0f) << " "
637 << STD_NAMESPACE setw(2) << OFstatic_cast(int, c2 >> 4) << "/"
638 << STD_NAMESPACE setw(2) << OFstatic_cast(int, c2 & 0x0f);
639 if (escLength == 3)
640 {
641 stream << " " << STD_NAMESPACE setw(2) << OFstatic_cast(int, c3 >> 4) << "/"
642 << STD_NAMESPACE setw(2) << OFstatic_cast(int, c3 & 0x0f);
643 }
644 stream << "' found" << OFStringStream_ends;
645 OFSTRINGSTREAM_GETOFSTRING(stream, message)
646 status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotConvertCharacterSet, OF_error, message.c_str());
647 }
648 if (status.good())
649 {
650 DCMDATA_TRACE(" Switching to character set '" << key << "'");
651 T_EncodingConvertersMap::const_iterator it = EncodingConverters.find(key);
652 // check whether the descriptor was found in the map, i.e. properly declared in (0008,0005)
653 if (it != EncodingConverters.end())
654 {
655 converter = it->second;
656 // special case: these Japanese character sets replace the ASCII part (G0 code area),
657 // so according to DICOM PS 3.5 Section 6.2.1.2 an explicit switch to the default is required
658 checkDelimiters = (key != "ISO 2022 IR 87") && (key != "ISO 2022 IR 159");
659 // determine number of bytes per character (used by the selected character set)
660 if ((key == "ISO 2022 IR 87") || (key == "ISO 2022 IR 159") || (key == "ISO 2022 IR 58"))
661 {
662 DCMDATA_TRACE(" Now using 2 bytes per character");
663 bytesPerChar = 2;
664 }
665 else if (key == "ISO 2022 IR 149")
666 {
667 DCMDATA_TRACE(" Now using 1 or 2 bytes per character");
668 bytesPerChar = 0; // special handling for single- and multi-byte
669 } else {
670 DCMDATA_TRACE(" Now using 1 byte per character");
671 bytesPerChar = 1;
672 }
673 } else {
674 OFOStringStream stream;
675 stream << "Cannot convert character set: Escape sequence refers to character set '" << key << "' that "
676 "was not declared in SpecificCharacterSet (0008,0005)" << OFStringStream_ends;
677 OFSTRINGSTREAM_GETOFSTRING(stream, message)
678 status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotConvertCharacterSet, OF_error, message.c_str());
679 }
680 }
681 pos += escLength;
682 }
683 // check whether the escape sequence was complete
684 if (status.good() && (pos >= fromLength))
685 {
686 OFOStringStream stream;
687 stream << "Cannot convert character set: Incomplete escape sequence (" << (escLength + 1)
688 << " bytes expected) at the end of the string to be converted" << OFStringStream_ends;
689 OFSTRINGSTREAM_GETOFSTRING(stream, message)
690 status = makeOFCondition(OFM_dcmdata, EC_CODE_CannotConvertCharacterSet, OF_error, message.c_str());
691 }
692 // do not copy the escape sequence to the output
693 firstChar = currentChar;
694 }
695 // the HT, LF, FF, CR character or other delimiters (depending on the VR) also cause a switch
696 else if (isDelimiter)
697 {
698 // output some debug information
699 DCMDATA_TRACE(" Appending delimiter '"
700 << convertToLengthLimitedOctalString(currentChar - 1 /* identical to c0 */, 1)
701 << "' to the output");
702 // don't forget to append the delimiter
703 toString += c0;
704 // use the default descriptor again (see DICOM PS 3.5)
705 if (converter != DefaultEncodingConverter)
706 {
707 DCMDATA_TRACE(" Switching back to the default character set (because a delimiter was found)");
708 converter = DefaultEncodingConverter;
709 checkDelimiters = OFTrue;
710 }
711 // start new sub-string after delimiter
712 firstChar = currentChar;
713 }
714 // skip remaining bytes of current character (if any)
715 else if (bytesPerChar != 1)
716 {
717 const size_t skipBytes = (bytesPerChar > 0) ? (bytesPerChar - 1) : ((c0 & 0x80) ? 1 : 0);
718 if (pos + skipBytes < fromLength)
719 currentChar += skipBytes;
720 pos += skipBytes;
721 }
722 ++pos;
723 }
724 if (status.good())
725 {
726 // convert any remaining characters from the input string
727 const size_t convertLength = currentChar - firstChar;
728 if (convertLength > 0)
729 {
730 // output some debug information
731 DCMDATA_TRACE(" Converting remaining sub-string '"
732 << convertToLengthLimitedOctalString(firstChar, convertLength) << "'");
733 status = converter.convertString(firstChar, convertLength, toString, OFFalse /*clearMode*/);
734 if (status.bad())
735 DCMDATA_TRACE(" -> ERROR: " << status.text());
736 }
737 }
738 }
739 if (status.good())
740 {
741 // finally, output some debug information
742 if (DestinationEncoding == "UTF-8")
743 {
744 // output code points only in case of UTF-8 output
745 DCMDATA_TRACE("Converted result in " << DestinationEncoding << " is '"
746 << convertToLengthLimitedOctalString(toString.c_str(), toString.length()) << "' ("
747 << countCharactersInUTF8String(toString) << " code points)");
748 } else {
749 DCMDATA_TRACE("Converted result in " << DestinationEncoding << " is '"
750 << convertToLengthLimitedOctalString(toString.c_str(), toString.length()) << "'");
751 }
752 }
753 return status;
754 }
755
756
isConversionAvailable()757 OFBool DcmSpecificCharacterSet::isConversionAvailable()
758 {
759 // just call the appropriate function from the underlying class
760 return OFCharacterEncoding::isLibraryAvailable();
761 }
762
763
countCharactersInUTF8String(const OFString & utf8String)764 size_t DcmSpecificCharacterSet::countCharactersInUTF8String(const OFString &utf8String)
765 {
766 // just call the appropriate function from the underlying class
767 return OFCharacterEncoding::countCharactersInUTF8String(utf8String);
768 }
769
770
checkForEscapeCharacter(const char * strValue,const size_t strLength) const771 OFBool DcmSpecificCharacterSet::checkForEscapeCharacter(const char *strValue,
772 const size_t strLength) const
773 {
774 OFBool result = OFFalse;
775 // iterate over the string of characters
776 for (size_t pos = 0; pos < strLength; ++pos)
777 {
778 // and search for the first ESC character
779 if (*strValue++ == '\033')
780 {
781 // then return with "true"
782 result = OFTrue;
783 break;
784 }
785 }
786 return result;
787 }
788
789
convertToLengthLimitedOctalString(const char * strValue,const size_t strLength) const790 OFString DcmSpecificCharacterSet::convertToLengthLimitedOctalString(const char *strValue,
791 const size_t strLength) const
792 {
793 OFString octalString;
794 // convert given string to octal representation, allow one character more than the maximum ...
795 OFStandard::convertToOctalString(OFString(strValue, strLength), octalString, MAX_OUTPUT_STRING_LENGTH + 1);
796 // ... in order to determine whether trailing dots should be added, i.e. the string was cropped
797 if (octalString.length() > MAX_OUTPUT_STRING_LENGTH)
798 {
799 octalString.erase(MAX_OUTPUT_STRING_LENGTH);
800 octalString.append("...");
801 }
802 // return string by-value (in order to avoid another parameter)
803 return octalString;
804 }
805