1// Created on: 2013-01-28 2// Created by: Kirill GAVRILOV 3// Copyright (c) 2013-2014 OPEN CASCADE SAS 4// 5// This file is part of Open CASCADE Technology software library. 6// 7// This library is free software; you can redistribute it and/or modify it under 8// the terms of the GNU Lesser General Public License version 2.1 as published 9// by the Free Software Foundation, with special exception defined in the file 10// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT 11// distribution for complete text of the license and disclaimer of any warranty. 12// 13// Alternatively, this file may be used under the terms of Open CASCADE 14// commercial license or contractual agreement. 15 16// Portions of code are copyrighted by Unicode, Inc. 17// 18// Copyright (c) 2001-2004 Unicode, Inc. 19// 20// Disclaimer 21// 22// This source code is provided as is by Unicode, Inc. No claims are 23// made as to fitness for any particular purpose. No warranties of any 24// kind are expressed or implied. The recipient agrees to determine 25// applicability of information provided. If this file has been 26// purchased on magnetic or optical media from Unicode, Inc., the 27// sole remedy for any claim will be exchange of defective media 28// within 90 days of receipt. 29// 30// Limitations on Rights to Redistribute This Code 31// 32// Unicode, Inc. hereby grants the right to freely use the information 33// supplied in this file in the creation of products supporting the 34// Unicode Standard, and to make copies of this file in any form 35// for internal or external distribution as long as this notice 36// remains attached. 37 38//! The first character in a UTF-8 sequence indicates how many bytes 39//! to read (among other things). 40template<typename Type> 41const unsigned char NCollection_UtfIterator<Type>::UTF8_BYTES_MINUS_ONE[256] = 42{ 43 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 44 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 45 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 47 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 49 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 50 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 51}; 52 53//! Magic values subtracted from a buffer value during UTF-8 conversion. 54//! This table contains as many values as there might be trailing bytes 55//! in a UTF-8 sequence. 56template<typename Type> 57const unsigned long NCollection_UtfIterator<Type>::offsetsFromUTF8[6] = 58{ 59 0x00000000UL, 0x00003080UL, 0x000E2080UL, 60 0x03C82080UL, 0xFA082080UL, 0x82082080UL 61}; 62 63//! The first character in a UTF-8 sequence indicates how many bytes to read. 64template<typename Type> 65const unsigned char NCollection_UtfIterator<Type>::UTF8_FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 66 67// ======================================================================= 68// function : readUTF8 69// purpose : Get a UTF-8 character; leave the tracking pointer at the start of the next character. 70// Not protected against invalid UTF-8. 71// ======================================================================= 72template<typename Type> 73inline void NCollection_UtfIterator<Type>::readUTF8() 74{ 75 // unsigned arithmetic used 76 Standard_Utf8UChar* aPos = (Standard_Utf8UChar* )myPosNext; 77 const unsigned char aBytesToRead = UTF8_BYTES_MINUS_ONE[*aPos]; 78 myCharUtf32 = 0; 79 switch (aBytesToRead) 80 { 81 case 5: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8 82 Standard_FALLTHROUGH 83 case 4: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8 84 Standard_FALLTHROUGH 85 case 3: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; 86 Standard_FALLTHROUGH 87 case 2: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; 88 Standard_FALLTHROUGH 89 case 1: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; 90 Standard_FALLTHROUGH 91 case 0: myCharUtf32 += *aPos++; 92 } 93 myCharUtf32 -= offsetsFromUTF8[aBytesToRead]; 94 myPosNext = (Type* )aPos; 95} 96 97// magic numbers 98template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MASK = 0xBF; 99template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MARK = 0x80; 100template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_START = 0xD800; 101template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_END = 0xDBFF; 102template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_START = 0xDC00; 103template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_END = 0xDFFF; 104template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_SHIFT = 10; 105template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_BASE = 0x0010000UL; 106template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_MASK = 0x3FFUL; 107template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_BMP = 0x0000FFFFUL; 108template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_LEGAL = 0x0010FFFFUL; 109 110// ======================================================================= 111// function : readUTF16 112// purpose : 113// ======================================================================= 114template<typename Type> inline 115void NCollection_UtfIterator<Type>::readUTF16() 116{ 117 Standard_Utf32Char aChar = *myPosNext++; 118 // if we have the first half of the surrogate pair 119 if (aChar >= UTF16_SURROGATE_HIGH_START 120 && aChar <= UTF16_SURROGATE_HIGH_END) 121 { 122 const Standard_Utf32Char aChar2 = *myPosNext; 123 // complete the surrogate pair 124 if (aChar2 >= UTF16_SURROGATE_LOW_START 125 && aChar2 <= UTF16_SURROGATE_LOW_END) 126 { 127 aChar = ((aChar - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT) 128 + (aChar2 - UTF16_SURROGATE_LOW_START) + UTF16_SURROGATE_LOW_BASE; 129 ++myPosNext; 130 } 131 } 132 myCharUtf32 = aChar; 133} 134 135// ======================================================================= 136// function : AdvanceBytesUtf8 137// purpose : 138// ======================================================================= 139template<typename Type> inline 140Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf8() const 141{ 142 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START 143 && myCharUtf32 <= UTF16_SURROGATE_LOW_END) 144 { 145 // UTF-16 surrogate values are illegal in UTF-32 146 return 0; 147 } 148 else if (myCharUtf32 < Standard_Utf32Char(0x80)) 149 { 150 return 1; 151 } 152 else if (myCharUtf32 < Standard_Utf32Char(0x800)) 153 { 154 return 2; 155 } 156 else if (myCharUtf32 < Standard_Utf32Char(0x10000)) 157 { 158 return 3; 159 } 160 else if (myCharUtf32 <= UTF32_MAX_LEGAL) 161 { 162 return 4; 163 } 164 else 165 { 166 // illegal 167 return 0; 168 } 169} 170 171// ======================================================================= 172// function : GetUtf8 173// purpose : 174// ======================================================================= 175template<typename Type> inline 176Standard_Utf8Char* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8Char* theBuffer) const 177{ 178 // unsigned arithmetic used 179 return (Standard_Utf8Char* )GetUtf8 ((Standard_Utf8UChar* )theBuffer); 180} 181 182// ======================================================================= 183// function : GetUtf8 184// purpose : 185// ======================================================================= 186template<typename Type> inline 187Standard_Utf8UChar* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8UChar* theBuffer) const 188{ 189 Standard_Utf32Char aChar = myCharUtf32; 190 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START 191 && myCharUtf32 <= UTF16_SURROGATE_LOW_END) 192 { 193 // UTF-16 surrogate values are illegal in UTF-32 194 return theBuffer; 195 } 196 else if (myCharUtf32 < Standard_Utf32Char(0x80)) 197 { 198 *theBuffer++ = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[1]); 199 return theBuffer; 200 } 201 else if (myCharUtf32 < Standard_Utf32Char(0x800)) 202 { 203 *++theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; 204 *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[2]); 205 return theBuffer + 2; 206 } 207 else if (myCharUtf32 < Standard_Utf32Char(0x10000)) 208 { 209 theBuffer += 3; 210 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; 211 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; 212 *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[3]); 213 return theBuffer + 3; 214 } 215 else if (myCharUtf32 <= UTF32_MAX_LEGAL) 216 { 217 theBuffer += 4; 218 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; 219 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; 220 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; 221 *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[4]); 222 return theBuffer + 4; 223 } 224 else 225 { 226 // illegal 227 return theBuffer; 228 } 229} 230 231// ======================================================================= 232// function : AdvanceBytesUtf16 233// purpose : 234// ======================================================================= 235template<typename Type> inline 236Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf16() const 237{ 238 return AdvanceCodeUnitsUtf16() * sizeof(Standard_Utf16Char); 239} 240 241// ======================================================================= 242// function : AdvanceCodeUnitsUtf16 243// purpose : 244// ======================================================================= 245template<typename Type> inline 246Standard_Integer NCollection_UtfIterator<Type>::AdvanceCodeUnitsUtf16() const 247{ 248 if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF 249 { 250 // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values 251 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START 252 && myCharUtf32 <= UTF16_SURROGATE_LOW_END) 253 { 254 return 0; 255 } 256 else 257 { 258 return 1; 259 } 260 } 261 else if (myCharUtf32 > UTF32_MAX_LEGAL) 262 { 263 // illegal 264 return 0; 265 } 266 else 267 { 268 // target is a character in range 0xFFFF - 0x10FFFF 269 // surrogate pair 270 return 2; 271 } 272} 273 274// ======================================================================= 275// function : GetUtf16 276// purpose : 277// ======================================================================= 278template<typename Type> inline 279Standard_Utf16Char* NCollection_UtfIterator<Type>::GetUtf16 (Standard_Utf16Char* theBuffer) const 280{ 281 if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF 282 { 283 // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values 284 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START 285 && myCharUtf32 <= UTF16_SURROGATE_LOW_END) 286 { 287 return theBuffer; 288 } 289 else 290 { 291 *theBuffer++ = Standard_Utf16Char(myCharUtf32); 292 return theBuffer; 293 } 294 } 295 else if (myCharUtf32 > UTF32_MAX_LEGAL) 296 { 297 // illegal 298 return theBuffer; 299 } 300 else 301 { 302 // surrogate pair 303 Standard_Utf32Char aChar = myCharUtf32 - UTF16_SURROGATE_LOW_BASE; 304 *theBuffer++ = Standard_Utf16Char((aChar >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START); 305 *theBuffer++ = Standard_Utf16Char((aChar & UTF16_SURROGATE_LOW_MASK) + UTF16_SURROGATE_LOW_START); 306 return theBuffer; 307 } 308} 309 310// ======================================================================= 311// function : GetUtf32 312// purpose : 313// ======================================================================= 314template<typename Type> inline 315Standard_Utf32Char* NCollection_UtfIterator<Type>::GetUtf32 (Standard_Utf32Char* theBuffer) const 316{ 317 *theBuffer++ = myCharUtf32; 318 return theBuffer; 319} 320