1 /* 2 * encoding.cpp -- implementation of the encodings shipped with Pire. 3 * 4 * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>, 5 * Alexander Gololobov <agololobov@gmail.com> 6 * 7 * This file is part of Pire, the Perl Incompatible 8 * Regular Expressions library. 9 * 10 * Pire is free software: you can redistribute it and/or modify 11 * it under the terms of the GNU Lesser Public License as published by 12 * the Free Software Foundation, either version 3 of the License, or 13 * (at your option) any later version. 14 * 15 * Pire is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser Public License for more details. 19 * You should have received a copy of the GNU Lesser Public License 20 * along with Pire. If not, see <http://www.gnu.org/licenses>. 21 */ 22 23 24 #include <stdexcept> 25 #include <utility> 26 #include "stub/defaults.h" 27 #include "stub/utf8.h" 28 #include "stub/singleton.h" 29 #include "encoding.h" 30 #include "fsm.h" 31 32 33 namespace Pire { 34 35 namespace { 36 37 class Latin1: public Encoding { 38 public: Latin1()39 Latin1() : Encoding() {} 40 FromLocal(const char * & begin,const char * end) const41 wchar32 FromLocal(const char*& begin, const char* end) const 42 { 43 if (begin == end) 44 throw Error("EOF reached in Pire::Latin1::fromLocal()"); 45 else if (static_cast<unsigned char>(*begin) >= 0x80) 46 throw Error("Pire::Latin1::fromLocal(): wrong character encountered (>=0x80)"); 47 else 48 return (wchar32) *begin++; 49 } 50 ToLocal(wchar32 ch) const51 ystring ToLocal(wchar32 ch) const 52 { 53 if (ch < 0x80) 54 return ystring(1, (char) ch); 55 else 56 return ystring(); 57 } 58 AppendDot(Fsm & fsm) const59 void AppendDot(Fsm& fsm) const { fsm.AppendDot(); } 60 }; 61 62 namespace UtfRanges { 63 64 static const size_t MaxLen = 4; 65 static const size_t First[MaxLen][2] = { 66 {0x00, 0x80}, 67 {0xC0, 0xE0}, 68 {0xE0, 0xF0}, 69 {0xF0, 0xF8} 70 }; 71 static const size_t Next[2] = {0x80, 0xC0}; 72 } 73 74 75 class Utf8: public Encoding { 76 public: Utf8()77 Utf8() : Encoding() {} 78 FromLocal(const char * & begin,const char * end) const79 wchar32 FromLocal(const char*& begin, const char* end) const 80 { 81 wchar32 rune; 82 size_t len; 83 if (utf8_read_rune(rune, len, reinterpret_cast<const unsigned char*>(begin), reinterpret_cast<const unsigned char*>(end)) != RECODE_OK) 84 throw Error("Error reading UTF8 sequence"); 85 begin += len; 86 return rune; 87 } 88 ToLocal(wchar32 c) const89 ystring ToLocal(wchar32 c) const 90 { 91 ystring ret(utf8_rune_len_by_ucs(c), ' '); 92 size_t len; 93 unsigned char* p = (unsigned char*) &*ret.begin(); 94 if (utf8_put_rune(c, len, p, p + ret.size()) != RECODE_OK) 95 Y_ASSERT(!"Pire::UTF8::toLocal(): Internal error"); 96 return ret; 97 } 98 AppendDot(Fsm & fsm) const99 void AppendDot(Fsm& fsm) const 100 { 101 size_t last = fsm.Resize(fsm.Size() + UtfRanges::MaxLen); 102 for (size_t i = 0; i < UtfRanges::MaxLen; ++i) 103 for (size_t letter = UtfRanges::First[i][0]; letter < UtfRanges::First[i][1]; ++letter) 104 fsm.ConnectFinal(fsm.Size() - i - 1, letter); 105 for (size_t i = 0; i < UtfRanges::MaxLen - 1; ++i) 106 for (size_t letter = UtfRanges::Next[0]; letter < UtfRanges::Next[1]; ++letter) 107 fsm.Connect(last + i, last + i + 1, letter); 108 fsm.ClearFinal(); 109 fsm.SetFinal(fsm.Size() - 1, true); 110 fsm.SetIsDetermined(false); 111 } 112 }; 113 } 114 115 namespace Encodings { 116 Utf8()117 const Encoding& Utf8() 118 { 119 static const Pire::Utf8 utf8; 120 return utf8; 121 } 122 Latin1()123 const Encoding& Latin1() 124 { 125 static const Pire::Latin1 latin1; 126 return latin1; 127 } 128 129 } 130 131 } 132