1 /*
2  * encoding.cpp -- implementation of the encodings shipped with Pire.
3  *
4  * Copyright (c) 2007-2010, Dmitry Prokoptsev <dprokoptsev@gmail.com>,
5  *                          Alexander Gololobov <agololobov@gmail.com>
6  *
7  * This file is part of Pire, the Perl Incompatible
8  * Regular Expressions library.
9  *
10  * Pire is free software: you can redistribute it and/or modify
11  * it under the terms of the GNU Lesser Public License as published by
12  * the Free Software Foundation, either version 3 of the License, or
13  * (at your option) any later version.
14  *
15  * Pire is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser Public License for more details.
19  * You should have received a copy of the GNU Lesser Public License
20  * along with Pire.  If not, see <http://www.gnu.org/licenses>.
21  */
22 
23 
24 #include <stdexcept>
25 #include <utility>
26 #include "stub/defaults.h"
27 #include "stub/utf8.h"
28 #include "stub/singleton.h"
29 #include "encoding.h"
30 #include "fsm.h"
31 
32 
33 namespace Pire {
34 
35 namespace {
36 
37 	class Latin1: public Encoding {
38 	public:
Latin1()39 		Latin1() : Encoding() {}
40 
FromLocal(const char * & begin,const char * end) const41 		wchar32 FromLocal(const char*& begin, const char* end) const
42 		{
43 			if (begin == end)
44 				throw Error("EOF reached in Pire::Latin1::fromLocal()");
45 			else if (static_cast<unsigned char>(*begin) >= 0x80)
46 				throw Error("Pire::Latin1::fromLocal(): wrong character encountered (>=0x80)");
47 			else
48 				return (wchar32) *begin++;
49 		}
50 
ToLocal(wchar32 ch) const51 		ystring ToLocal(wchar32 ch) const
52 		{
53 			if (ch < 0x80)
54 				return ystring(1, (char) ch);
55 			else
56 				return ystring();
57 		}
58 
AppendDot(Fsm & fsm) const59 		void AppendDot(Fsm& fsm) const { fsm.AppendDot(); }
60 	};
61 
62 	namespace UtfRanges {
63 
64 		static const size_t MaxLen = 4;
65 		static const size_t First[MaxLen][2] = {
66 			{0x00, 0x80},
67 			{0xC0, 0xE0},
68 			{0xE0, 0xF0},
69 			{0xF0, 0xF8}
70 		};
71 		static const size_t Next[2] = {0x80, 0xC0};
72 	}
73 
74 
75 	class Utf8: public Encoding {
76 	public:
Utf8()77 		Utf8() : Encoding() {}
78 
FromLocal(const char * & begin,const char * end) const79 		wchar32 FromLocal(const char*& begin, const char* end) const
80 		{
81 			wchar32 rune;
82 			size_t len;
83 			if (utf8_read_rune(rune, len, reinterpret_cast<const unsigned char*>(begin), reinterpret_cast<const unsigned char*>(end)) != RECODE_OK)
84 				throw Error("Error reading UTF8 sequence");
85 			begin += len;
86 			return rune;
87 		}
88 
ToLocal(wchar32 c) const89 		ystring ToLocal(wchar32 c) const
90 		{
91 			ystring ret(utf8_rune_len_by_ucs(c), ' ');
92 			size_t len;
93 			unsigned char* p = (unsigned char*) &*ret.begin();
94 			if (utf8_put_rune(c, len, p, p + ret.size()) != RECODE_OK)
95 				Y_ASSERT(!"Pire::UTF8::toLocal(): Internal error");
96 			return ret;
97 		}
98 
AppendDot(Fsm & fsm) const99 		void AppendDot(Fsm& fsm) const
100 		{
101 			size_t last = fsm.Resize(fsm.Size() + UtfRanges::MaxLen);
102 			for (size_t i = 0; i < UtfRanges::MaxLen; ++i)
103 				for (size_t letter = UtfRanges::First[i][0]; letter < UtfRanges::First[i][1]; ++letter)
104 					fsm.ConnectFinal(fsm.Size() - i - 1, letter);
105 			for (size_t i = 0; i < UtfRanges::MaxLen - 1; ++i)
106 				for (size_t letter = UtfRanges::Next[0]; letter < UtfRanges::Next[1]; ++letter)
107 					fsm.Connect(last + i, last + i + 1, letter);
108 			fsm.ClearFinal();
109 			fsm.SetFinal(fsm.Size() - 1, true);
110 			fsm.SetIsDetermined(false);
111 		}
112 	};
113 }
114 
115 namespace Encodings {
116 
Utf8()117 	const Encoding& Utf8()
118 	{
119 		static const Pire::Utf8 utf8;
120 		return utf8;
121 	}
122 
Latin1()123 	const Encoding& Latin1()
124 	{
125 		static const Pire::Latin1 latin1;
126 		return latin1;
127 	}
128 
129 }
130 
131 }
132