1 /*
2  * Transcoder.cpp -
3  *
4  *   Copyright (c) 2008  Kokosabu(MIURA Yasuyuki)  <kokosabu@gmail.com>
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
23  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  *  $Id$
30  */
31 
32 #include <stdio.h>
33 #include "scheme.h"
34 #include "Object.h"
35 #include "Object-inl.h"
36 #include "SString.h"
37 #include "Symbol.h"
38 #include "BinaryOutputPort.h"
39 #include "Transcoder.h"
40 #include "UTF8Codec.h"
41 
42 using namespace scheme;
43 
Transcoder(Codec * codec)44 Transcoder::Transcoder(Codec* codec) :
45     beginningOfInput_(true),
46     codec_(codec),
47     eolStyle_(EolStyle(LF)), // LF means no convert.
48     errorHandlingMode_(ErrorHandlingMode(REPLACE_ERROR)),
49     lineNo_(1)
50 {
51 }
52 
Transcoder(Codec * codec,EolStyle eolStyle)53 Transcoder::Transcoder(Codec* codec, EolStyle eolStyle) :
54     beginningOfInput_(true),
55     codec_(codec),
56     eolStyle_(eolStyle),
57     errorHandlingMode_(ErrorHandlingMode(REPLACE_ERROR)),
58     lineNo_(1)
59 {
60 }
61 
Transcoder(Codec * codec,EolStyle eolStyle,enum ErrorHandlingMode errorHandlingMode)62 Transcoder::Transcoder(Codec* codec, EolStyle eolStyle, enum ErrorHandlingMode errorHandlingMode) :
63     beginningOfInput_(true),
64     codec_(codec),
65     eolStyle_(eolStyle),
66     errorHandlingMode_(errorHandlingMode),
67     lineNo_(1)
68 {
69 }
70 
getLineNo() const71 int Transcoder::getLineNo() const
72 {
73     return lineNo_;
74 }
75 
eolStyle()76 enum EolStyle Transcoder::eolStyle()
77 {
78     return eolStyle_;
79 }
80 
errorHandlingMode()81 enum ErrorHandlingMode Transcoder::errorHandlingMode()
82 {
83     return errorHandlingMode_;
84 }
85 
eolStyleSymbol()86 Object Transcoder::eolStyleSymbol()
87 {
88     return eolStyleToSymbol(eolStyle_);
89 }
90 
errorHandlingModeSymbol()91 Object Transcoder::errorHandlingModeSymbol()
92 {
93     return errorHandlingModeToSymbol(errorHandlingMode_);
94 }
95 
nativeEolStyle()96 enum EolStyle Transcoder::nativeEolStyle()
97 {
98 #if LINE_FEED_CODE_LF
99     return EolStyle(LF);
100 #elif LINE_FEED_CODE_CRLF
101     return EolStyle(CRLF);
102 #elif LINE_FEED_CODE_CR
103     return EolStyle::CR;
104 #else
105     MOSH_FATAL("not found platform native eol style\n");
106 #endif
107 }
108 
eolStyleToSymbol(const enum EolStyle eolstyle)109 Object Transcoder::eolStyleToSymbol(const enum EolStyle eolstyle)
110 {
111     switch (eolstyle) {
112     case EolStyle(LF):
113         return Symbol::LF;
114     case EolStyle(CR):
115         return Symbol::CR;
116     case EolStyle(CRLF):
117         return Symbol::CRLF;
118     case EolStyle(NEL):
119         return Symbol::NEL;
120     case EolStyle(CRNEL):
121         return Symbol::CRNEL;
122     case EolStyle(LS):
123         return Symbol::LS;
124     default:
125         return Symbol::NONE;
126     }
127 }
128 
errorHandlingModeToSymbol(const enum ErrorHandlingMode errorHandlingMode)129 Object Transcoder::errorHandlingModeToSymbol(const enum ErrorHandlingMode errorHandlingMode)
130 {
131     switch (errorHandlingMode) {
132     case ErrorHandlingMode(IGNORE_ERROR):
133         return Symbol::IGNORE_ERROR;
134     case ErrorHandlingMode(RAISE_ERROR):
135         return Symbol::RAISE_ERROR;
136     case ErrorHandlingMode(REPLACE_ERROR):
137         return Symbol::REPLACE_ERROR;
138     default:
139         MOSH_FATAL("not found errorHandlingMode\n");
140     }
141     return Object::Undef;
142 }
143 
putString(BinaryOutputPort * port,const ucs4string & s)144 void Transcoder::putString(BinaryOutputPort* port, const ucs4string& s)
145 {
146     for (ucs4string::const_iterator it = s.begin(); it != s.end(); ++it) {
147         putChar(port, *it);
148     }
149 }
150 
putChar(BinaryOutputPort * port,ucs4char c)151 void Transcoder::putChar(BinaryOutputPort* port, ucs4char c)
152 {
153     if (!buffer_.empty()) {
154         // remove 1 character
155         buffer_.erase(0, 1);
156     }
157     if (eolStyle_ == EolStyle(E_NONE)) {
158         codec_->putChar(port, c, errorHandlingMode_);
159         return;
160     } else if (c == EolStyle(LF)) {
161         switch (eolStyle_) {
162         case EolStyle(LF):
163         case EolStyle(CR):
164         case EolStyle(NEL):
165         case EolStyle(LS):
166         {
167             codec_->putChar(port, eolStyle_, errorHandlingMode_);
168             break;
169         }
170         case EolStyle(E_NONE):
171         {
172             codec_->putChar(port, c, errorHandlingMode_);
173             break;
174         }
175         case EolStyle(CRLF):
176         {
177             codec_->putChar(port, EolStyle(CR), errorHandlingMode_);
178             codec_->putChar(port, EolStyle(LF), errorHandlingMode_);
179             break;
180         }
181         case EolStyle(CRNEL):
182         {
183             codec_->putChar(port, EolStyle(CR), errorHandlingMode_);
184             codec_->putChar(port, EolStyle(NEL), errorHandlingMode_);
185             break;
186         }
187         }
188     } else {
189         codec_->putChar(port, c, errorHandlingMode_);
190     }
191 }
192 
193 // int Transcoder::putChar(uint8_t* buf, ucs4char c)
194 // {
195 //     return codec_->out(buf, c, errorHandlingMode_);
196 // }
197 
unGetChar(ucs4char c)198 void Transcoder::unGetChar(ucs4char c)
199 {
200     if (EOF == c) {
201         return;
202     }
203     buffer_ += c;
204     if (c == EolStyle(LF)) {
205         lineNo_--;
206     }
207 
208 }
209 
getCharInternal(BinaryInputPort * port)210 ucs4char Transcoder::getCharInternal(BinaryInputPort* port)
211 {
212     // In the beginning of input, we have to check the BOM.
213     if (beginningOfInput_) {
214         beginningOfInput_ = false;
215         const bool checkBOM = true;
216         return codec_->getChar(port, errorHandlingMode_, checkBOM);
217     }
218     ucs4char c;
219     if (buffer_.empty()) {
220         c= codec_->getChar(port, errorHandlingMode_);
221     } else {
222         c = buffer_[buffer_.size() - 1];
223         buffer_.erase(buffer_.size() - 1, 1);
224     }
225     return c;
226 }
227 
getChar(BinaryInputPort * port)228 ucs4char Transcoder::getChar(BinaryInputPort* port)
229 {
230     const ucs4char c = getCharInternal(port);
231     if (eolStyle_ == EolStyle(E_NONE)) {
232         if (c == EolStyle(LF)) {
233             lineNo_++;
234         }
235         return c;
236     }
237     switch(c) {
238     case EolStyle(LF):
239     case EolStyle(NEL):
240     case EolStyle(LS):
241     {
242         lineNo_++;
243         return EolStyle(LF);
244     }
245     case EolStyle(CR):
246     {
247         const ucs4char c2 = getCharInternal(port);
248         lineNo_++;
249         switch(c2) {
250         case EolStyle(LF):
251         case EolStyle(NEL):
252             return EolStyle(LF);
253         default:
254             unGetChar(c2);
255             return EolStyle(LF);
256         }
257     }
258     default:
259         return c;
260     }
261 }
262 
getString(BinaryInputPort * port)263 ucs4string Transcoder::getString(BinaryInputPort* port)
264 {
265     ucs4string ret;
266     for (ucs4char c = getChar(port); c != EOF; c = getChar(port)) {
267         ret += c;
268     }
269     return ret;
270 }
271 
validateEolStyle(Object eolStyle,EolStyle & result)272 bool Transcoder::validateEolStyle(Object eolStyle, EolStyle& result)
273 {
274     MOSH_ASSERT(eolStyle.isSymbol());
275     if (eolStyle == Symbol::LF) {
276         result = EolStyle(LF);
277     } else if (eolStyle == Symbol::CR) {
278         result = EolStyle(CR);
279     } else if (eolStyle == Symbol::CRLF) {
280         result = EolStyle(CRLF);
281     } else if (eolStyle == Symbol::NEL) {
282         result = EolStyle(NEL);
283     } else if (eolStyle == Symbol::CRNEL) {
284         result = EolStyle(CRNEL);
285     } else if (eolStyle == Symbol::LS) {
286         result = EolStyle(LS);
287     } else if (eolStyle == Symbol::NONE) {
288         result = EolStyle(E_NONE);
289     } else {
290         return false;
291     }
292     return true;
293 }
294 
validateErrorHandlingMode(Object symbol,enum ErrorHandlingMode & result)295 bool Transcoder::validateErrorHandlingMode(Object symbol, enum ErrorHandlingMode& result)
296 {
297     MOSH_ASSERT(symbol.isSymbol());
298     if (symbol == Symbol::IGNORE_ERROR) {
299         result = ErrorHandlingMode(IGNORE_ERROR);
300     } else if (symbol == Symbol::RAISE_ERROR) {
301         result = ErrorHandlingMode(RAISE_ERROR);
302     } else if (symbol == Symbol::REPLACE_ERROR) {
303         result = ErrorHandlingMode(REPLACE_ERROR);
304     } else {
305         return false;
306     }
307     return true;
308 }
309 
310