1 //
2 // VMime library (http://www.vmime.org)
3 // Copyright (C) 2002-2013 Vincent Richard <vincent@vmime.org>
4 //
5 // This program is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU General Public License as
7 // published by the Free Software Foundation; either version 3 of
8 // the License, or (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 //
19 // Linking this library statically or dynamically with other modules is making
20 // a combined work based on this library. Thus, the terms and conditions of
21 // the GNU General Public License cover the whole combination.
22 //
23
24 #include "vmime/wordEncoder.hpp"
25
26 #include "vmime/exception.hpp"
27 #include "vmime/charsetConverter.hpp"
28
29 #include "vmime/encoding.hpp"
30
31 #include "vmime/utility/encoder/b64Encoder.hpp"
32 #include "vmime/utility/encoder/qpEncoder.hpp"
33
34 #include "vmime/utility/stringUtils.hpp"
35
36 #include "vmime/utility/outputStreamStringAdapter.hpp"
37 #include "vmime/utility/inputStreamStringAdapter.hpp"
38
39
40 namespace vmime
41 {
42
43
wordEncoder(const string & buffer,const charset & charset,const Encoding encoding)44 wordEncoder::wordEncoder(const string& buffer, const charset& charset, const Encoding encoding)
45 : m_buffer(buffer), m_pos(0), m_length(buffer.length()), m_charset(charset), m_encoding(encoding)
46 {
47 try
48 {
49 string utf8Buffer;
50
51 vmime::charset::convert
52 (buffer, utf8Buffer, charset, vmime::charset(charsets::UTF_8));
53
54 m_buffer = utf8Buffer;
55 m_length = utf8Buffer.length();
56
57 m_simple = false;
58 }
59 catch (exceptions::charset_conv_error&)
60 {
61 // Ignore exception.
62 // We will fall back on simple encoding.
63 m_simple = true;
64 }
65
66 if (m_encoding == ENCODING_AUTO)
67 m_encoding = guessBestEncoding(buffer, charset);
68
69 if (m_encoding == ENCODING_B64)
70 {
71 m_encoder = make_shared <utility::encoder::b64Encoder>();
72 }
73 else // ENCODING_QP
74 {
75 m_encoder = make_shared <utility::encoder::qpEncoder>();
76 m_encoder->getProperties()["rfc2047"] = true;
77 }
78 }
79
80
// Returns the length, in bytes, of the UTF-8 sequence whose leading byte
// is located at 'pos' in 'buffer' ('length' is the number of usable bytes
// in 'buffer'). When the sequence announced by the leading byte does not
// fit in the bytes available from 'pos', 1 is returned instead.
static size_t getUTF8CharLength
	(const string& buffer, const size_t pos, const size_t length)
{
	const unsigned char lead = buffer[pos];

	// Number of extra bytes announced by the leading byte
	size_t extraBytes;

	if (lead < 0xC0)
		extraBytes = 0;  // 0x00-0xBF: single byte (or continuation byte)
	else if (lead < 0xE0)
		extraBytes = 1;  // 0xC0-0xDF
	else if (lead < 0xF0)
		extraBytes = 2;  // 0xE0-0xEF
	else if (lead < 0xF8)
		extraBytes = 3;  // 0xF0-0xF7
	else if (lead < 0xFC)
		extraBytes = 4;  // 0xF8-0xFB
	else
		extraBytes = 5;  // 0xFC-0xFF

	const size_t bytesLeft = length - pos;

	return (extraBytes < bytesLeft) ? (extraBytes + 1) : 1;
}
113
114
getNextChunk(const size_t maxLength)115 const string wordEncoder::getNextChunk(const size_t maxLength)
116 {
117 const size_t remaining = m_length - m_pos;
118
119 if (remaining == 0)
120 return string();
121
122 vmime::string chunk;
123 vmime::utility::outputStreamStringAdapter chunkStream(chunk);
124
125 // Simple encoding
126 if (m_simple)
127 {
128 // WARNING! Simple encoding can encode a non-integral number of
129 // characters and then may generate incorrectly-formed words!
130
131 if (m_encoding == ENCODING_B64)
132 {
133 // Here, we have a formula to compute the maximum number of source
134 // bytes to encode knowing the maximum number of encoded chars. In
135 // Base64 encoding, 3 bytes of input provide 4 bytes of output.
136 const size_t inputCount =
137 std::min(remaining, (maxLength > 1) ? ((maxLength - 1) * 3) / 4 : 1);
138
139 // Encode chunk
140 utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);
141
142 m_encoder->encode(in, chunkStream);
143 m_pos += inputCount;
144 }
145 else // ENCODING_QP
146 {
147 // Compute exactly how much input bytes are needed to have an output
148 // string length of less than 'maxLength' bytes. In Quoted-Printable
149 // encoding, encoded bytes take 3 bytes.
150 size_t inputCount = 0;
151 size_t outputCount = 0;
152
153 while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
154 {
155 const unsigned char c = m_buffer[m_pos + inputCount];
156
157 inputCount++;
158 outputCount += utility::encoder::qpEncoder::RFC2047_getEncodedLength(c);
159 }
160
161 // Encode chunk
162 utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);
163
164 m_encoder->encode(in, chunkStream);
165 m_pos += inputCount;
166 }
167 }
168 // Fully RFC-compliant encoding
169 else
170 {
171 shared_ptr <charsetConverter> conv = charsetConverter::create(charsets::UTF_8, m_charset);
172
173 size_t inputCount = 0;
174 size_t outputCount = 0;
175 string encodeBuffer;
176
177 while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
178 {
179 // Get the next UTF8 character
180 const size_t inputCharLength =
181 getUTF8CharLength(m_buffer, m_pos + inputCount, m_length);
182
183 const string inputChar(m_buffer.begin() + m_pos + inputCount,
184 m_buffer.begin() + m_pos + inputCount + inputCharLength);
185
186 // Convert back to original encoding
187 string encodeBytes;
188 conv->convert(inputChar, encodeBytes);
189
190 encodeBuffer += encodeBytes;
191
192 // Compute number of output bytes
193 if (m_encoding == ENCODING_B64)
194 {
195 outputCount = std::max(static_cast <size_t>(4),
196 (encodeBuffer.length() * 4) / 3);
197 }
198 else // ENCODING_QP
199 {
200 for (size_t i = 0, n = encodeBytes.length() ; i < n ; ++i)
201 {
202 const unsigned char c = encodeBytes[i];
203 outputCount += utility::encoder::qpEncoder::RFC2047_getEncodedLength(c);
204 }
205 }
206
207 inputCount += inputCharLength;
208 }
209
210 // Encode chunk
211 utility::inputStreamStringAdapter in(encodeBuffer);
212
213 m_encoder->encode(in, chunkStream);
214 m_pos += inputCount;
215 }
216
217 return chunk;
218 }
219
220
getEncoding() const221 wordEncoder::Encoding wordEncoder::getEncoding() const
222 {
223 return m_encoding;
224 }
225
226
227 // static
isEncodingNeeded(const generationContext & ctx,const string & buffer,const charset & charset,const string & lang)228 bool wordEncoder::isEncodingNeeded
229 (const generationContext& ctx, const string& buffer,
230 const charset& charset, const string& lang)
231 {
232 if (!ctx.getInternationalizedEmailSupport())
233 {
234 // Charset-specific encoding
235 encoding recEncoding;
236
237 if (charset.getRecommendedEncoding(recEncoding))
238 return true;
239
240 // No encoding is needed if the buffer only contains ASCII chars
241 if (utility::stringUtils::findFirstNonASCIIchar(buffer.begin(), buffer.end()) != string::npos)
242 return true;
243 }
244
245 // Force encoding when there are only ASCII chars, but there is
246 // also at least one of '\n' or '\r' (header fields)
247 if (buffer.find_first_of("\n\r") != string::npos)
248 return true;
249
250 // If any RFC-2047 sequence is found in the buffer, encode it
251 if (buffer.find("=?") != string::npos || buffer.find("?=") != string::npos)
252 return true;
253
254 // If a language is specified, force encoding
255 if (!lang.empty())
256 return true;
257
258 return false;
259 }
260
261
262 // static
guessBestEncoding(const string & buffer,const charset & charset)263 wordEncoder::Encoding wordEncoder::guessBestEncoding
264 (const string& buffer, const charset& charset)
265 {
266 // Charset-specific encoding
267 encoding recEncoding;
268
269 if (charset.getRecommendedEncoding(recEncoding))
270 {
271 if (recEncoding == encoding(encodingTypes::QUOTED_PRINTABLE))
272 return ENCODING_QP;
273 else
274 return ENCODING_B64;
275 }
276
277 // Use Base64 if more than 40% non-ASCII, or Quoted-Printable else (default)
278 const size_t asciiCount =
279 utility::stringUtils::countASCIIchars(buffer.begin(), buffer.end());
280
281 const size_t asciiPercent =
282 (buffer.length() == 0 ? 100 : (100 * asciiCount) / buffer.length());
283
284 if (asciiPercent < 60)
285 return ENCODING_B64;
286 else
287 return ENCODING_QP;
288 }
289
290
291 } // vmime
292
293