1 //
2 // VMime library (http://www.vmime.org)
3 // Copyright (C) 2002-2013 Vincent Richard <vincent@vmime.org>
4 //
5 // This program is free software; you can redistribute it and/or
6 // modify it under the terms of the GNU General Public License as
7 // published by the Free Software Foundation; either version 3 of
8 // the License, or (at your option) any later version.
9 //
10 // This program is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License along
16 // with this program; if not, write to the Free Software Foundation, Inc.,
17 // 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 //
19 // Linking this library statically or dynamically with other modules is making
20 // a combined work based on this library.  Thus, the terms and conditions of
21 // the GNU General Public License cover the whole combination.
22 //
23 
24 #include "vmime/wordEncoder.hpp"
25 
26 #include "vmime/exception.hpp"
27 #include "vmime/charsetConverter.hpp"
28 
29 #include "vmime/encoding.hpp"
30 
31 #include "vmime/utility/encoder/b64Encoder.hpp"
32 #include "vmime/utility/encoder/qpEncoder.hpp"
33 
34 #include "vmime/utility/stringUtils.hpp"
35 
36 #include "vmime/utility/outputStreamStringAdapter.hpp"
37 #include "vmime/utility/inputStreamStringAdapter.hpp"
38 
39 
40 namespace vmime
41 {
42 
43 
wordEncoder(const string & buffer,const charset & charset,const Encoding encoding)44 wordEncoder::wordEncoder(const string& buffer, const charset& charset, const Encoding encoding)
45 	: m_buffer(buffer), m_pos(0), m_length(buffer.length()), m_charset(charset), m_encoding(encoding)
46 {
47 	try
48 	{
49 		string utf8Buffer;
50 
51 		vmime::charset::convert
52 			(buffer, utf8Buffer, charset, vmime::charset(charsets::UTF_8));
53 
54 		m_buffer = utf8Buffer;
55 		m_length = utf8Buffer.length();
56 
57 		m_simple = false;
58 	}
59 	catch (exceptions::charset_conv_error&)
60 	{
61 		// Ignore exception.
62 		// We will fall back on simple encoding.
63 		m_simple = true;
64 	}
65 
66 	if (m_encoding == ENCODING_AUTO)
67 		m_encoding = guessBestEncoding(buffer, charset);
68 
69 	if (m_encoding == ENCODING_B64)
70 	{
71 		m_encoder = make_shared <utility::encoder::b64Encoder>();
72 	}
73 	else // ENCODING_QP
74 	{
75 		m_encoder = make_shared <utility::encoder::qpEncoder>();
76 		m_encoder->getProperties()["rfc2047"] = true;
77 	}
78 }
79 
80 
// Returns the length, in bytes, of the UTF-8 sequence whose leading byte
// is at index 'pos' in 'buffer'. Only the leading byte is inspected.
//
// 'length' is the number of usable bytes in 'buffer'. If the sequence
// announced by the leading byte does not fit before 'length', the
// function returns 1 so that the caller advances by a single byte.
static size_t getUTF8CharLength
	(const string& buffer, const size_t pos, const size_t length)
{
	const unsigned char lead = buffer[pos];

	// Number of bytes expected to follow the leading byte
	size_t extra;

	if (lead < 0xC0)
		extra = 0;   // 0x00-0xBF: single byte
	else if (lead < 0xE0)
		extra = 1;   // 0xC0-0xDF
	else if (lead < 0xF0)
		extra = 2;   // 0xE0-0xEF
	else if (lead < 0xF8)
		extra = 3;   // 0xF0-0xF7
	else if (lead < 0xFC)
		extra = 4;   // 0xF8-0xFB
	else
		extra = 5;   // 0xFC-0xFF

	const size_t available = length - pos;

	return (extra < available) ? extra + 1 : 1;
}
113 
114 
getNextChunk(const size_t maxLength)115 const string wordEncoder::getNextChunk(const size_t maxLength)
116 {
117 	const size_t remaining = m_length - m_pos;
118 
119 	if (remaining == 0)
120 		return string();
121 
122 	vmime::string chunk;
123 	vmime::utility::outputStreamStringAdapter chunkStream(chunk);
124 
125 	// Simple encoding
126 	if (m_simple)
127 	{
128 		// WARNING! Simple encoding can encode a non-integral number of
129 		// characters and then may generate incorrectly-formed words!
130 
131 		if (m_encoding == ENCODING_B64)
132 		{
133 			// Here, we have a formula to compute the maximum number of source
134 			// bytes to encode knowing the maximum number of encoded chars. In
135 			// Base64 encoding, 3 bytes of input provide 4 bytes of output.
136 			const size_t inputCount =
137 				std::min(remaining, (maxLength > 1) ? ((maxLength - 1) * 3) / 4 : 1);
138 
139 			// Encode chunk
140 			utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);
141 
142 			m_encoder->encode(in, chunkStream);
143 			m_pos += inputCount;
144 		}
145 		else // ENCODING_QP
146 		{
147 			// Compute exactly how much input bytes are needed to have an output
148 			// string length of less than 'maxLength' bytes. In Quoted-Printable
149 			// encoding, encoded bytes take 3 bytes.
150 			size_t inputCount = 0;
151 			size_t outputCount = 0;
152 
153 			while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
154 			{
155 				const unsigned char c = m_buffer[m_pos + inputCount];
156 
157 				inputCount++;
158 				outputCount += utility::encoder::qpEncoder::RFC2047_getEncodedLength(c);
159 			}
160 
161 			// Encode chunk
162 			utility::inputStreamStringAdapter in(m_buffer, m_pos, m_pos + inputCount);
163 
164 			m_encoder->encode(in, chunkStream);
165 			m_pos += inputCount;
166 		}
167 	}
168 	// Fully RFC-compliant encoding
169 	else
170 	{
171 		shared_ptr <charsetConverter> conv = charsetConverter::create(charsets::UTF_8, m_charset);
172 
173 		size_t inputCount = 0;
174 		size_t outputCount = 0;
175 		string encodeBuffer;
176 
177 		while ((inputCount == 0 || outputCount < maxLength) && (inputCount < remaining))
178 		{
179 			// Get the next UTF8 character
180 			const size_t inputCharLength =
181 				getUTF8CharLength(m_buffer, m_pos + inputCount, m_length);
182 
183 			const string inputChar(m_buffer.begin() + m_pos + inputCount,
184 				m_buffer.begin() + m_pos + inputCount + inputCharLength);
185 
186 			// Convert back to original encoding
187 			string encodeBytes;
188 			conv->convert(inputChar, encodeBytes);
189 
190 			encodeBuffer += encodeBytes;
191 
192 			// Compute number of output bytes
193 			if (m_encoding == ENCODING_B64)
194 			{
195 				outputCount = std::max(static_cast <size_t>(4),
196 					(encodeBuffer.length() * 4) / 3);
197 			}
198 			else // ENCODING_QP
199 			{
200 				for (size_t i = 0, n = encodeBytes.length() ; i < n ; ++i)
201 				{
202 					const unsigned char c = encodeBytes[i];
203 					outputCount += utility::encoder::qpEncoder::RFC2047_getEncodedLength(c);
204 				}
205 			}
206 
207 			inputCount += inputCharLength;
208 		}
209 
210 		// Encode chunk
211 		utility::inputStreamStringAdapter in(encodeBuffer);
212 
213 		m_encoder->encode(in, chunkStream);
214 		m_pos += inputCount;
215 	}
216 
217 	return chunk;
218 }
219 
220 
getEncoding() const221 wordEncoder::Encoding wordEncoder::getEncoding() const
222 {
223 	return m_encoding;
224 }
225 
226 
227 // static
isEncodingNeeded(const generationContext & ctx,const string & buffer,const charset & charset,const string & lang)228 bool wordEncoder::isEncodingNeeded
229 	(const generationContext& ctx, const string& buffer,
230 	 const charset& charset, const string& lang)
231 {
232 	if (!ctx.getInternationalizedEmailSupport())
233 	{
234 		// Charset-specific encoding
235 		encoding recEncoding;
236 
237 		if (charset.getRecommendedEncoding(recEncoding))
238 			return true;
239 
240 		// No encoding is needed if the buffer only contains ASCII chars
241 		if (utility::stringUtils::findFirstNonASCIIchar(buffer.begin(), buffer.end()) != string::npos)
242 			return true;
243 	}
244 
245 	// Force encoding when there are only ASCII chars, but there is
246 	// also at least one of '\n' or '\r' (header fields)
247 	if (buffer.find_first_of("\n\r") != string::npos)
248 		return true;
249 
250 	// If any RFC-2047 sequence is found in the buffer, encode it
251 	if (buffer.find("=?") != string::npos || buffer.find("?=") != string::npos)
252 		return true;
253 
254 	// If a language is specified, force encoding
255 	if (!lang.empty())
256 		return true;
257 
258 	return false;
259 }
260 
261 
262 // static
guessBestEncoding(const string & buffer,const charset & charset)263 wordEncoder::Encoding wordEncoder::guessBestEncoding
264 	(const string& buffer, const charset& charset)
265 {
266 	// Charset-specific encoding
267 	encoding recEncoding;
268 
269 	if (charset.getRecommendedEncoding(recEncoding))
270 	{
271 		if (recEncoding == encoding(encodingTypes::QUOTED_PRINTABLE))
272 			return ENCODING_QP;
273 		else
274 			return ENCODING_B64;
275 	}
276 
277 	// Use Base64 if more than 40% non-ASCII, or Quoted-Printable else (default)
278 	const size_t asciiCount =
279 		utility::stringUtils::countASCIIchars(buffer.begin(), buffer.end());
280 
281 	const size_t asciiPercent =
282 		(buffer.length() == 0 ? 100 : (100 * asciiCount) / buffer.length());
283 
284 	if (asciiPercent < 60)
285 		return ENCODING_B64;
286 	else
287 		return ENCODING_QP;
288 }
289 
290 
291 } // vmime
292 
293