1 /*-
2  * Copyright (c) 2011 Michihiro NAKAJIMA
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 #include "test.h"
26 __FBSDID("$FreeBSD$");
27 
28 #include <locale.h>
29 
30 DEFINE_TEST(test_zip_filename_encoding_UTF8)
31 {
32   	struct archive *a;
33   	struct archive_entry *entry;
34 	char buff[4096];
35 	size_t used;
36 
37 	if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) {
38 		skipping("en_US.UTF-8 locale not available on this system.");
39 		return;
40 	}
41 
42 	/*
43 	 * Verify that UTF-8 filenames are correctly stored with
44 	 * hdrcharset=UTF-8 option.
45 	 */
46 	a = archive_write_new();
47 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
48 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
49 		skipping("This system cannot convert character-set"
50 		    " for UTF-8.");
51 		archive_write_free(a);
52 		return;
53 	}
54 	assertEqualInt(ARCHIVE_OK,
55 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
56 
57 	entry = archive_entry_new2(a);
58 	/* Set a UTF-8 filename. */
59 	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
60 	archive_entry_set_filetype(entry, AE_IFREG);
61 	archive_entry_set_size(entry, 0);
62 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
63 	archive_entry_free(entry);
64 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
65 
66 	/* A bit 11 of general purpose flag should be 0x08,
67 	 * which indicates the filename charset is UTF-8. */
68 	assertEqualInt(0x08, buff[7]);
69 	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
70 
71 	/*
72 	 * Verify that UTF-8 filenames are correctly stored without
73 	 * hdrcharset=UTF-8 option.
74 	 */
75 	a = archive_write_new();
76 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
77 	assertEqualInt(ARCHIVE_OK,
78 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
79 
80 	entry = archive_entry_new2(a);
81 	/* Set a UTF-8 filename. */
82 	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
83 	archive_entry_set_filetype(entry, AE_IFREG);
84 	archive_entry_set_size(entry, 0);
85 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
86 	archive_entry_free(entry);
87 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
88 
89 	/* A bit 11 of general purpose flag should be 0x08,
90 	 * which indicates the filename charset is UTF-8. */
91 	assertEqualInt(0x08, buff[7]);
92 	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
93 
94 	/*
95 	 * Verify that A bit 11 of general purpose flag is not set
96 	 * when ASCII filenames are stored.
97 	 */
98 	a = archive_write_new();
99 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
100 	assertEqualInt(ARCHIVE_OK,
101 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
102 
103 	entry = archive_entry_new2(a);
104 	/* Set an ASCII filename. */
105 	archive_entry_set_pathname(entry, "abcABC");
106 	archive_entry_set_filetype(entry, AE_IFREG);
107 	archive_entry_set_size(entry, 0);
108 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
109 	archive_entry_free(entry);
110 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
111 
112 	/* A bit 11 of general purpose flag should be 0,
113 	 * which indicates the filename charset is unknown. */
114 	assertEqualInt(0, buff[7]);
115 	assertEqualMem(buff + 30, "abcABC", 6);
116 }
117 
118 DEFINE_TEST(test_zip_filename_encoding_KOI8R)
119 {
120   	struct archive *a;
121   	struct archive_entry *entry;
122 	char buff[4096];
123 	size_t used;
124 
125 	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
126 		skipping("KOI8-R locale not available on this system.");
127 		return;
128 	}
129 
130 	/*
131 	 * Verify that KOI8-R filenames are correctly translated to UTF-8.
132 	 */
133 	a = archive_write_new();
134 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
135 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
136 		skipping("This system cannot convert character-set"
137 		    " from KOI8-R to UTF-8.");
138 		archive_write_free(a);
139 		return;
140 	}
141 	assertEqualInt(ARCHIVE_OK,
142 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
143 
144 	entry = archive_entry_new2(a);
145 	/* Set a KOI8-R filename. */
146 	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
147 	archive_entry_set_filetype(entry, AE_IFREG);
148 	archive_entry_set_size(entry, 0);
149 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
150 	archive_entry_free(entry);
151 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
152 
153 	/* A bit 11 of general purpose flag should be 0x08,
154 	 * which indicates the filename charset is UTF-8. */
155 	assertEqualInt(0x08, buff[7]);
156 	/* Above three characters in KOI8-R should translate to the following
157 	 * three characters (two bytes each) in UTF-8. */
158 	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
159 
160 	/*
161 	 * Verify that KOI8-R filenames are not translated to UTF-8.
162 	 */
163 	a = archive_write_new();
164 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
165 	assertEqualInt(ARCHIVE_OK,
166 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
167 
168 	entry = archive_entry_new2(a);
169 	/* Set a KOI8-R filename. */
170 	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
171 	archive_entry_set_filetype(entry, AE_IFREG);
172 	archive_entry_set_size(entry, 0);
173 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
174 	archive_entry_free(entry);
175 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
176 
177 	/* A bit 11 of general purpose flag should be 0,
178 	 * which indicates the filename charset is unknown. */
179 	assertEqualInt(0, buff[7]);
180 	/* Above three characters in KOI8-R should not translate to
181 	 * any character-set. */
182 	assertEqualMem(buff + 30, "\xD0\xD2\xC9", 3);
183 
184 	/*
185 	 * Verify that A bit 11 of general purpose flag is not set
186 	 * when ASCII filenames are stored even if hdrcharset=UTF-8
187 	 * is specified.
188 	 */
189 	a = archive_write_new();
190 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
191 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
192 		skipping("This system cannot convert character-set"
193 		    " from KOI8-R to UTF-8.");
194 		archive_write_free(a);
195 		return;
196 	}
197 	assertEqualInt(ARCHIVE_OK,
198 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
199 
200 	entry = archive_entry_new2(a);
201 	/* Set an ASCII filename. */
202 	archive_entry_set_pathname(entry, "abcABC");
203 	archive_entry_set_filetype(entry, AE_IFREG);
204 	archive_entry_set_size(entry, 0);
205 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
206 	archive_entry_free(entry);
207 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
208 
209 	/* A bit 11 of general purpose flag should be 0,
210 	 * which indicates the filename charset is unknown. */
211 	assertEqualInt(0, buff[7]);
212 	assertEqualMem(buff + 30, "abcABC", 6);
213 }
214 
215 /*
216  * Do not translate CP1251 into CP866 if non Windows platform.
217  */
218 DEFINE_TEST(test_zip_filename_encoding_ru_RU_CP1251)
219 {
220   	struct archive *a;
221   	struct archive_entry *entry;
222 	char buff[4096];
223 	size_t used;
224 
225 	if (NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
226 		skipping("Russian_Russia locale not available on this system.");
227 		return;
228 	}
229 
230 	/*
231 	 * Verify that CP1251 filenames are not translated into any
232 	 * other character-set, in particular, CP866.
233 	 */
234 	a = archive_write_new();
235 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
236 	assertEqualInt(ARCHIVE_OK,
237 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
238 
239 	entry = archive_entry_new2(a);
240 	/* Set a CP1251 filename. */
241 	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
242 	archive_entry_set_filetype(entry, AE_IFREG);
243 	archive_entry_set_size(entry, 0);
244 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
245 	archive_entry_free(entry);
246 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
247 
248 	/* A bit 11 of general purpose flag should be 0,
249 	 * which indicates the filename charset is unknown. */
250 	assertEqualInt(0, buff[7]);
251 	/* Above three characters in CP1251 should not translate into
252 	 * any other character-set. */
253 	assertEqualMem(buff + 30, "\xEF\xF0\xE8", 3);
254 }
255 
256 /*
257  * Other archiver applications on Windows translate CP1251 filenames
258  * into CP866 filenames and store it in the zip file.
259  * Test above behavior works well.
260  */
261 DEFINE_TEST(test_zip_filename_encoding_Russian_Russia)
262 {
263   	struct archive *a;
264   	struct archive_entry *entry;
265 	char buff[4096];
266 	size_t used;
267 
268 	if (NULL == setlocale(LC_ALL, "Russian_Russia")) {
269 		skipping("Russian_Russia locale not available on this system.");
270 		return;
271 	}
272 
273 	/*
274 	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
275 	 * to UTF-8.
276 	 */
277 	a = archive_write_new();
278 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
279 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
280 		skipping("This system cannot convert character-set"
281 		    " from Russian_Russia.CP1251 to UTF-8.");
282 		archive_write_free(a);
283 		return;
284 	}
285 	assertEqualInt(ARCHIVE_OK,
286 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
287 
288 	entry = archive_entry_new2(a);
289 	/* Set a CP1251 filename. */
290 	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
291 	archive_entry_set_filetype(entry, AE_IFREG);
292 	archive_entry_set_size(entry, 0);
293 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
294 	archive_entry_free(entry);
295 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
296 
297 	/* A bit 11 of general purpose flag should be 0x08,
298 	 * which indicates the filename charset is UTF-8. */
299 	assertEqualInt(0x08, buff[7]);
300 	/* Above three characters in CP1251 should translate to the following
301 	 * three characters (two bytes each) in UTF-8. */
302 	assertEqualMem(buff + 30, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
303 
304 	/*
305 	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
306 	 * to CP866.
307 	 */
308 	a = archive_write_new();
309 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
310 	assertEqualInt(ARCHIVE_OK,
311 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
312 
313 	entry = archive_entry_new2(a);
314 	/* Set a CP1251 filename. */
315 	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
316 	archive_entry_set_filetype(entry, AE_IFREG);
317 	archive_entry_set_size(entry, 0);
318 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
319 	archive_entry_free(entry);
320 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
321 
322 	/* A bit 11 of general purpose flag should be 0,
323 	 * which indicates the filename charset is unknown. */
324 	assertEqualInt(0, buff[7]);
325 	/* Above three characters in CP1251 should translate to the following
326 	 * three characters in CP866. */
327 	assertEqualMem(buff + 30, "\xAF\xE0\xA8", 3);
328 }
329 
330 DEFINE_TEST(test_zip_filename_encoding_EUCJP)
331 {
332   	struct archive *a;
333   	struct archive_entry *entry;
334 	char buff[4096];
335 	size_t used;
336 
337 	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
338 		skipping("eucJP locale not available on this system.");
339 		return;
340 	}
341 
342 	/*
343 	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
344 	 */
345 	a = archive_write_new();
346 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
347 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
348 		skipping("This system cannot convert character-set"
349 		    " from eucJP to UTF-8.");
350 		archive_write_free(a);
351 		return;
352 	}
353 	assertEqualInt(ARCHIVE_OK,
354 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
355 
356 	entry = archive_entry_new2(a);
357 	/* Set an EUC-JP filename. */
358 	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
359 	/* Check the Unicode version. */
360 	archive_entry_set_filetype(entry, AE_IFREG);
361 	archive_entry_set_size(entry, 0);
362 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
363 	archive_entry_free(entry);
364 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
365 
366 	/* A bit 11 of general purpose flag should be 0x08,
367 	 * which indicates the filename charset is UTF-8. */
368 	assertEqualInt(0x08, buff[7]);
369 	/* Check UTF-8 version. */
370 	assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
371 
372 	/*
373 	 * Verify that EUC-JP filenames are not translated to UTF-8.
374 	 */
375 	a = archive_write_new();
376 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
377 	assertEqualInt(ARCHIVE_OK,
378 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
379 
380 	entry = archive_entry_new2(a);
381 	/* Set an EUC-JP filename. */
382 	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
383 	/* Check the Unicode version. */
384 	archive_entry_set_filetype(entry, AE_IFREG);
385 	archive_entry_set_size(entry, 0);
386 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
387 	archive_entry_free(entry);
388 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
389 
390 	/* A bit 11 of general purpose flag should be 0,
391 	 * which indicates the filename charset is unknown. */
392 	assertEqualInt(0, buff[7]);
393 	/* Above three characters in EUC-JP should not translate to
394 	 * any character-set. */
395 	assertEqualMem(buff + 30, "\xC9\xBD.txt", 6);
396 
397 	/*
398 	 * Verify that A bit 11 of general purpose flag is not set
399 	 * when ASCII filenames are stored even if hdrcharset=UTF-8
400 	 * is specified.
401 	 */
402 	a = archive_write_new();
403 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
404 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
405 		skipping("This system cannot convert character-set"
406 		    " from eucJP to UTF-8.");
407 		archive_write_free(a);
408 		return;
409 	}
410 	assertEqualInt(ARCHIVE_OK,
411 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
412 
413 	entry = archive_entry_new2(a);
414 	/* Set an ASCII filename. */
415 	archive_entry_set_pathname(entry, "abcABC");
416 	/* Check the Unicode version. */
417 	archive_entry_set_filetype(entry, AE_IFREG);
418 	archive_entry_set_size(entry, 0);
419 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
420 	archive_entry_free(entry);
421 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
422 
423 	/* A bit 11 of general purpose flag should be 0,
424 	 * which indicates the filename charset is unknown. */
425 	assertEqualInt(0, buff[7]);
426 	assertEqualMem(buff + 30, "abcABC", 6);
427 }
428 
429 DEFINE_TEST(test_zip_filename_encoding_CP932)
430 {
431   	struct archive *a;
432   	struct archive_entry *entry;
433 	char buff[4096];
434 	size_t used;
435 
436 	if (NULL == setlocale(LC_ALL, "Japanese_Japan") &&
437 	    NULL == setlocale(LC_ALL, "ja_JP.SJIS")) {
438 		skipping("CP932/SJIS locale not available on this system.");
439 		return;
440 	}
441 
442 	/*
443 	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
444 	 */
445 	a = archive_write_new();
446 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
447 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
448 		skipping("This system cannot convert character-set"
449 		    " from CP932/SJIS to UTF-8.");
450 		archive_write_free(a);
451 		return;
452 	}
453 	assertEqualInt(ARCHIVE_OK,
454 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
455 
456 	entry = archive_entry_new2(a);
457 	/* Set a CP932/SJIS filename. */
458 	archive_entry_set_pathname(entry, "\x95\x5C.txt");
459 	/* Check the Unicode version. */
460 	archive_entry_set_filetype(entry, AE_IFREG);
461 	archive_entry_set_size(entry, 0);
462 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
463 	archive_entry_free(entry);
464 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
465 
466 	/* A bit 11 of general purpose flag should be 0x08,
467 	 * which indicates the filename charset is UTF-8. */
468 	assertEqualInt(0x08, buff[7]);
469 	/* Check UTF-8 version. */
470 	assertEqualMem(buff + 30, "\xE8\xA1\xA8.txt", 7);
471 
472 	/*
473 	 * Verify that CP932/SJIS filenames are not translated to UTF-8.
474 	 */
475 	a = archive_write_new();
476 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
477 	assertEqualInt(ARCHIVE_OK,
478 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
479 
480 	entry = archive_entry_new2(a);
481 	/* Set a CP932/SJIS filename. */
482 	archive_entry_set_pathname(entry, "\x95\x5C.txt");
483 	/* Check the Unicode version. */
484 	archive_entry_set_filetype(entry, AE_IFREG);
485 	archive_entry_set_size(entry, 0);
486 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
487 	archive_entry_free(entry);
488 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
489 
490 	/* A bit 11 of general purpose flag should be 0,
491 	 * which indicates the filename charset is unknown. */
492 	assertEqualInt(0, buff[7]);
493 	/* Above three characters in CP932/SJIS should not translate to
494 	 * any character-set. */
495 	assertEqualMem(buff + 30, "\x95\x5C.txt", 6);
496 
497 	/*
498 	 * Verify that A bit 11 of general purpose flag is not set
499 	 * when ASCII filenames are stored even if hdrcharset=UTF-8
500 	 * is specified.
501 	 */
502 	a = archive_write_new();
503 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_zip(a));
504 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
505 		skipping("This system cannot convert character-set"
506 		    " from CP932/SJIS to UTF-8.");
507 		archive_write_free(a);
508 		return;
509 	}
510 	assertEqualInt(ARCHIVE_OK,
511 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
512 
513 	entry = archive_entry_new2(a);
514 	/* Set an ASCII filename. */
515 	archive_entry_set_pathname(entry, "abcABC");
516 	/* Check the Unicode version. */
517 	archive_entry_set_filetype(entry, AE_IFREG);
518 	archive_entry_set_size(entry, 0);
519 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
520 	archive_entry_free(entry);
521 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
522 
523 	/* A bit 11 of general purpose flag should be 0,
524 	 * which indicates the filename charset is unknown. */
525 	assertEqualInt(0, buff[7]);
526 	assertEqualMem(buff + 30, "abcABC", 6);
527 }
528