1 /*
2 * HWP Stuff
3 *
4 * Copyright (C) 2015-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5 *
6 * Authors: Kevin Lin
7 *
8 * This program is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU General Public License version 2 as published by the
10 * Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc., 51
19 * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 */
21
22 #if HAVE_CONFIG_H
23 #include "clamav-config.h"
24 #endif
25
26 #if HAVE_LIBXML2
27 #include <libxml/xmlreader.h>
28 #endif
29
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <fcntl.h>
33 #include <string.h>
34 #include <ctype.h>
35 #include <zlib.h>
36
37 #if HAVE_ICONV
38 #include <iconv.h>
39 #endif
40
41 #include "clamav.h"
42 #include "fmap.h"
43 #include "str.h"
44 #include "conv.h"
45 #include "others.h"
46 #include "scanners.h"
47 #include "msxml_parser.h"
48 #include "msxml.h"
49 #include "json_api.h"
50 #include "hwp.h"
51 #if HAVE_JSON
52 #include "msdoc.h"
53 #endif
54
/*
 * Compile-time tracing switches for the individual HWP parsers.
 *
 * Setting one of the *_DEBUG flags to 1 routes the matching *_debug() macro
 * to cli_dbgmsg(). When the flag is 0 the macro expands to an empty
 * do/while(0) statement: its arguments are not evaluated and, unlike a bare
 * "{};" expansion, it remains a single statement so it can be used safely in
 * an unbraced if/else.
 *
 * HWP3_VERIFY enables the special character ID block verification performed
 * by HWP3_PSPECIAL_VERIFY.
 */
#define HWP5_DEBUG 0
#define HWP3_DEBUG 0
#define HWP3_VERIFY 0
#define HWPML_DEBUG 0
#if HWP5_DEBUG
#define hwp5_debug(...) cli_dbgmsg(__VA_ARGS__)
#else
#define hwp5_debug(...) \
    do {                \
    } while (0)
#endif
#if HWP3_DEBUG
#define hwp3_debug(...) cli_dbgmsg(__VA_ARGS__)
#else
#define hwp3_debug(...) \
    do {                \
    } while (0)
#endif
#if HWPML_DEBUG
#define hwpml_debug(...) cli_dbgmsg(__VA_ARGS__)
#else
#define hwpml_debug(...) \
    do {                 \
    } while (0)
#endif
74
/*
 * Callback type run by decompress_and_callback() on successfully inflated data.
 *
 *   cbdata   - opaque pointer forwarded unchanged from the caller
 *   fd       - descriptor of the temporary file holding the inflated output
 *   filepath - path of that temporary file
 *   ctx      - scan context
 *
 * The returned cl_error_t becomes the result of decompress_and_callback()
 * unless its cleanup step fails afterwards.
 */
typedef cl_error_t (*hwp_cb)(void *cbdata, int fd, const char *filepath, cli_ctx *ctx);
76
decompress_and_callback(cli_ctx * ctx,fmap_t * input,size_t at,size_t len,const char * parent,hwp_cb cb,void * cbdata)77 static cl_error_t decompress_and_callback(cli_ctx *ctx, fmap_t *input, size_t at, size_t len, const char *parent, hwp_cb cb, void *cbdata)
78 {
79 cl_error_t ret = CL_SUCCESS;
80 int zret, ofd;
81 size_t in;
82 size_t off_in = at;
83 size_t count, remain = 1, outsize = 0;
84 z_stream zstrm;
85 char *tmpname;
86 unsigned char inbuf[FILEBUFF], outbuf[FILEBUFF];
87
88 if (!ctx || !input || !cb)
89 return CL_ENULLARG;
90
91 if (len)
92 remain = len;
93
94 /* reserve tempfile for output and callback */
95 if ((ret = cli_gentempfd(ctx->sub_tmpdir, &tmpname, &ofd)) != CL_SUCCESS) {
96 cli_errmsg("%s: Can't generate temporary file\n", parent);
97 return ret;
98 }
99
100 /* initialize zlib inflation stream */
101 memset(&zstrm, 0, sizeof(zstrm));
102 zstrm.zalloc = Z_NULL;
103 zstrm.zfree = Z_NULL;
104 zstrm.opaque = Z_NULL;
105 zstrm.next_in = inbuf;
106 zstrm.next_out = outbuf;
107 zstrm.avail_in = 0;
108 zstrm.avail_out = FILEBUFF;
109
110 zret = inflateInit2(&zstrm, -15);
111 if (zret != Z_OK) {
112 cli_errmsg("%s: Can't initialize zlib inflation stream\n", parent);
113 ret = CL_EUNPACK;
114 goto dc_end;
115 }
116
117 /* inflation loop */
118 do {
119 if (zstrm.avail_in == 0) {
120 zstrm.next_in = inbuf;
121
122 in = fmap_readn(input, inbuf, off_in, FILEBUFF);
123 if (in == (size_t)-1) {
124 cli_errmsg("%s: Error reading stream\n", parent);
125 ret = CL_EUNPACK;
126 goto dc_end;
127 }
128 if (!in)
129 break;
130
131 if (len) {
132 if (remain < in)
133 in = remain;
134 remain -= in;
135 }
136 zstrm.avail_in = in;
137 off_in += in;
138 }
139 zret = inflate(&zstrm, Z_SYNC_FLUSH);
140 count = FILEBUFF - zstrm.avail_out;
141 if (count) {
142 if ((ret = cli_checklimits("HWP", ctx, outsize + count, 0, 0)) != CL_SUCCESS)
143 break;
144
145 if (cli_writen(ofd, outbuf, count) != count) {
146 cli_errmsg("%s: Can't write to file %s\n", parent, tmpname);
147 ret = CL_EWRITE;
148 goto dc_end;
149 }
150 outsize += count;
151 }
152 zstrm.next_out = outbuf;
153 zstrm.avail_out = FILEBUFF;
154 } while (zret == Z_OK && remain);
155
156 cli_dbgmsg("%s: Decompressed %zu bytes to %s\n", parent, outsize, tmpname);
157
158 /* post inflation checks */
159 if (zret != Z_STREAM_END && zret != Z_OK) {
160 if (outsize == 0) {
161 cli_infomsg(ctx, "%s: Error decompressing stream. No data decompressed.\n", parent);
162 ret = CL_EUNPACK;
163 goto dc_end;
164 }
165
166 cli_infomsg(ctx, "%s: Error decompressing stream. Scanning what was decompressed.\n", parent);
167 }
168
169 /* check for limits exceeded or zlib failure */
170 if (ret == CL_SUCCESS && (zret == Z_STREAM_END || zret == Z_OK)) {
171 if (len && remain > 0)
172 cli_infomsg(ctx, "%s: Error decompressing stream. Not all requested input was converted\n", parent);
173
174 /* scanning inflated stream */
175 ret = cb(cbdata, ofd, tmpname, ctx);
176 } else {
177 /* default to scanning what we got */
178 ret = cli_magic_scan_desc(ofd, tmpname, ctx, NULL);
179 }
180
181 /* clean-up */
182 dc_end:
183 zret = inflateEnd(&zstrm);
184 if (zret != Z_OK) {
185 cli_errmsg("%s: Error closing zlib inflation stream\n", parent);
186 if (ret == CL_SUCCESS)
187 ret = CL_EUNPACK;
188 }
189 close(ofd);
190 if (!ctx->engine->keeptmp)
191 if (cli_unlink(tmpname))
192 ret = CL_EUNLINK;
193 free(tmpname);
194 return ret;
195 }
196
197 /* convert HANGUL_NUMERICAL to UTF-8 encoding using iconv library, converts to base64 encoding if no iconv or failure */
198 #define HANGUL_NUMERICAL 0
convert_hstr_to_utf8(const char * begin,size_t sz,const char * parent,cl_error_t * ret)199 static char *convert_hstr_to_utf8(const char *begin, size_t sz, const char *parent, cl_error_t *ret)
200 {
201 cl_error_t rc = CL_SUCCESS;
202 char *res = NULL;
203 #if HANGUL_NUMERICAL && HAVE_ICONV
204 char *p1, *p2, *inbuf = NULL, *outbuf = NULL;
205 size_t inlen, outlen;
206 iconv_t cd;
207
208 do {
209 p1 = inbuf = cli_calloc(1, sz + 1);
210 if (!inbuf) {
211 cli_errmsg("%s: Failed to allocate memory for encoding conversion buffer\n", parent);
212 rc = CL_EMEM;
213 break;
214 }
215 memcpy(inbuf, begin, sz);
216 p2 = outbuf = cli_calloc(1, sz + 1);
217 if (!outbuf) {
218 cli_errmsg("%s: Failed to allocate memory for encoding conversion buffer\n", parent);
219 rc = CL_EMEM;
220 break;
221 }
222 inlen = outlen = sz;
223
224 cd = iconv_open("UTF-8", "UNICODE");
225 if (cd == (iconv_t)(-1)) {
226 char errbuf[128];
227 cli_strerror(errno, errbuf, sizeof(errbuf));
228 cli_errmsg("%s: Failed to initialize iconv for encoding %s: %s\n", parent, HANGUL_NUMERICAL, errbuf);
229 break;
230 }
231
232 iconv(cd, (char **)(&p1), &inlen, &p2, &outlen);
233 iconv_close(cd);
234
235 /* no data was converted */
236 if (outlen == sz)
237 break;
238
239 outbuf[sz - outlen] = '\0';
240
241 if (!(res = strdup(outbuf))) {
242 cli_errmsg("%s: Failed to allocate memory for encoding conversion buffer\n", parent);
243 rc = CL_EMEM;
244 break;
245 }
246 } while (0);
247
248 if (inbuf)
249 free(inbuf);
250 if (outbuf)
251 free(outbuf);
252 #endif
253 /* safety base64 encoding */
254 if (!res && (rc == CL_SUCCESS)) {
255 char *tmpbuf;
256
257 tmpbuf = cli_calloc(1, sz + 1);
258 if (tmpbuf) {
259 memcpy(tmpbuf, begin, sz);
260
261 res = (char *)cl_base64_encode(tmpbuf, sz);
262 if (res)
263 rc = CL_VIRUS; /* used as placeholder */
264 else
265 rc = CL_EMEM;
266
267 free(tmpbuf);
268 } else {
269 cli_errmsg("%s: Failed to allocate memory for temporary buffer\n", parent);
270 rc = CL_EMEM;
271 }
272 }
273
274 (*ret) = rc;
275 return res;
276 }
277
278 /*** HWPOLE2 ***/
cli_scanhwpole2(cli_ctx * ctx)279 cl_error_t cli_scanhwpole2(cli_ctx *ctx)
280 {
281 fmap_t *map = ctx->fmap;
282 uint32_t usize, asize;
283
284 asize = (uint32_t)(map->len - sizeof(usize));
285
286 if (fmap_readn(map, &usize, 0, sizeof(usize)) != sizeof(usize)) {
287 cli_errmsg("HWPOLE2: Failed to read uncompressed ole2 filesize\n");
288 return CL_EREAD;
289 }
290
291 if (usize != asize)
292 cli_warnmsg("HWPOLE2: Mismatched uncompressed prefix and size: %u != %u\n", usize, asize);
293 else
294 cli_dbgmsg("HWPOLE2: Matched uncompressed prefix and size: %u == %u\n", usize, asize);
295
296 return cli_magic_scan_nested_fmap_type(map, 4, 0, ctx, CL_TYPE_ANY, NULL);
297 //return cli_magic_scan_nested_fmap_type(map, 4, 0, ctx, CL_TYPE_OLE2);
298 }
299
300 /*** HWP5 ***/
301
cli_hwp5header(cli_ctx * ctx,hwp5_header_t * hwp5)302 cl_error_t cli_hwp5header(cli_ctx *ctx, hwp5_header_t *hwp5)
303 {
304 if (!ctx || !hwp5)
305 return CL_ENULLARG;
306
307 #if HAVE_JSON
308 if (SCAN_COLLECT_METADATA) {
309 json_object *header, *flags;
310
311 header = cli_jsonobj(ctx->wrkproperty, "Hwp5Header");
312 if (!header) {
313 cli_errmsg("HWP5.x: No memory for Hwp5Header object\n");
314 return CL_EMEM;
315 }
316
317 /* version */
318 cli_jsonint(header, "RawVersion", hwp5->version);
319
320 /* flags */
321 cli_jsonint(header, "RawFlags", hwp5->flags);
322
323 flags = cli_jsonarray(header, "Flags");
324 if (!flags) {
325 cli_errmsg("HWP5.x: No memory for Hwp5Header/Flags array\n");
326 return CL_EMEM;
327 }
328
329 if (hwp5->flags & HWP5_COMPRESSED) {
330 cli_jsonstr(flags, NULL, "HWP5_COMPRESSED");
331 }
332 if (hwp5->flags & HWP5_PASSWORD) {
333 cli_jsonstr(flags, NULL, "HWP5_PASSWORD");
334 }
335 if (hwp5->flags & HWP5_DISTRIBUTABLE) {
336 cli_jsonstr(flags, NULL, "HWP5_DISTRIBUTABLE");
337 }
338 if (hwp5->flags & HWP5_SCRIPT) {
339 cli_jsonstr(flags, NULL, "HWP5_SCRIPT");
340 }
341 if (hwp5->flags & HWP5_DRM) {
342 cli_jsonstr(flags, NULL, "HWP5_DRM");
343 }
344 if (hwp5->flags & HWP5_XMLTEMPLATE) {
345 cli_jsonstr(flags, NULL, "HWP5_XMLTEMPLATE");
346 }
347 if (hwp5->flags & HWP5_HISTORY) {
348 cli_jsonstr(flags, NULL, "HWP5_HISTORY");
349 }
350 if (hwp5->flags & HWP5_CERT_SIGNED) {
351 cli_jsonstr(flags, NULL, "HWP5_CERT_SIGNED");
352 }
353 if (hwp5->flags & HWP5_CERT_ENCRYPTED) {
354 cli_jsonstr(flags, NULL, "HWP5_CERT_ENCRYPTED");
355 }
356 if (hwp5->flags & HWP5_CERT_EXTRA) {
357 cli_jsonstr(flags, NULL, "HWP5_CERT_EXTRA");
358 }
359 if (hwp5->flags & HWP5_CERT_DRM) {
360 cli_jsonstr(flags, NULL, "HWP5_CERT_DRM");
361 }
362 if (hwp5->flags & HWP5_CCL) {
363 cli_jsonstr(flags, NULL, "HWP5_CCL");
364 }
365 }
366 #endif
367 return CL_SUCCESS;
368 }
369
hwp5_cb(void * cbdata,int fd,const char * filepath,cli_ctx * ctx)370 static cl_error_t hwp5_cb(void *cbdata, int fd, const char *filepath, cli_ctx *ctx)
371 {
372 UNUSEDPARAM(cbdata);
373
374 if (fd < 0 || !ctx)
375 return CL_ENULLARG;
376
377 return cli_magic_scan_desc(fd, filepath, ctx, NULL);
378 }
379
cli_scanhwp5_stream(cli_ctx * ctx,hwp5_header_t * hwp5,char * name,int fd,const char * filepath)380 cl_error_t cli_scanhwp5_stream(cli_ctx *ctx, hwp5_header_t *hwp5, char *name, int fd, const char *filepath)
381 {
382 hwp5_debug("HWP5.x: NAME: %s\n", name ? name : "(NULL)");
383
384 if (fd < 0) {
385 cli_errmsg("HWP5.x: Invalid file descriptor argument\n");
386 return CL_ENULLARG;
387 }
388
389 if (name) {
390 /* encrypted and compressed streams */
391 if (!strncmp(name, "bin", 3) || !strncmp(name, "jscriptversion", 14) ||
392 !strncmp(name, "defaultjscript", 14) || !strncmp(name, "section", 7) ||
393 !strncmp(name, "viewtext", 8) || !strncmp(name, "docinfo", 7)) {
394
395 if (hwp5->flags & HWP5_PASSWORD) {
396 cli_dbgmsg("HWP5.x: Password encrypted stream, scanning as-is\n");
397 return cli_magic_scan_desc(fd, filepath, ctx, name);
398 }
399
400 if (hwp5->flags & HWP5_COMPRESSED) {
401 /* DocInfo JSON Handling */
402 STATBUF statbuf;
403 fmap_t *input;
404 cl_error_t ret;
405
406 hwp5_debug("HWP5.x: Sending %s for decompress and scan\n", name);
407
408 /* fmap the input file for easier manipulation */
409 if (FSTAT(fd, &statbuf) == -1) {
410 cli_errmsg("HWP5.x: Can't stat file descriptor\n");
411 return CL_ESTAT;
412 }
413
414 input = fmap(fd, 0, statbuf.st_size, NULL);
415 if (!input) {
416 cli_errmsg("HWP5.x: Failed to get fmap for input stream\n");
417 return CL_EMAP;
418 }
419 ret = decompress_and_callback(ctx, input, 0, 0, "HWP5.x", hwp5_cb, NULL);
420 funmap(input);
421 return ret;
422 }
423 }
424
425 #if HAVE_JSON
426 /* JSON Output Summary Information */
427 if (SCAN_COLLECT_METADATA && ctx->properties != NULL) {
428 if (name && !strncmp(name, "_5_hwpsummaryinformation", 24)) {
429 cli_dbgmsg("HWP5.x: Detected a '_5_hwpsummaryinformation' stream\n");
430 /* JSONOLE2 - what to do if something breaks? */
431 if (cli_ole2_summary_json(ctx, fd, 2) == CL_ETIMEOUT)
432 return CL_ETIMEOUT;
433 }
434 }
435
436 #endif
437 }
438
439 /* normal streams */
440 return cli_magic_scan_desc(fd, filepath, ctx, name);
441 }
442
443 /*** HWP3 ***/
444
/* all fields use little endian and unicode encoding, if applicable */
446
//File Identification Information - (30 total bytes)
#define HWP3_IDENTITY_INFO_SIZE 30

//Document Information - (128 total bytes)
#define HWP3_DOCINFO_SIZE 128

/* Byte offsets of the fields read from the document information block */
#define DI_WRITEPROT 24 /* offset 24 (4 bytes) - write protection */
#define DI_EXTERNAPP 28 /* offset 28 (2 bytes) - external application */
#define DI_PNAME 32 /* offset 32 (40 x 1 bytes) - print name */
#define DI_ANNOTE 72 /* offset 72 (24 x 1 bytes) - annotation */
#define DI_PASSWD 96 /* offset 96 (2 bytes) - password protected */
#define DI_COMPRESSED 124 /* offset 124 (1 byte) - compression */
#define DI_INFOBLKSIZE 126 /* offset 126 (2 bytes) - information block length */
/*
 * Numeric fields extracted from the document information block by
 * parsehwp3_docinfo(); multi-byte members are stored in host byte order.
 * The print name and annotation fields are not kept here.
 */
struct hwp3_docinfo {
    uint32_t di_writeprot;   /* non-zero is reported as HWP3_WRITEPROTECTED */
    uint16_t di_externapp;   /* non-zero is reported as HWP3_EXTERNALAPPLICATION */
    uint16_t di_passwd;      /* non-zero is reported as HWP3_PASSWORD */
    uint8_t di_compressed;   /* non-zero is reported as HWP3_COMPRESSED */
    uint16_t di_infoblksize; /* information block length */
};
467
468 //Document Summary - (1008 total bytes)
469 #define HWP3_DOCSUMMARY_SIZE 1008
/*
 * Layout of the document summary block: one entry per string field, giving
 * the byte offset of the field inside the block and the JSON key under which
 * parsehwp3_docsummary() reports it. Every field is 56 two-byte characters
 * (112 bytes) long.
 */
struct hwp3_docsummary_entry {
    size_t offset;
    const char *name;
} hwp3_docsummary_fields[] = {
    {0, "Title"},      /* offset 0 (56 x 2 bytes) - title */
    {112, "Subject"},  /* offset 112 (56 x 2 bytes) - subject */
    {224, "Author"},   /* offset 224 (56 x 2 bytes) - author */
    {336, "Date"},     /* offset 336 (56 x 2 bytes) - date */
    {448, "Keyword1"}, /* offset 448 (2 x 56 x 2 bytes) - keywords */
    {560, "Keyword2"},

    {672, "Etc0"}, /* offset 672 (3 x 56 x 2 bytes) - etc */
    {784, "Etc1"},
    {896, "Etc2"}};
/* Number of table entries; parenthesized so it acts as a single operand in any expression */
#define NUM_DOCSUMMARY_FIELDS (sizeof(hwp3_docsummary_fields) / sizeof(hwp3_docsummary_fields[0]))
485
//Document Paragraph Information - (43 or 230 total bytes)
/* Sizes in bytes of the structures walked by parsehwp3_paragraph() */
#define HWP3_PARAINFO_SIZE_S 43
#define HWP3_PARAINFO_SIZE_L 230
#define HWP3_LINEINFO_SIZE 14
#define HWP3_CHARSHPDATA_SIZE 31

#define HWP3_FIELD_LENGTH 512

/* Byte offsets inside a paragraph information block */
#define PI_PPFS 0 /* offset 0 (1 byte) - prior paragraph format style */
#define PI_NCHARS 1 /* offset 1 (2 bytes) - character count */
#define PI_NLINES 3 /* offset 3 (2 bytes) - line count */
#define PI_IFSC 5 /* offset 5 (1 byte) - including font style of characters */
#define PI_FLAGS 6 /* offset 6 (1 byte) - other flags */
#define PI_SPECIAL 7 /* offset 7 (4 bytes) - special characters markers */
#define PI_ISTYLE 11 /* offset 11 (1 byte) - paragraph style index */

/* Byte offsets inside a line information block */
#define PLI_LOFF 0 /* offset 0 (2 bytes) - line starting offset */
#define PLI_LCOR 2 /* offset 2 (2 bytes) - line blank correction */
#define PLI_LHEI 4 /* offset 4 (2 bytes) - line max char height */
#define PLI_LPAG 12 /* offset 12 (2 bytes) - line pagination*/

/* Byte offsets inside a character shape data block */
#define PCSD_SIZE 0 /* offset 0 (2 bytes) - size of characters */
#define PCSD_PROP 26 /* offset 26 (1 byte) - properties */
509
parsehwp3_docinfo(cli_ctx * ctx,size_t offset,struct hwp3_docinfo * docinfo)510 static inline cl_error_t parsehwp3_docinfo(cli_ctx *ctx, size_t offset, struct hwp3_docinfo *docinfo)
511 {
512 const uint8_t *hwp3_ptr;
513 cl_error_t iret;
514
515 //TODO: use fmap_readn?
516 if (!(hwp3_ptr = fmap_need_off_once(ctx->fmap, offset, HWP3_DOCINFO_SIZE))) {
517 cli_errmsg("HWP3.x: Failed to read fmap for hwp docinfo\n");
518 return CL_EMAP;
519 }
520
521 memcpy(&(docinfo->di_writeprot), hwp3_ptr + DI_WRITEPROT, sizeof(docinfo->di_writeprot));
522 memcpy(&(docinfo->di_externapp), hwp3_ptr + DI_EXTERNAPP, sizeof(docinfo->di_externapp));
523 memcpy(&(docinfo->di_passwd), hwp3_ptr + DI_PASSWD, sizeof(docinfo->di_passwd));
524 memcpy(&(docinfo->di_compressed), hwp3_ptr + DI_COMPRESSED, sizeof(docinfo->di_compressed));
525 memcpy(&(docinfo->di_infoblksize), hwp3_ptr + DI_INFOBLKSIZE, sizeof(docinfo->di_infoblksize));
526
527 docinfo->di_writeprot = le32_to_host(docinfo->di_writeprot);
528 docinfo->di_externapp = le16_to_host(docinfo->di_externapp);
529 docinfo->di_passwd = le16_to_host(docinfo->di_passwd);
530 docinfo->di_infoblksize = le16_to_host(docinfo->di_infoblksize);
531
532 hwp3_debug("HWP3.x: di_writeprot: %u\n", docinfo->di_writeprot);
533 hwp3_debug("HWP3.x: di_externapp: %u\n", docinfo->di_externapp);
534 hwp3_debug("HWP3.x: di_passwd: %u\n", docinfo->di_passwd);
535 hwp3_debug("HWP3.x: di_compressed: %u\n", docinfo->di_compressed);
536 hwp3_debug("HWP3.x: di_infoblksize: %u\n", docinfo->di_infoblksize);
537
538 #if HAVE_JSON
539 if (SCAN_COLLECT_METADATA) {
540 json_object *header, *flags;
541 char *str;
542
543 header = cli_jsonobj(ctx->wrkproperty, "Hwp3Header");
544 if (!header) {
545 cli_errmsg("HWP3.x: No memory for Hwp3Header object\n");
546 return CL_EMEM;
547 }
548
549 flags = cli_jsonarray(header, "Flags");
550 if (!flags) {
551 cli_errmsg("HWP5.x: No memory for Hwp5Header/Flags array\n");
552 return CL_EMEM;
553 }
554
555 if (docinfo->di_writeprot) {
556 cli_jsonstr(flags, NULL, "HWP3_WRITEPROTECTED"); /* HWP3_DISTRIBUTABLE */
557 }
558 if (docinfo->di_externapp) {
559 cli_jsonstr(flags, NULL, "HWP3_EXTERNALAPPLICATION");
560 }
561 if (docinfo->di_passwd) {
562 cli_jsonstr(flags, NULL, "HWP3_PASSWORD");
563 }
564 if (docinfo->di_compressed) {
565 cli_jsonstr(flags, NULL, "HWP3_COMPRESSED");
566 }
567
568 /* Printed File Name */
569 str = convert_hstr_to_utf8((char *)(hwp3_ptr + DI_PNAME), 40, "HWP3.x", &iret);
570 if (!str)
571 return CL_EMEM;
572
573 if (iret == CL_VIRUS)
574 cli_jsonbool(header, "PrintName_base64", 1);
575
576 hwp3_debug("HWP3.x: di_pname: %s\n", str);
577 cli_jsonstr(header, "PrintName", str);
578 free(str);
579
580 /* Annotation */
581 str = convert_hstr_to_utf8((char *)(hwp3_ptr + DI_ANNOTE), 24, "HWP3.x", &iret);
582 if (!str)
583 return CL_EMEM;
584
585 if (iret == CL_VIRUS)
586 cli_jsonbool(header, "Annotation_base64", 1);
587
588 hwp3_debug("HWP3.x: di_annote: %s\n", str);
589 cli_jsonstr(header, "Annotation", str);
590 free(str);
591 }
592 #endif
593
594 return CL_SUCCESS;
595 }
596
parsehwp3_docsummary(cli_ctx * ctx,size_t offset)597 static inline cl_error_t parsehwp3_docsummary(cli_ctx *ctx, size_t offset)
598 {
599 #if HAVE_JSON
600 const uint8_t *hwp3_ptr;
601 char *str;
602 size_t i;
603 cl_error_t ret, iret;
604
605 json_object *summary;
606
607 if (!SCAN_COLLECT_METADATA)
608 return CL_SUCCESS;
609
610 if (!(hwp3_ptr = fmap_need_off_once(ctx->fmap, offset, HWP3_DOCSUMMARY_SIZE))) {
611 cli_errmsg("HWP3.x: Failed to read fmap for hwp docinfo\n");
612 return CL_EMAP;
613 }
614
615 summary = cli_jsonobj(ctx->wrkproperty, "Hwp3SummaryInfo");
616 if (!summary) {
617 cli_errmsg("HWP3.x: No memory for json object\n");
618 return CL_EMEM;
619 }
620
621 for (i = 0; i < NUM_DOCSUMMARY_FIELDS; i++) {
622 str = convert_hstr_to_utf8((char *)(hwp3_ptr + hwp3_docsummary_fields[i].offset), 112, "HWP3.x", &iret);
623 if (!str)
624 return CL_EMEM;
625
626 if (iret == CL_VIRUS) {
627 char *b64;
628 size_t b64len = strlen(hwp3_docsummary_fields[i].name) + 8;
629 b64 = cli_calloc(1, b64len);
630 if (!b64) {
631 cli_errmsg("HWP3.x: Failed to allocate memory for b64 boolean\n");
632 free(str);
633 return CL_EMEM;
634 }
635 snprintf(b64, b64len, "%s_base64", hwp3_docsummary_fields[i].name);
636 cli_jsonbool(summary, b64, 1);
637 free(b64);
638 }
639
640 hwp3_debug("HWP3.x: %s, %s\n", hwp3_docsummary_fields[i].name, str);
641 ret = cli_jsonstr(summary, hwp3_docsummary_fields[i].name, str);
642 free(str);
643 if (ret != CL_SUCCESS)
644 return ret;
645 }
646 #else
647 UNUSEDPARAM(ctx);
648 UNUSEDPARAM(offset);
649 #endif
650 return CL_SUCCESS;
651 }
652
/*
 * HWP3_PSPECIAL_VERIFY(map, offset, second, id, match)
 *
 * With HWP3_VERIFY enabled: reads the 16-bit value stored at offset + second
 * in map into match, converts it to host byte order and compares it with id.
 * The *enclosing function* returns CL_EREAD if the read fails and CL_EFORMAT
 * if the values differ. match must be an assignable 16-bit lvalue.
 *
 * With HWP3_VERIFY disabled the macro expands to nothing and none of its
 * arguments are evaluated.
 */
#if HWP3_VERIFY
#define HWP3_PSPECIAL_VERIFY(map, offset, second, id, match)                                  \
    do {                                                                                      \
        if (fmap_readn((map), &(match), (offset) + (second), sizeof(match)) != sizeof(match)) \
            return CL_EREAD;                                                                  \
                                                                                              \
        (match) = le16_to_host(match);                                                        \
                                                                                              \
        if ((id) != (match)) {                                                                \
            cli_errmsg("HWP3.x: ID %u block fails verification\n", (unsigned int)(id));       \
            return CL_EFORMAT;                                                                \
        }                                                                                     \
    } while (0)

#else
#define HWP3_PSPECIAL_VERIFY(map, offset, second, id, match)
#endif
670
parsehwp3_paragraph(cli_ctx * ctx,fmap_t * map,int p,uint32_t level,size_t * roffset,int * last)671 static inline cl_error_t parsehwp3_paragraph(cli_ctx *ctx, fmap_t *map, int p, uint32_t level, size_t *roffset, int *last)
672 {
673 cl_error_t ret = CL_SUCCESS;
674
675 size_t offset = *roffset;
676 size_t new_offset;
677 uint16_t nchars, nlines, content;
678 uint8_t ppfs, ifsc, cfsb;
679 uint16_t i;
680 int c, l, sp = 0, term = 0;
681 #if HWP3_VERIFY
682 uint16_t match;
683 #endif
684 #if HWP3_DEBUG
685 /* other paragraph info */
686 uint8_t flags, istyle;
687 uint16_t fsize;
688 uint32_t special;
689
690 /* line info */
691 uint16_t loff, lcor, lhei, lpag;
692
693 /* char shape data */
694 uint16_t pcsd_size;
695 uint8_t pcsd_prop;
696 #endif
697
698 hwp3_debug("HWP3.x: recursion level: %u\n", level);
699 hwp3_debug("HWP3.x: Paragraph[%u, %d] starts @ offset %zu\n", level, p, offset);
700
701 if (level >= ctx->engine->maxrechwp3)
702 return CL_EMAXREC;
703
704 if (fmap_readn(map, &ppfs, offset + PI_PPFS, sizeof(ppfs)) != sizeof(ppfs))
705 return CL_EREAD;
706
707 if (fmap_readn(map, &nchars, offset + PI_NCHARS, sizeof(nchars)) != sizeof(nchars))
708 return CL_EREAD;
709
710 nchars = le16_to_host(nchars);
711
712 if (fmap_readn(map, &nlines, offset + PI_NLINES, sizeof(nlines)) != sizeof(nlines))
713 return CL_EREAD;
714
715 nlines = le16_to_host(nlines);
716
717 if (fmap_readn(map, &ifsc, offset + PI_IFSC, sizeof(ifsc)) != sizeof(ifsc))
718 return CL_EREAD;
719
720 hwp3_debug("HWP3.x: Paragraph[%u, %d]: ppfs %u\n", level, p, ppfs);
721 hwp3_debug("HWP3.x: Paragraph[%u, %d]: nchars %u\n", level, p, nchars);
722 hwp3_debug("HWP3.x: Paragraph[%u, %d]: nlines %u\n", level, p, nlines);
723 hwp3_debug("HWP3.x: Paragraph[%u, %d]: ifsc %u\n", level, p, ifsc);
724
725 #if HWP3_DEBUG
726 if (fmap_readn(map, &flags, offset + PI_FLAGS, sizeof(flags)) != sizeof(flags))
727 return CL_EREAD;
728
729 if (fmap_readn(map, &special, offset + PI_SPECIAL, sizeof(special)) != sizeof(special))
730 return CL_EREAD;
731
732 if (fmap_readn(map, &istyle, offset + PI_ISTYLE, sizeof(istyle)) != sizeof(istyle))
733 return CL_EREAD;
734
735 if (fmap_readn(map, &fsize, offset + 12, sizeof(fsize)) != sizeof(fsize))
736 return CL_EREAD;
737
738 hwp3_debug("HWP3.x: Paragraph[%u, %d]: flags %x\n", level, p, flags);
739 hwp3_debug("HWP3.x: Paragraph[%u, %d]: spcl %x\n", level, p, special);
740 hwp3_debug("HWP3.x: Paragraph[%u, %d]: istyle %u\n", level, p, istyle);
741 hwp3_debug("HWP3.x: Paragraph[%u, %d]: fsize %u\n", level, p, fsize);
742 #endif
743
744 /* detected empty paragraph marker => end-of-paragraph list */
745 if (nchars == 0) {
746 hwp3_debug("HWP3.x: Detected end-of-paragraph list @ offset %zu\n", offset);
747 hwp3_debug("HWP3.x: end recursion level: %u\n", level);
748 (*roffset) = offset + HWP3_PARAINFO_SIZE_S;
749 (*last) = 1;
750 return CL_SUCCESS;
751 }
752
753 if (ppfs)
754 offset += HWP3_PARAINFO_SIZE_S;
755 else
756 offset += HWP3_PARAINFO_SIZE_L;
757
758 /* line information blocks */
759 #if HWP3_DEBUG
760 for (i = 0; (i < nlines) && (offset < map->len); i++) {
761 hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d information starts @ offset %zu\n", level, p, i, offset);
762 if (fmap_readn(map, &loff, offset + PLI_LOFF, sizeof(loff)) != sizeof(loff))
763 return CL_EREAD;
764
765 if (fmap_readn(map, &lcor, offset + PLI_LCOR, sizeof(lcor)) != sizeof(lcor))
766 return CL_EREAD;
767
768 if (fmap_readn(map, &lhei, offset + PLI_LHEI, sizeof(lhei)) != sizeof(lhei))
769 return CL_EREAD;
770
771 if (fmap_readn(map, &lpag, offset + PLI_LPAG, sizeof(lpag)) != sizeof(lpag))
772 return CL_EREAD;
773
774 loff = le16_to_host(loff);
775 lcor = le16_to_host(lcor);
776 lhei = le16_to_host(lhei);
777 lpag = le16_to_host(lpag);
778
779 hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d: loff %u\n", level, p, i, loff);
780 hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d: lcor %x\n", level, p, i, lcor);
781 hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d: lhei %u\n", level, p, i, lhei);
782 hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d: lpag %u\n", level, p, i, lpag);
783
784 offset += HWP3_LINEINFO_SIZE;
785 }
786 #else
787 new_offset = offset + (nlines * HWP3_LINEINFO_SIZE);
788 if ((new_offset < offset) || (new_offset >= map->len)) {
789 cli_errmsg("HWP3.x: Paragraph[%u, %d]: nlines value is too high, invalid. %u\n", level, p, nlines);
790 return CL_EPARSE;
791 }
792 offset = new_offset;
793 #endif
794
795 if (offset >= map->len)
796 return CL_EFORMAT;
797
798 if (ifsc) {
799 for (i = 0, c = 0; i < nchars; i++) {
800 /* examine byte for cs data type */
801 if (fmap_readn(map, &cfsb, offset, sizeof(cfsb)) != sizeof(cfsb))
802 return CL_EREAD;
803
804 offset += sizeof(cfsb);
805
806 switch (cfsb) {
807 case 0: /* character shape block */
808 hwp3_debug("HWP3.x: Paragraph[%u, %d]: character font style data @ offset %zu\n", level, p, offset);
809
810 #if HWP3_DEBUG
811 if (fmap_readn(map, &pcsd_size, offset + PCSD_SIZE, sizeof(pcsd_size)) != sizeof(pcsd_size))
812 return CL_EREAD;
813
814 if (fmap_readn(map, &pcsd_prop, offset + PCSD_PROP, sizeof(pcsd_prop)) != sizeof(pcsd_prop))
815 return CL_EREAD;
816
817 pcsd_size = le16_to_host(pcsd_size);
818
819 hwp3_debug("HWP3.x: Paragraph[%u, %d]: CFS %u: pcsd_size %u\n", level, p, 0, pcsd_size);
820 hwp3_debug("HWP3.x: Paragraph[%u, %d]: CFS %u: pcsd_prop %x\n", level, p, 0, pcsd_prop);
821 #endif
822
823 c++;
824 offset += HWP3_CHARSHPDATA_SIZE;
825 break;
826 case 1: /* normal character - as representation of another character for previous cs block */
827 break;
828 default:
829 cli_errmsg("HWP3.x: Paragraph[%u, %d]: unknown CFS type 0x%x @ offset %zu\n", level, p, cfsb, offset);
830 cli_errmsg("HWP3.x: Paragraph parsing detected %d of %u characters\n", i, nchars);
831 return CL_EPARSE;
832 }
833 }
834
835 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected %d CFS block(s) and %d characters\n", level, p, c, i);
836 } else {
837 hwp3_debug("HWP3.x: Paragraph[%u, %d]: separate character font style segment not stored\n", level, p);
838 }
839
840 if (!term)
841 hwp3_debug("HWP3.x: Paragraph[%u, %d]: content starts @ offset %zu\n", level, p, offset);
842
843 /* scan for end-of-paragraph [0x0d00 on offset parity to current content] */
844 while ((!term) &&
845 (offset < map->len)) {
846
847 if (fmap_readn(map, &content, offset, sizeof(content)) != sizeof(content))
848 return CL_EREAD;
849
850 content = le16_to_host(content);
851
852 /* special character handling */
853 if (content < 32) {
854 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected special character %u @ offset %zu\n", level, p, content, offset);
855
856 switch (content) {
857 case 0:
858 case 1:
859 case 2:
860 case 3:
861 case 4:
862 case 12:
863 case 27: {
864 /* reserved */
865 uint32_t length;
866
867 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected special character as [reserved]\n", level, p);
868
869 /*
870 * offset 0 (2 bytes) - special character ID
871 * offset 2 (4 bytes) - length of information = n
872 * offset 6 (2 bytes) - special character ID
873 * offset 8 (n bytes) - information
874 */
875
876 /* id block verification (only on HWP3_VERIFY) */
877 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
878
879 if (fmap_readn(map, &length, offset + 2, sizeof(length)) != sizeof(length))
880 return CL_EREAD;
881
882 length = le32_to_host(length);
883 new_offset = offset + (8 + length);
884 if ((new_offset <= offset) || (new_offset > map->len)) {
885 cli_errmsg("HWP3.x: Paragraph[%u, %d]: length value is too high, invalid. %u\n", level, p, length);
886 return CL_EPARSE;
887 }
888 offset = new_offset;
889
890 #if HWP3_DEBUG
891 cli_errmsg("HWP3.x: Paragraph[%u, %d]: possible invalid usage of reserved special character %u\n", level, p, content);
892 return CL_EFORMAT;
893 #endif
894 break;
895 }
896 case 5: /* field codes */
897 {
898 uint32_t length;
899
900 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected field code marker @ offset %zu\n", level, p, offset);
901
902 /*
903 * offset 0 (2 bytes) - special character ID
904 * offset 2 (4 bytes) - length of information = n
905 * offset 6 (2 bytes) - special character ID
906 * offset 8 (n bytes) - field code details
907 */
908
909 /* id block verification (only on HWP3_VERIFY) */
910 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
911
912 if (fmap_readn(map, &length, offset + 2, sizeof(length)) != sizeof(length))
913 return CL_EREAD;
914
915 length = le32_to_host(length);
916 new_offset = offset + (8 + length);
917 if ((new_offset <= offset) || (new_offset > map->len)) {
918 cli_errmsg("HWP3.x: Paragraph[%u, %d]: length value is too high, invalid. %u\n", level, p, length);
919 return CL_EPARSE;
920 }
921 offset = new_offset;
922 break;
923 }
924 case 6: /* bookmark */
925 {
926 #if HWP3_VERIFY
927 uint32_t length;
928 #endif
929
930 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected bookmark marker @ offset %zu\n", level, p, offset);
931
932 /*
933 * offset 0 (2 bytes) - special character ID
934 * offset 2 (4 bytes) - length of information = 34
935 * offset 6 (2 bytes) - special character ID
936 * offset 8 (16 x 2 bytes) - bookmark name
937 * offset 40 (2 bytes) - bookmark type
938 * total is always 42 bytes
939 */
940
941 #if HWP3_VERIFY
942 /* id block verification (only on HWP3_VERIFY) */
943 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
944
945 /* length check - always 34 bytes */
946 if (fmap_readn(map, &length, offset + 2, sizeof(length)) != sizeof(length))
947 return CL_EREAD;
948
949 length = le32_to_host(length);
950
951 if (length != 34) {
952 cli_errmsg("HWP3.x: Bookmark has incorrect length: %u != 34)\n", length);
953 return CL_EFORMAT;
954 }
955 #endif
956 offset += 42;
957 break;
958 }
959 case 7: /* date format */
960 {
961 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected date format marker @ offset %zu\n", level, p, offset);
962
963 /*
964 * offset 0 (2 bytes) - special character ID
965 * offset 2 (40 x 2 bytes) - date format as user-defined dialog
966 * offset 82 (2 bytes) - special character ID
967 * total is always 84 bytes
968 */
969
970 /* id block verification (only on HWP3_VERIFY) */
971 HWP3_PSPECIAL_VERIFY(map, offset, 82, content, match);
972
973 offset += 84;
974 break;
975 }
976 case 8: /* date code */
977 {
978 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected date code marker @ offset %zu\n", level, p, offset);
979
980 /*
981 * offset 0 (2 bytes) - special character ID
982 * offset 2 (40 x 2 bytes) - date format string
983 * offset 82 (4 x 2 bytes) - date (year, month, day of week)
984 * offset 90 (2 x 2 bytes) - time (hour, minute)
985 * offset 94 (2 bytes) - special character ID
986 * total is always 96 bytes
987 */
988
989 /* id block verification (only on HWP3_VERIFY) */
990 HWP3_PSPECIAL_VERIFY(map, offset, 94, content, match);
991
992 offset += 96;
993 break;
994 }
995 case 9: /* tab */
996 {
997 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected tab marker @ offset %zu\n", level, p, offset);
998
999 /*
1000 * offset 0 (2 bytes) - special character ID
1001 * offset 2 (2 bytes) - tab width
1002 * offset 4 (2 bytes) - unknown(?)
1003 * offset 6 (2 bytes) - special character ID
1004 * total is always 8 bytes
1005 */
1006
1007 /* id block verification (only on HWP3_VERIFY) */
1008 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1009
1010 offset += 8;
1011 break;
1012 }
1013 case 10: /* table, test box, equation, button, hypertext */
1014 {
1015 uint16_t ncells;
1016 #if HWP3_DEBUG
1017 uint16_t type;
1018 #endif
1019 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected box object marker @ offset %zu\n", level, p, offset);
1020
1021 /* verification (only on HWP3_VERIFY) */
1022 /* id block verify */
1023 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1024 /* extra data block verify */
1025 HWP3_PSPECIAL_VERIFY(map, offset, 24, content, match);
1026
1027 /* ID block is 8 bytes */
1028 offset += 8;
1029
1030 /* box information (84 bytes) */
1031 #if HWP3_DEBUG
1032 /* box type located at offset 78 of box information */
1033 if (fmap_readn(map, &type, offset + 78, sizeof(type)) != sizeof(type))
1034 return CL_EREAD;
1035
1036 type = le16_to_host(type);
1037 if (type == 0)
1038 hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as table\n", level, p);
1039 else if (type == 1)
1040 hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as text box\n", level, p);
1041 else if (type == 2)
1042 hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as equation\n", level, p);
1043 else if (type == 3)
1044 hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as button\n", level, p);
1045 else
1046 hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as UNKNOWN(%u)\n", level, p, type);
1047 #endif
1048
1049 /* ncells is located at offset 80 of box information */
1050 if (fmap_readn(map, &ncells, offset + 80, sizeof(ncells)) != sizeof(ncells))
1051 return CL_EREAD;
1052
1053 ncells = le16_to_host(ncells);
1054 offset += 84;
1055
1056 hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object contains %u cell(s)\n", level, p, ncells);
1057
1058 /* cell information (27 bytes x ncells(offset 80 of table)) */
1059 hwp3_debug("HWP3.x: Paragraph[%u, %d]: box cell info array starts @ %zu\n", level, p, offset);
1060
1061 new_offset = offset + (27 * ncells);
1062 if ((new_offset < offset) || (new_offset >= map->len)) {
1063 cli_errmsg("HWP3.x: Paragraph[%u, %d]: number of box cells is too high, invalid. %u\n", level, p, ncells);
1064 return CL_EPARSE;
1065 }
1066 offset = new_offset;
1067
1068 /* cell paragraph list */
1069 hwp3_debug("HWP3.x: Paragraph[%u, %d]: box cell paragraph list starts @ %zu\n", level, p, offset);
1070 for (i = 0; i < ncells; i++) {
1071 l = 0;
1072 while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1073 if (ret != CL_SUCCESS)
1074 return ret;
1075 }
1076
1077 /* box caption paragraph list */
1078 hwp3_debug("HWP3.x: Paragraph[%u, %d]: box cell caption paragraph list starts @ %zu\n", level, p, offset);
1079 l = 0;
1080 while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1081 if (ret != CL_SUCCESS)
1082 return ret;
1083 break;
1084 }
1085 case 11: /* drawing */
1086 {
1087 uint32_t size;
1088 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected drawing marker @ offset %zu\n", level, p, offset);
1089
1090 /* verification (only on HWP3_VERIFY) */
1091 /* id block verify */
1092 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1093 /* extra data block verify */
1094 HWP3_PSPECIAL_VERIFY(map, offset, 24, content, match);
1095
1096 /* ID block is 8 bytes */
1097 offset += 8;
1098
1099 /* Drawing Info Block is 328+n bytes with n = size of image */
1100 /* n is located at offset 0 of info block */
1101 if (fmap_readn(map, &size, offset, sizeof(size)) != sizeof(size))
1102 return CL_EREAD;
1103
1104 hwp3_debug("HWP3.x: Paragraph[%u, %d]: drawing is %u additional bytes\n", level, p, size);
1105
1106 size = le32_to_host(size);
1107 new_offset = offset + (348 + size);
1108 if ((new_offset <= offset) || (new_offset >= map->len)) {
1109 cli_errmsg("HWP3.x: Paragraph[%u, %d]: image size value is too high, invalid. %u\n", level, p, size);
1110 return CL_EPARSE;
1111 }
1112 offset = new_offset;
1113
1114 /* caption paragraph list */
1115 hwp3_debug("HWP3.x: Paragraph[%u, %d]: drawing caption paragraph list starts @ %zu\n", level, p, offset);
1116 l = 0;
1117 while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1118 if (ret != CL_SUCCESS)
1119 return ret;
1120 break;
1121 }
1122 case 13: /* end-of-paragraph marker - treated identically as character */
1123 hwp3_debug("HWP3.x: Detected end-of-paragraph marker @ offset %zu\n", offset);
1124 term = 1;
1125
1126 offset += sizeof(content);
1127 break;
1128 case 14: /* line information */
1129 {
1130 hwp3_debug("HWP3.x: Detected line information marker @ offset %zu\n", offset);
1131
1132 /* verification (only on HWP3_VERIFY) */
1133 /* id block verify */
1134 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1135 /* extra data block verify */
1136 HWP3_PSPECIAL_VERIFY(map, offset, 24, content, match);
1137
1138 /* ID block is 8 bytes + line information is always 84 bytes */
1139 offset += 92;
1140 break;
1141 }
1142 case 15: /* hidden description */
1143 {
1144 hwp3_debug("HWP3.x: Detected hidden description marker @ offset %zu\n", offset);
1145
1146 /*
1147 * offset 0 (2 bytes) - special character ID
1148 * offset 2 (4 bytes) - reserved
1149 * offset 6 (2 bytes) - special character ID
1150 * offset 8 (8 bytes) - reserved
1151 * total is always 16 bytes
1152 */
1153
1154 /* id block verification (only on HWP3_VERIFY) */
1155 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1156
1157 offset += 16;
1158
1159 /* hidden description paragraph list */
1160 hwp3_debug("HWP3.x: Paragraph[%u, %d]: hidden description paragraph list starts @ %zu\n", level, p, offset);
1161 l = 0;
1162 while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1163 if (ret != CL_SUCCESS)
1164 return ret;
1165 break;
1166 }
1167 case 16: /* header/footer */
1168 {
1169 #if HWP3_DEBUG
1170 uint8_t type;
1171 #endif
1172
1173 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected header/footer marker @ offset %zu\n", level, p, offset);
1174
1175 /*
1176 * offset 0 (2 bytes) - special character ID
1177 * offset 2 (4 bytes) - reserved
1178 * offset 6 (2 bytes) - special character ID
1179 * offset 8 (8 x 1 byte) - reserved
1180 * offset 16 (1 byte) - type (header/footer)
1181 * offset 17 (1 byte) - kind
1182 * total is always 18 bytes
1183 */
1184
1185 /* id block verification (only on HWP3_VERIFY) */
1186 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1187
1188 #if HWP3_DEBUG
1189 if (fmap_readn(map, &type, offset + 16, sizeof(type)) != sizeof(type))
1190 return CL_EREAD;
1191
1192 if (type == 0)
1193 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected header/footer as header\n", level, p);
1194 else if (type == 1)
1195 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected header/footer as footer\n", level, p);
1196 else
1197 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected header/footer as UNKNOWN(%u)\n", level, p, type);
1198 #endif
1199 offset += 18;
1200
1201 /* content paragraph list */
1202 hwp3_debug("HWP3.x: Paragraph[%u, %d]: header/footer paragraph list starts @ %zu\n", level, p, offset);
1203 l = 0;
1204 while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1205 if (ret != CL_SUCCESS)
1206 return ret;
1207 break;
1208 }
1209 case 17: /* footnote/endnote */
1210 {
1211 hwp3_debug("HWP3.x: Detected footnote/endnote marker @ offset %zu\n", offset);
1212
1213 /*
1214 * offset 0 (2 bytes) - special character ID
1215 * offset 2 (4 bytes) - reserved
1216 * offset 6 (2 bytes) - special character ID
1217 * offset 8 (8 x 1 bytes) - reserved
1218 * offset 16 (2 bytes) - number
1219 * offset 18 (2 bytes) - type
1220 * offset 20 (2 bytes) - alignment
1221 * total is always 22 bytes
1222 */
1223
1224 /* id block verification (only on HWP3_VERIFY) */
1225 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1226
1227 offset += 22;
1228
1229 /* content paragraph list */
1230 hwp3_debug("HWP3.x: Paragraph[%u, %d]: footnote/endnote paragraph list starts @ %zu\n", level, p, offset);
1231 l = 0;
1232 while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1233 if (ret != CL_SUCCESS)
1234 return ret;
1235 break;
1236 }
1237 case 18: /* paste code number */
1238 {
1239 #if HWP3_DEBUG
1240 uint8_t type;
1241 #endif
1242
1243 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number marker @ offset %zu\n", level, p, offset);
1244
1245 /*
1246 * offset 0 (2 bytes) - special character ID
1247 * offset 2 (2 bytes) - type
1248 * offset 4 (2 bytes) - number value
1249 * offset 6 (2 bytes) - special character ID
1250 * total is always 8 bytes
1251 */
1252
1253 /* id block verification (only on HWP3_VERIFY) */
1254 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1255
1256 #if HWP3_DEBUG
1257 if (fmap_readn(map, &type, offset + 2, sizeof(type)) != sizeof(type))
1258 return CL_EREAD;
1259
1260 if (type == 0)
1261 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as side\n", level, p);
1262 else if (type == 1)
1263 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as footnote\n", level, p);
1264 else if (type == 2)
1265 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as North America???\n", level, p);
1266 else if (type == 3)
1267 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as drawing\n", level, p);
1268 else if (type == 4)
1269 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as table\n", level, p);
1270 else if (type == 5)
1271 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as equation\n", level, p);
1272 else
1273 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as UNKNOWN(%u)\n", level, p, type);
1274 #endif
1275 offset += 8;
1276 break;
1277 }
1278 case 19: /* code number change */
1279 {
1280 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected code number change marker @ offset %zu\n", level, p, offset);
1281
1282 /*
1283 * offset 0 (2 bytes) - special character ID
1284 * offset 2 (2 bytes) - type
1285 * offset 4 (2 bytes) - new number value
1286 * offset 6 (2 bytes) - special character ID
1287 * total is always 8 bytes
1288 */
1289
1290 /* id block verification (only on HWP3_VERIFY) */
1291 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1292
1293 offset += 8;
1294 break;
1295 }
1296 case 20: {
1297 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected thread page number marker @ offset %zu\n", level, p, offset);
1298
1299 /*
1300 * offset 0 (2 bytes) - special character ID
1301 * offset 2 (2 bytes) - location
1302 * offset 4 (2 bytes) - shape
1303 * offset 6 (2 bytes) - special character ID
1304 * total is always 8 bytes
1305 */
1306
1307 /* id block verification (only on HWP3_VERIFY) */
1308 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1309
1310 offset += 8;
1311 break;
1312 }
1313 case 21: /* hide special */
1314 {
1315 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected hide special marker @ offset %zu\n", level, p, offset);
1316
1317 /*
1318 * offset 0 (2 bytes) - special character ID
1319 * offset 2 (2 bytes) - type
1320 * offset 4 (2 bytes) - target
1321 * offset 6 (2 bytes) - special character ID
1322 * total is always 8 bytes
1323 */
1324
1325 /* id block verification (only on HWP3_VERIFY) */
1326 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1327
1328 offset += 8;
1329 break;
1330 }
1331 case 22: /* mail merge display */
1332 {
1333 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected mail merge display marker @ offset %zu\n", level, p, offset);
1334
1335 /*
1336 * offset 0 (2 bytes) - special character ID
1337 * offset 2 (20 x 1 bytes) - field name (in ASCII)
1338 * offset 22 (2 bytes) - special character ID
1339 * total is always 24 bytes
1340 */
1341
1342 /* id block verification (only on HWP3_VERIFY) */
1343 HWP3_PSPECIAL_VERIFY(map, offset, 22, content, match);
1344
1345 offset += 24;
1346 break;
1347 }
1348 case 23: /* overlapping letters */
1349 {
1350 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected overlapping marker @ offset %zu\n", level, p, offset);
1351
1352 /*
1353 * offset 0 (2 bytes) - special character ID
1354 * offset 2 (3 x 2 bytes) - overlapping letters
1355 * offset 8 (2 bytes) - special character ID
1356 * total is always 10 bytes
1357 */
1358
1359 /* id block verification (only on HWP3_VERIFY) */
1360 HWP3_PSPECIAL_VERIFY(map, offset, 8, content, match);
1361
1362 offset += 10;
1363 break;
1364 }
1365 case 24: /* hyphen */
1366 {
1367 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected hyphen marker @ offset %zu\n", level, p, offset);
1368
1369 /*
1370 * offset 0 (2 bytes) - special character ID
1371 * offset 2 (2 bytes) - width of hyphen
1372 * offset 4 (2 bytes) - special character ID
1373 * total is always 6 bytes
1374 */
1375
1376 /* id block verification (only on HWP3_VERIFY) */
1377 HWP3_PSPECIAL_VERIFY(map, offset, 4, content, match);
1378
1379 offset += 6;
1380 break;
1381 }
1382 case 25: /* title/table/picture show times */
1383 {
1384 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected title/table/picture show times marker @ offset %zu\n", level, p, offset);
1385
1386 /*
1387 * offset 0 (2 bytes) - special character ID
1388 * offset 2 (2 bytes) - type
1389 * offset 4 (2 bytes) - special character ID
1390 * total is always 6 bytes
1391 */
1392
1393 /* id block verification (only on HWP3_VERIFY) */
1394 HWP3_PSPECIAL_VERIFY(map, offset, 4, content, match);
1395
1396 offset += 6;
1397 break;
1398 }
1399 case 26: /* browse displayed */
1400 {
1401 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected browse displayed marker @ offset %zu\n", level, p, offset);
1402
1403 /*
1404 * offset 0 (2 bytes) - special character ID
1405 * offset 2 (60 x 2 bytes) - keyword 1
1406 * offset 122 (60 x 2 bytes) - keyword 2
1407 * offset 242 (2 bytes) - page number
1408 * offset 244 (2 bytes) - special character ID
1409 * total is always 246 bytes
1410 */
1411
1412 /* id block verification (only on HWP3_VERIFY) */
1413 HWP3_PSPECIAL_VERIFY(map, offset, 244, content, match);
1414
1415 offset += 246;
1416 break;
1417 }
1418 case 28: /* overview shape/summary number */
1419 {
1420 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected overview shape/summary number marker @ offset %zu\n", level, p, offset);
1421
1422 /*
1423 * offset 0 (2 bytes) - special character ID
1424 * offset 2 (2 bytes) - type
1425 * offset 4 (1 byte) - form
1426 * offset 5 (1 byte) - step
1427 * offset 6 (7 x 2 bytes) - summary number
1428 * offset 20 (7 x 2 bytes) - custom
1429 * offset 34 (2 x 7 x 2 bytes) - decorative letters
1430 * offset 62 (2 bytes) - special character ID
1431 * total is always 64 bytes
1432 */
1433
1434 /* id block verification (only on HWP3_VERIFY) */
1435 HWP3_PSPECIAL_VERIFY(map, offset, 62, content, match);
1436
1437 offset += 64;
1438 break;
1439 }
1440 case 29: /* cross-reference */
1441 {
1442 uint32_t length;
1443
1444 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected cross-reference marker @ offset %zu\n", level, p, offset);
1445
1446 /*
1447 * offset 0 (2 bytes) - special character ID
1448 * offset 2 (4 bytes) - length of information
1449 * offset 6 (2 bytes) - special character ID
1450 * offset 8 (n bytes) - ...
1451 */
1452
1453 /* id block verification (only on HWP3_VERIFY) */
1454 HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1455
1456 if (fmap_readn(map, &length, offset + 2, sizeof(length)) != sizeof(length))
1457 return CL_EREAD;
1458
1459 length = le32_to_host(length);
1460 new_offset = offset + (8 + length);
1461 if ((new_offset <= offset) || (new_offset > map->len)) {
1462 cli_errmsg("HWP3.x: Paragraph[%u, %d]: length value is too high, invalid. %u\n", level, p, length);
1463 return CL_EPARSE;
1464 }
1465 offset = new_offset;
1466 break;
1467 }
1468 case 30: /* bundle of blanks (ON SALE for 2.99!) */
1469 {
1470 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected title/table/picture show times marker @ offset %zu\n", level, p, offset);
1471
1472 /*
1473 * offset 0 (2 bytes) - special character ID
1474 * offset 2 (2 bytes) - special character ID
1475 * total is always 4 bytes
1476 */
1477
1478 /* id block verification (only on HWP3_VERIFY) */
1479 HWP3_PSPECIAL_VERIFY(map, offset, 2, content, match);
1480
1481 offset += 4;
1482 break;
1483 }
1484 case 31: /* fixed-width space */
1485 {
1486 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected title/table/picture show times marker @ offset %zu\n", level, p, offset);
1487
1488 /*
1489 * offset 0 (2 bytes) - special character ID
1490 * offset 2 (2 bytes) - special character ID
1491 * total is always 4 bytes
1492 */
1493
1494 /* id block verification (only on HWP3_VERIFY) */
1495 HWP3_PSPECIAL_VERIFY(map, offset, 2, content, match);
1496
1497 offset += 4;
1498 break;
1499 }
1500 default:
1501 hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected special character as [UNKNOWN]\n", level, p);
1502 cli_errmsg("HWP3.x: Paragraph[%u, %d]: cannot understand special character %u\n", level, p, content);
1503 return CL_EPARSE;
1504 }
1505 } else { /* normal characters */
1506 offset += sizeof(content);
1507 }
1508 }
1509
1510 hwp3_debug("HWP3.x: end recursion level: %d\n", level);
1511
1512 (*roffset) = offset;
1513 return CL_SUCCESS;
1514 }
1515
parsehwp3_infoblk_1(cli_ctx * ctx,fmap_t * dmap,size_t * offset,int * last)1516 static inline cl_error_t parsehwp3_infoblk_1(cli_ctx *ctx, fmap_t *dmap, size_t *offset, int *last)
1517 {
1518 cl_error_t ret = CL_SUCCESS;
1519
1520 uint32_t infoid, infolen;
1521 fmap_t *map = (dmap ? dmap : ctx->fmap);
1522 int i, count;
1523 long long unsigned infoloc = (long long unsigned)(*offset);
1524 #if HWP3_DEBUG
1525 char field[HWP3_FIELD_LENGTH];
1526 #endif
1527 #if HAVE_JSON
1528 json_object *infoblk_1, *contents, *counter, *entry;
1529 #endif
1530
1531 hwp3_debug("HWP3.x: Information Block @ offset %llu\n", infoloc);
1532
1533 #if HAVE_JSON
1534 if (SCAN_COLLECT_METADATA) {
1535 infoblk_1 = cli_jsonobj(ctx->wrkproperty, "InfoBlk_1");
1536 if (!infoblk_1) {
1537 cli_errmsg("HWP5.x: No memory for information block object\n");
1538 return CL_EMEM;
1539 }
1540
1541 contents = cli_jsonarray(infoblk_1, "Contents");
1542 if (!contents) {
1543 cli_errmsg("HWP5.x: No memory for information block contents array\n");
1544 return CL_EMEM;
1545 }
1546
1547 if (!json_object_object_get_ex(infoblk_1, "Count", &counter)) { /* object not found */
1548 cli_jsonint(infoblk_1, "Count", 1);
1549 } else {
1550 int value = json_object_get_int(counter);
1551 cli_jsonint(infoblk_1, "Count", value + 1);
1552 }
1553 }
1554 #endif
1555
1556 if (fmap_readn(map, &infoid, *offset, sizeof(infoid)) != sizeof(infoid)) {
1557 cli_errmsg("HWP3.x: Failed to read information block id @ %zu\n", *offset);
1558 return CL_EREAD;
1559 }
1560 *offset += sizeof(infoid);
1561 infoid = le32_to_host(infoid);
1562
1563 #if HAVE_JSON
1564 if (SCAN_COLLECT_METADATA) {
1565 entry = cli_jsonobj(contents, NULL);
1566 if (!entry) {
1567 cli_errmsg("HWP5.x: No memory for information block entry object\n");
1568 return CL_EMEM;
1569 }
1570
1571 cli_jsonint(entry, "ID", infoid);
1572 }
1573 #endif
1574 hwp3_debug("HWP3.x: Information Block[%llu]: ID: %u\n", infoloc, infoid);
1575
1576 /* Booking Information(5) - no length field and no content */
1577 if (infoid == 5) {
1578 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Booking Information\n", infoloc);
1579 #if HAVE_JSON
1580 if (SCAN_COLLECT_METADATA)
1581 cli_jsonstr(entry, "Type", "Booking Information");
1582 #endif
1583 return CL_SUCCESS;
1584 }
1585
1586 if (fmap_readn(map, &infolen, *offset, sizeof(infolen)) != sizeof(infolen)) {
1587 cli_errmsg("HWP3.x: Failed to read information block len @ %zu\n", *offset);
1588 return CL_EREAD;
1589 }
1590 *offset += sizeof(infolen);
1591 infolen = le32_to_host(infolen);
1592
1593 #if HAVE_JSON
1594 if (SCAN_COLLECT_METADATA) {
1595 cli_jsonint64(entry, "Offset", infoloc);
1596 cli_jsonint(entry, "Length", infolen);
1597 }
1598 #endif
1599 hwp3_debug("HWP3.x: Information Block[%llu]: LEN: %u\n", infoloc, infolen);
1600
1601 /* check information block bounds */
1602 if (*offset + infolen > map->len) {
1603 cli_errmsg("HWP3.x: Information blocks length exceeds remaining map length, %zu > %zu\n", *offset + infolen, map->len);
1604 return CL_EREAD;
1605 }
1606
1607 /* Information Blocks */
1608 switch (infoid) {
1609 case 0: /* Terminating */
1610 if (infolen == 0) {
1611 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Terminating Entry\n", infoloc);
1612 #if HAVE_JSON
1613 if (SCAN_COLLECT_METADATA)
1614 cli_jsonstr(entry, "Type", "Terminating Entry");
1615 #endif
1616 if (last) *last = 1;
1617 return CL_SUCCESS;
1618 } else {
1619 cli_errmsg("HWP3.x: Information Block[%llu]: TYPE: Invalid Terminating Entry\n", infoloc);
1620 return CL_EFORMAT;
1621 }
1622 case 1: /* Image Data */
1623 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Image Data\n", infoloc);
1624 #if HAVE_JSON
1625 if (SCAN_COLLECT_METADATA)
1626 cli_jsonstr(entry, "Type", "Image Data");
1627 #endif
1628 #if HWP3_DEBUG /* additional fields can be added */
1629 memset(field, 0, HWP3_FIELD_LENGTH);
1630 if (fmap_readn(map, field, *offset, 16) != 16) {
1631 cli_errmsg("HWP3.x: Failed to read information block field @ %zu\n", *offset);
1632 return CL_EREAD;
1633 }
1634 hwp3_debug("HWP3.x: Information Block[%llu]: NAME: %s\n", infoloc, field);
1635
1636 memset(field, 0, HWP3_FIELD_LENGTH);
1637 if (fmap_readn(map, field, *offset + 16, 16) != 16) {
1638 cli_errmsg("HWP3.x: Failed to read information block field @ %zu\n", *offset);
1639 return CL_EREAD;
1640 }
1641 hwp3_debug("HWP3.x: Information Block[%llu]: FORM: %s\n", infoloc, field);
1642 #endif
1643 /* 32 bytes for extra data fields */
1644 if (infolen > 0)
1645 ret = cli_magic_scan_nested_fmap_type(map, *offset + 32, infolen - 32, ctx, CL_TYPE_ANY, NULL);
1646 break;
1647 case 2: /* OLE2 Data */
1648 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: OLE2 Data\n", infoloc);
1649 #if HAVE_JSON
1650 if (SCAN_COLLECT_METADATA)
1651 cli_jsonstr(entry, "Type", "OLE2 Data");
1652 #endif
1653 if (infolen > 0)
1654 ret = cli_magic_scan_nested_fmap_type(map, *offset, infolen, ctx, CL_TYPE_ANY, NULL);
1655 break;
1656 case 3: /* Hypertext/Hyperlink Information */
1657 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Hypertext/Hyperlink Information\n", infoloc);
1658 if (infolen % 617) {
1659 cli_errmsg("HWP3.x: Information Block[%llu]: Invalid multiple of 617 => %u\n", infoloc, infolen);
1660 return CL_EFORMAT;
1661 }
1662
1663 count = (infolen / 617);
1664 hwp3_debug("HWP3.x: Information Block[%llu]: COUNT: %d entries\n", infoloc, count);
1665 #if HAVE_JSON
1666 if (SCAN_COLLECT_METADATA) {
1667 cli_jsonstr(entry, "Type", "Hypertext/Hyperlink Information");
1668 cli_jsonint(entry, "Count", count);
1669 }
1670 #endif
1671
1672 for (i = 0; i < count; i++) {
1673 #if HWP3_DEBUG /* additional fields can be added */
1674 memset(field, 0, HWP3_FIELD_LENGTH);
1675 if (fmap_readn(map, field, *offset, 256) != 256) {
1676 cli_errmsg("HWP3.x: Failed to read information block field @ %zu\n", *offset);
1677 return CL_EREAD;
1678 }
1679 hwp3_debug("HWP3.x: Information Block[%llu]: %d: NAME: %s\n", infoloc, i, field);
1680 #endif
1681 /* scanning macros - TODO - check numbers */
1682 ret = cli_magic_scan_nested_fmap_type(map, *offset + (617 * i) + 288, 325, ctx, CL_TYPE_ANY, NULL);
1683 }
1684 break;
1685 case 4: /* Presentation Information */
1686 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Presentation Information\n", infoloc);
1687 #if HAVE_JSON
1688 if (SCAN_COLLECT_METADATA)
1689 cli_jsonstr(entry, "Type", "Presentation Information");
1690 #endif
1691 /* contains nothing of interest to scan */
1692 break;
1693 case 5: /* Booking Information */
1694 /* should never run this as it is short-circuited above */
1695 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Booking Information\n", infoloc);
1696 #if HAVE_JSON
1697 if (SCAN_COLLECT_METADATA)
1698 cli_jsonstr(entry, "Type", "Booking Information");
1699 #endif
1700 break;
1701 case 6: /* Background Image Data */
1702 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Background Image Data\n", infoloc);
1703 #if HAVE_JSON
1704 if (SCAN_COLLECT_METADATA) {
1705 cli_jsonstr(entry, "Type", "Background Image Data");
1706 cli_jsonint(entry, "ImageSize", infolen - 324);
1707 }
1708 #endif
1709 #if HWP3_DEBUG /* additional fields can be added */
1710 memset(field, 0, HWP3_FIELD_LENGTH);
1711 if (fmap_readn(map, field, *offset + 24, 256) != 256) {
1712 cli_errmsg("HWP3.x: Failed to read information block field @ %zu\n", *offset);
1713 return CL_EREAD;
1714 }
1715 hwp3_debug("HWP3.x: Information Block[%llu]: NAME: %s\n", infoloc, field);
1716 #endif
1717 /* 324 bytes for extra data fields */
1718 if (infolen > 0)
1719 ret = cli_magic_scan_nested_fmap_type(map, *offset + 324, infolen - 324, ctx, CL_TYPE_ANY, NULL);
1720 break;
1721 case 0x100: /* Table Extension */
1722 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Table Extension\n", infoloc);
1723 #if HAVE_JSON
1724 if (SCAN_COLLECT_METADATA)
1725 cli_jsonstr(entry, "Type", "Table Extension");
1726 #endif
1727 /* contains nothing of interest to scan */
1728 break;
1729 case 0x101: /* Press Frame Information Field Name */
1730 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Press Frame Information Field Name\n", infoloc);
1731 #if HAVE_JSON
1732 if (SCAN_COLLECT_METADATA)
1733 cli_jsonstr(entry, "Type", "Press Frame Information Field Name");
1734 #endif
1735 /* contains nothing of interest to scan */
1736 break;
1737 default:
1738 cli_warnmsg("HWP3.x: Information Block[%llu]: TYPE: UNKNOWN(%u)\n", infoloc, infoid);
1739 if (infolen > 0)
1740 ret = cli_magic_scan_nested_fmap_type(map, *offset, infolen, ctx, CL_TYPE_ANY, NULL);
1741 }
1742
1743 *offset += infolen;
1744 return ret;
1745 }
1746
hwp3_cb(void * cbdata,int fd,const char * filepath,cli_ctx * ctx)1747 static cl_error_t hwp3_cb(void *cbdata, int fd, const char *filepath, cli_ctx *ctx)
1748 {
1749 cl_error_t ret = CL_SUCCESS;
1750 fmap_t *map, *dmap;
1751 size_t offset, start, new_offset;
1752 int i, p = 0, last = 0;
1753 uint16_t nstyles;
1754 #if HAVE_JSON
1755 json_object *fonts;
1756 #endif
1757
1758 UNUSEDPARAM(filepath);
1759
1760 offset = start = cbdata ? *(size_t *)cbdata : 0;
1761
1762 if (offset == 0) {
1763 if (fd < 0) {
1764 cli_errmsg("HWP3.x: Invalid file descriptor argument\n");
1765 return CL_ENULLARG;
1766 } else {
1767 STATBUF statbuf;
1768
1769 if (FSTAT(fd, &statbuf) == -1) {
1770 cli_errmsg("HWP3.x: Can't stat file descriptor\n");
1771 return CL_ESTAT;
1772 }
1773
1774 map = dmap = fmap(fd, 0, statbuf.st_size, NULL);
1775 if (!map) {
1776 cli_errmsg("HWP3.x: Failed to get fmap for uncompressed stream\n");
1777 return CL_EMAP;
1778 }
1779 }
1780 } else {
1781 hwp3_debug("HWP3.x: Document Content Stream starts @ offset %zu\n", offset);
1782
1783 map = ctx->fmap;
1784 dmap = NULL;
1785 }
1786
1787 /* Fonts - 7 entries of 2 + (n x 40) bytes where n is the first 2 bytes of the entry */
1788 #if HAVE_JSON
1789 if (SCAN_COLLECT_METADATA)
1790 fonts = cli_jsonarray(ctx->wrkproperty, "FontCounts");
1791 #endif
1792 for (i = 0; i < 7; i++) {
1793 uint16_t nfonts;
1794
1795 if (fmap_readn(map, &nfonts, offset, sizeof(nfonts)) != sizeof(nfonts)) {
1796 if (dmap)
1797 funmap(dmap);
1798 return CL_EREAD;
1799 }
1800 nfonts = le16_to_host(nfonts);
1801
1802 #if HAVE_JSON
1803 if (SCAN_COLLECT_METADATA)
1804 cli_jsonint(fonts, NULL, nfonts);
1805 #endif
1806 hwp3_debug("HWP3.x: Font Entry %d with %u entries @ offset %zu\n", i + 1, nfonts, offset);
1807 new_offset = offset + (2 + nfonts * 40);
1808 if ((new_offset <= offset) || (new_offset >= map->len)) {
1809 cli_errmsg("HWP3.x: Font Entry: number of fonts is too high, invalid. %u\n", nfonts);
1810 if (dmap)
1811 funmap(dmap);
1812 return CL_EPARSE;
1813 }
1814 offset = new_offset;
1815 }
1816
1817 /* Styles - 2 + (n x 238) bytes where n is the first 2 bytes of the section */
1818 if (fmap_readn(map, &nstyles, offset, sizeof(nstyles)) != sizeof(nstyles)) {
1819 if (dmap)
1820 funmap(dmap);
1821 return CL_EREAD;
1822 }
1823 nstyles = le16_to_host(nstyles);
1824
1825 #if HAVE_JSON
1826 if (SCAN_COLLECT_METADATA)
1827 cli_jsonint(ctx->wrkproperty, "StyleCount", nstyles);
1828 #endif
1829 hwp3_debug("HWP3.x: %u Styles @ offset %zu\n", nstyles, offset);
1830 new_offset = offset + (2 + nstyles * 238);
1831 if ((new_offset <= offset) || (new_offset >= map->len)) {
1832 cli_errmsg("HWP3.x: Font Entry: number of font styles is too high, invalid. %u\n", nstyles);
1833 if (dmap)
1834 funmap(dmap);
1835 return CL_EPARSE;
1836 }
1837 offset += (2 + nstyles * 238);
1838
1839 last = 0;
1840 /* Paragraphs - variable */
1841 /* Paragraphs - are terminated with 0x0d00[13(CR) as hchar], empty paragraph marks end of section and do NOT end with 0x0d00 */
1842 while (!last && ((ret = parsehwp3_paragraph(ctx, map, p++, 0, &offset, &last)) == CL_SUCCESS)) continue;
1843 /* return is never a virus */
1844 if (ret != CL_SUCCESS) {
1845 if (dmap)
1846 funmap(dmap);
1847 return ret;
1848 }
1849 #if HAVE_JSON
1850 if (SCAN_COLLECT_METADATA)
1851 cli_jsonint(ctx->wrkproperty, "ParagraphCount", p);
1852 #endif
1853
1854 last = 0;
1855 /* 'additional information block #1's - attachments and media */
1856 while (!last && ((ret = parsehwp3_infoblk_1(ctx, map, &offset, &last)) == CL_SUCCESS)) continue;
1857
1858 /* scan the uncompressed stream - both compressed and uncompressed cases [ALLMATCH] */
1859 if ((ret == CL_SUCCESS) || ((SCAN_ALLMATCHES) && (ret == CL_VIRUS))) {
1860 cl_error_t subret = ret;
1861 size_t dlen = offset - start;
1862
1863 ret = cli_magic_scan_nested_fmap_type(map, start, dlen, ctx, CL_TYPE_ANY, NULL);
1864 //ret = cli_magic_scan_nested_fmap_type(map, 0, 0, ctx, CL_TYPE_ANY);
1865
1866 if (ret == CL_SUCCESS)
1867 ret = subret;
1868 }
1869
1870 if (dmap)
1871 funmap(dmap);
1872 return ret;
1873 }
1874
cli_scanhwp3(cli_ctx * ctx)1875 cl_error_t cli_scanhwp3(cli_ctx *ctx)
1876 {
1877 cl_error_t ret = CL_SUCCESS;
1878
1879 struct hwp3_docinfo docinfo;
1880 size_t offset = 0, new_offset = 0;
1881 fmap_t *map = ctx->fmap;
1882
1883 #if HAVE_JSON
1884 /*
1885 // version
1886 cli_jsonint(header, "RawVersion", hwp5->version);
1887 */
1888 #endif
1889 offset += HWP3_IDENTITY_INFO_SIZE;
1890
1891 if ((ret = parsehwp3_docinfo(ctx, offset, &docinfo)) != CL_SUCCESS)
1892 return ret;
1893
1894 offset += HWP3_DOCINFO_SIZE;
1895
1896 if ((ret = parsehwp3_docsummary(ctx, offset)) != CL_SUCCESS)
1897 return ret;
1898
1899 offset += HWP3_DOCSUMMARY_SIZE;
1900
1901 /* password-protected document - cannot parse */
1902 if (docinfo.di_passwd) {
1903 cli_dbgmsg("HWP3.x: password-protected file, skip parsing\n");
1904 return CL_SUCCESS;
1905 }
1906
1907 if (docinfo.di_infoblksize) {
1908 /* OPTIONAL TODO: HANDLE OPTIONAL INFORMATION BLOCK #0's FOR PRECLASS */
1909 new_offset = offset + docinfo.di_infoblksize;
1910 if ((new_offset <= offset) || (new_offset >= map->len)) {
1911 cli_errmsg("HWP3.x: Doc info block size is too high, invalid. %u\n", docinfo.di_infoblksize);
1912 return CL_EPARSE;
1913 }
1914 offset = new_offset;
1915 }
1916
1917 if (docinfo.di_compressed)
1918 ret = decompress_and_callback(ctx, ctx->fmap, offset, 0, "HWP3.x", hwp3_cb, NULL);
1919 else
1920 ret = hwp3_cb(&offset, 0, ctx->sub_filepath, ctx);
1921
1922 if (ret != CL_SUCCESS)
1923 return ret;
1924
1925 /* OPTIONAL TODO: HANDLE OPTIONAL ADDITIONAL INFORMATION BLOCK #2's FOR PRECLASS*/
1926
1927 return ret;
1928 }
1929
1930 /*** HWPML (hijacking the msxml parser) ***/
1931 #if HAVE_LIBXML2
1932 static const struct key_entry hwpml_keys[] = {
1933 {"hwpml", "HWPML", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB},
1934
1935 /* HEAD - Document Properties */
1936 //{ "head", "Head", MSXML_JSON_WRKPTR },
1937 {"docsummary", "DocumentProperties", MSXML_JSON_WRKPTR},
1938 {"title", "Title", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
1939 {"author", "Author", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
1940 {"date", "Date", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
1941 {"docsetting", "DocumentSettings", MSXML_JSON_WRKPTR},
1942 {"beginnumber", "BeginNumber", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB},
1943 {"caretpos", "CaretPos", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB},
1944 //{ "bindatalist", "BinDataList", MSXML_JSON_WRKPTR },
1945 //{ "binitem", "BinItem", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
1946 {"facenamelist", "FaceNameList", MSXML_IGNORE_ELEM}, /* fonts list */
1947 {"borderfilllist", "BorderFillList", MSXML_IGNORE_ELEM}, /* borders list */
1948 {"charshapelist", "CharShapeList", MSXML_IGNORE_ELEM}, /* character shapes */
1949 {"tabdeflist", "TableDefList", MSXML_IGNORE_ELEM}, /* table defs */
1950 {"numberinglist", "NumberingList", MSXML_IGNORE_ELEM}, /* numbering list */
1951 {"parashapelist", "ParagraphShapeList", MSXML_IGNORE_ELEM}, /* paragraph shapes */
1952 {"stylelist", "StyleList", MSXML_IGNORE_ELEM}, /* styles */
1953 {"compatibledocument", "WordCompatibility", MSXML_IGNORE_ELEM}, /* word compatibility data */
1954
1955 /* BODY - Document Contents */
1956 {"body", "Body", MSXML_IGNORE_ELEM}, /* document contents (we could build a document contents summary */
1957
1958 /* TAIL - Document Attachments */
1959 //{ "tail", "Tail", MSXML_JSON_WRKPTR },
1960 {"bindatastorage", "BinaryDataStorage", MSXML_JSON_WRKPTR},
1961 {"bindata", "BinaryData", MSXML_SCAN_CB | MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB},
1962 {"scriptcode", "ScriptCodeStorage", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB},
1963 {"scriptheader", "ScriptHeader", MSXML_SCAN_CB | MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
1964 {"scriptsource", "ScriptSource", MSXML_SCAN_CB | MSXML_JSON_WRKPTR | MSXML_JSON_VALUE}};
1965 static size_t num_hwpml_keys = sizeof(hwpml_keys) / sizeof(struct key_entry);
1966
1967 /* binary streams needs to be base64-decoded then decompressed if fields are set */
hwpml_scan_cb(void * cbdata,int fd,const char * filepath,cli_ctx * ctx)1968 static cl_error_t hwpml_scan_cb(void *cbdata, int fd, const char *filepath, cli_ctx *ctx)
1969 {
1970 UNUSEDPARAM(cbdata);
1971
1972 if (fd < 0 || !ctx)
1973 return CL_ENULLARG;
1974
1975 return cli_magic_scan_desc(fd, filepath, ctx, NULL);
1976 }
1977
hwpml_binary_cb(int fd,const char * filepath,cli_ctx * ctx,int num_attribs,struct attrib_entry * attribs,void * cbdata)1978 static cl_error_t hwpml_binary_cb(int fd, const char *filepath, cli_ctx *ctx, int num_attribs, struct attrib_entry *attribs, void *cbdata)
1979 {
1980 cl_error_t ret;
1981
1982 int i, df = 0, com = 0, enc = 0;
1983 char *tempfile;
1984
1985 UNUSEDPARAM(cbdata);
1986
1987 /* check attributes for compression and encoding */
1988 for (i = 0; i < num_attribs; i++) {
1989 if (!strcmp(attribs[i].key, "Compress")) {
1990 if (!strcmp(attribs[i].value, "true"))
1991 com = 1;
1992 else if (!strcmp(attribs[i].value, "false"))
1993 com = 0;
1994 else
1995 com = -1;
1996 }
1997
1998 if (!strcmp(attribs[i].key, "Encoding")) {
1999 if (!strcmp(attribs[i].value, "Base64"))
2000 enc = 1;
2001 else
2002 enc = -1;
2003 }
2004 }
2005
2006 hwpml_debug("HWPML: Checking attributes: com: %d, enc: %d\n", com, enc);
2007
2008 /* decode the binary data if needed - base64 */
2009 if (enc < 0) {
2010 cli_errmsg("HWPML: Unrecognized encoding method\n");
2011 return cli_magic_scan_desc(fd, filepath, ctx, NULL);
2012 } else if (enc == 1) {
2013 STATBUF statbuf;
2014 fmap_t *input;
2015 const char *instream;
2016 char *decoded;
2017 size_t decodedlen;
2018
2019 hwpml_debug("HWPML: Decoding base64-encoded binary data\n");
2020
2021 /* fmap the input file for easier manipulation */
2022 if (FSTAT(fd, &statbuf) == -1) {
2023 cli_errmsg("HWPML: Can't stat file descriptor\n");
2024 return CL_ESTAT;
2025 }
2026
2027 if (!(input = fmap(fd, 0, statbuf.st_size, NULL))) {
2028 cli_errmsg("HWPML: Failed to get fmap for binary data\n");
2029 return CL_EMAP;
2030 }
2031
2032 /* send data for base64 conversion - TODO: what happens with really big files? */
2033 if (!(instream = fmap_need_off_once(input, 0, input->len))) {
2034 cli_errmsg("HWPML: Failed to get input stream from binary data\n");
2035 funmap(input);
2036 return CL_EMAP;
2037 }
2038
2039 decoded = (char *)cl_base64_decode((char *)instream, input->len, NULL, &decodedlen, 0);
2040 funmap(input);
2041 if (!decoded) {
2042 cli_errmsg("HWPML: Failed to get base64 decode binary data\n");
2043 return cli_magic_scan_desc(fd, filepath, ctx, NULL);
2044 }
2045
2046 /* open file for writing and scanning */
2047 if ((ret = cli_gentempfd(ctx->sub_tmpdir, &tempfile, &df)) != CL_SUCCESS) {
2048 cli_warnmsg("HWPML: Failed to create temporary file for decoded stream scanning\n");
2049 return ret;
2050 }
2051
2052 if (cli_writen(df, decoded, decodedlen) != decodedlen) {
2053 free(decoded);
2054 ret = CL_EWRITE;
2055 goto hwpml_end;
2056 }
2057 free(decoded);
2058
2059 /* keeps the later logic simpler */
2060 fd = df;
2061
2062 cli_dbgmsg("HWPML: Decoded binary data to %s\n", tempfile);
2063 }
2064
2065 /* decompress the file if needed - zlib */
2066 if (com) {
2067 STATBUF statbuf;
2068 fmap_t *input;
2069
2070 hwpml_debug("HWPML: Decompressing binary data\n");
2071
2072 /* fmap the input file for easier manipulation */
2073 if (FSTAT(fd, &statbuf) == -1) {
2074 cli_errmsg("HWPML: Can't stat file descriptor\n");
2075 ret = CL_ESTAT;
2076 goto hwpml_end;
2077 }
2078
2079 input = fmap(fd, 0, statbuf.st_size, NULL);
2080 if (!input) {
2081 cli_errmsg("HWPML: Failed to get fmap for binary data\n");
2082 ret = CL_EMAP;
2083 goto hwpml_end;
2084 }
2085 ret = decompress_and_callback(ctx, input, 0, 0, "HWPML", hwpml_scan_cb, NULL);
2086 funmap(input);
2087 } else {
2088 if (fd == df) { /* fd is a decoded tempfile */
2089 ret = hwpml_scan_cb(NULL, fd, tempfile, ctx);
2090 } else { /* fd is the original filepath, no decoding necessary */
2091 ret = hwpml_scan_cb(NULL, fd, filepath, ctx);
2092 }
2093 }
2094
2095 /* close decoded file descriptor if used */
2096 hwpml_end:
2097 if (df) {
2098 close(df);
2099 if (!(ctx->engine->keeptmp))
2100 cli_unlink(tempfile);
2101 free(tempfile);
2102 }
2103 return ret;
2104 }
2105 #endif /* HAVE_LIBXML2 */
2106
cli_scanhwpml(cli_ctx * ctx)2107 cl_error_t cli_scanhwpml(cli_ctx *ctx)
2108 {
2109 cl_error_t ret = CL_SUCCESS;
2110
2111 #if HAVE_LIBXML2
2112 struct msxml_cbdata cbdata;
2113 struct msxml_ctx mxctx;
2114 xmlTextReaderPtr reader = NULL;
2115
2116 cli_dbgmsg("in cli_scanhwpml()\n");
2117
2118 if (!ctx)
2119 return CL_ENULLARG;
2120
2121 memset(&cbdata, 0, sizeof(cbdata));
2122 cbdata.map = ctx->fmap;
2123
2124 reader = xmlReaderForIO(msxml_read_cb, NULL, &cbdata, "hwpml.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
2125 if (!reader) {
2126 cli_dbgmsg("cli_scanhwpml: cannot initialize xmlReader\n");
2127
2128 #if HAVE_JSON
2129 ret = cli_json_parse_error(ctx->wrkproperty, "HWPML_ERROR_XML_READER_IO");
2130 #endif
2131 return ret; // libxml2 failed!
2132 }
2133
2134 memset(&mxctx, 0, sizeof(mxctx));
2135 mxctx.scan_cb = hwpml_binary_cb;
2136 ret = cli_msxml_parse_document(ctx, reader, hwpml_keys, num_hwpml_keys, MSXML_FLAG_JSON, &mxctx);
2137
2138 xmlTextReaderClose(reader);
2139 xmlFreeTextReader(reader);
2140 #else
2141 UNUSEDPARAM(ctx);
2142 cli_dbgmsg("in cli_scanhwpml()\n");
2143 cli_dbgmsg("cli_scanhwpml: scanning hwpml documents requires libxml2!\n");
2144 #endif
2145
2146 return ret;
2147 }
2148