1 /*
2  * HWP Stuff
3  *
4  * Copyright (C) 2015-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5  *
6  * Authors: Kevin Lin
7  *
8  * This program is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU General Public License version 2 as published by the
10  * Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  * more details.
16  *
17  * You should have received a copy of the GNU General Public License along with
18  * this program; if not, write to the Free Software Foundation, Inc., 51
19  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20  */
21 
22 #if HAVE_CONFIG_H
23 #include "clamav-config.h"
24 #endif
25 
26 #if HAVE_LIBXML2
27 #include <libxml/xmlreader.h>
28 #endif
29 
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <fcntl.h>
33 #include <string.h>
34 #include <ctype.h>
35 #include <zlib.h>
36 
37 #if HAVE_ICONV
38 #include <iconv.h>
39 #endif
40 
41 #include "clamav.h"
42 #include "fmap.h"
43 #include "str.h"
44 #include "conv.h"
45 #include "others.h"
46 #include "scanners.h"
47 #include "msxml_parser.h"
48 #include "msxml.h"
49 #include "json_api.h"
50 #include "hwp.h"
51 #if HAVE_JSON
52 #include "msdoc.h"
53 #endif
54 
/*
 * Compile-time switches for the optional traces in this file.
 * Setting HWP5_DEBUG, HWP3_DEBUG or HWPML_DEBUG to 1 routes the matching
 * *_debug() macro to cli_dbgmsg(); HWP3_VERIFY enables the extra ID check
 * performed by HWP3_PSPECIAL_VERIFY.
 */
#define HWP5_DEBUG 0
#define HWP3_DEBUG 0
#define HWP3_VERIFY 0
#define HWPML_DEBUG 0

/*
 * A disabled trace expands to an empty do/while(0) statement: the arguments
 * are discarded without being evaluated, and the macro still behaves as one
 * statement, so it is safe in an unbraced if/else.
 */
#if HWP5_DEBUG
#define hwp5_debug(...) cli_dbgmsg(__VA_ARGS__)
#else
#define hwp5_debug(...) \
    do {                \
    } while (0)
#endif
#if HWP3_DEBUG
#define hwp3_debug(...) cli_dbgmsg(__VA_ARGS__)
#else
#define hwp3_debug(...) \
    do {                \
    } while (0)
#endif
#if HWPML_DEBUG
#define hwpml_debug(...) cli_dbgmsg(__VA_ARGS__)
#else
#define hwpml_debug(...) \
    do {                 \
    } while (0)
#endif
74 
/*
 * Callback type accepted by decompress_and_callback(). It is invoked with the
 * caller's opaque cbdata, the descriptor and path of the temporary file that
 * holds the inflated data, and the scan context; its return value becomes the
 * scan result.
 */
typedef cl_error_t (*hwp_cb)(void *cbdata, int fd, const char *filepath, cli_ctx *ctx);
76 
decompress_and_callback(cli_ctx * ctx,fmap_t * input,size_t at,size_t len,const char * parent,hwp_cb cb,void * cbdata)77 static cl_error_t decompress_and_callback(cli_ctx *ctx, fmap_t *input, size_t at, size_t len, const char *parent, hwp_cb cb, void *cbdata)
78 {
79     cl_error_t ret = CL_SUCCESS;
80     int zret, ofd;
81     size_t in;
82     size_t off_in = at;
83     size_t count, remain = 1, outsize = 0;
84     z_stream zstrm;
85     char *tmpname;
86     unsigned char inbuf[FILEBUFF], outbuf[FILEBUFF];
87 
88     if (!ctx || !input || !cb)
89         return CL_ENULLARG;
90 
91     if (len)
92         remain = len;
93 
94     /* reserve tempfile for output and callback */
95     if ((ret = cli_gentempfd(ctx->sub_tmpdir, &tmpname, &ofd)) != CL_SUCCESS) {
96         cli_errmsg("%s: Can't generate temporary file\n", parent);
97         return ret;
98     }
99 
100     /* initialize zlib inflation stream */
101     memset(&zstrm, 0, sizeof(zstrm));
102     zstrm.zalloc    = Z_NULL;
103     zstrm.zfree     = Z_NULL;
104     zstrm.opaque    = Z_NULL;
105     zstrm.next_in   = inbuf;
106     zstrm.next_out  = outbuf;
107     zstrm.avail_in  = 0;
108     zstrm.avail_out = FILEBUFF;
109 
110     zret = inflateInit2(&zstrm, -15);
111     if (zret != Z_OK) {
112         cli_errmsg("%s: Can't initialize zlib inflation stream\n", parent);
113         ret = CL_EUNPACK;
114         goto dc_end;
115     }
116 
117     /* inflation loop */
118     do {
119         if (zstrm.avail_in == 0) {
120             zstrm.next_in = inbuf;
121 
122             in = fmap_readn(input, inbuf, off_in, FILEBUFF);
123             if (in == (size_t)-1) {
124                 cli_errmsg("%s: Error reading stream\n", parent);
125                 ret = CL_EUNPACK;
126                 goto dc_end;
127             }
128             if (!in)
129                 break;
130 
131             if (len) {
132                 if (remain < in)
133                     in = remain;
134                 remain -= in;
135             }
136             zstrm.avail_in = in;
137             off_in += in;
138         }
139         zret  = inflate(&zstrm, Z_SYNC_FLUSH);
140         count = FILEBUFF - zstrm.avail_out;
141         if (count) {
142             if ((ret = cli_checklimits("HWP", ctx, outsize + count, 0, 0)) != CL_SUCCESS)
143                 break;
144 
145             if (cli_writen(ofd, outbuf, count) != count) {
146                 cli_errmsg("%s: Can't write to file %s\n", parent, tmpname);
147                 ret = CL_EWRITE;
148                 goto dc_end;
149             }
150             outsize += count;
151         }
152         zstrm.next_out  = outbuf;
153         zstrm.avail_out = FILEBUFF;
154     } while (zret == Z_OK && remain);
155 
156     cli_dbgmsg("%s: Decompressed %zu bytes to %s\n", parent, outsize, tmpname);
157 
158     /* post inflation checks */
159     if (zret != Z_STREAM_END && zret != Z_OK) {
160         if (outsize == 0) {
161             cli_infomsg(ctx, "%s: Error decompressing stream. No data decompressed.\n", parent);
162             ret = CL_EUNPACK;
163             goto dc_end;
164         }
165 
166         cli_infomsg(ctx, "%s: Error decompressing stream. Scanning what was decompressed.\n", parent);
167     }
168 
169     /* check for limits exceeded or zlib failure */
170     if (ret == CL_SUCCESS && (zret == Z_STREAM_END || zret == Z_OK)) {
171         if (len && remain > 0)
172             cli_infomsg(ctx, "%s: Error decompressing stream. Not all requested input was converted\n", parent);
173 
174         /* scanning inflated stream */
175         ret = cb(cbdata, ofd, tmpname, ctx);
176     } else {
177         /* default to scanning what we got */
178         ret = cli_magic_scan_desc(ofd, tmpname, ctx, NULL);
179     }
180 
181     /* clean-up */
182 dc_end:
183     zret = inflateEnd(&zstrm);
184     if (zret != Z_OK) {
185         cli_errmsg("%s: Error closing zlib inflation stream\n", parent);
186         if (ret == CL_SUCCESS)
187             ret = CL_EUNPACK;
188     }
189     close(ofd);
190     if (!ctx->engine->keeptmp)
191         if (cli_unlink(tmpname))
192             ret = CL_EUNLINK;
193     free(tmpname);
194     return ret;
195 }
196 
197 /* convert HANGUL_NUMERICAL to UTF-8 encoding using iconv library, converts to base64 encoding if no iconv or failure */
198 #define HANGUL_NUMERICAL 0
convert_hstr_to_utf8(const char * begin,size_t sz,const char * parent,cl_error_t * ret)199 static char *convert_hstr_to_utf8(const char *begin, size_t sz, const char *parent, cl_error_t *ret)
200 {
201     cl_error_t rc = CL_SUCCESS;
202     char *res     = NULL;
203 #if HANGUL_NUMERICAL && HAVE_ICONV
204     char *p1, *p2, *inbuf = NULL, *outbuf = NULL;
205     size_t inlen, outlen;
206     iconv_t cd;
207 
208     do {
209         p1 = inbuf = cli_calloc(1, sz + 1);
210         if (!inbuf) {
211             cli_errmsg("%s: Failed to allocate memory for encoding conversion buffer\n", parent);
212             rc = CL_EMEM;
213             break;
214         }
215         memcpy(inbuf, begin, sz);
216         p2 = outbuf = cli_calloc(1, sz + 1);
217         if (!outbuf) {
218             cli_errmsg("%s: Failed to allocate memory for encoding conversion buffer\n", parent);
219             rc = CL_EMEM;
220             break;
221         }
222         inlen = outlen = sz;
223 
224         cd = iconv_open("UTF-8", "UNICODE");
225         if (cd == (iconv_t)(-1)) {
226             char errbuf[128];
227             cli_strerror(errno, errbuf, sizeof(errbuf));
228             cli_errmsg("%s: Failed to initialize iconv for encoding %s: %s\n", parent, HANGUL_NUMERICAL, errbuf);
229             break;
230         }
231 
232         iconv(cd, (char **)(&p1), &inlen, &p2, &outlen);
233         iconv_close(cd);
234 
235         /* no data was converted */
236         if (outlen == sz)
237             break;
238 
239         outbuf[sz - outlen] = '\0';
240 
241         if (!(res = strdup(outbuf))) {
242             cli_errmsg("%s: Failed to allocate memory for encoding conversion buffer\n", parent);
243             rc = CL_EMEM;
244             break;
245         }
246     } while (0);
247 
248     if (inbuf)
249         free(inbuf);
250     if (outbuf)
251         free(outbuf);
252 #endif
253     /* safety base64 encoding */
254     if (!res && (rc == CL_SUCCESS)) {
255         char *tmpbuf;
256 
257         tmpbuf = cli_calloc(1, sz + 1);
258         if (tmpbuf) {
259             memcpy(tmpbuf, begin, sz);
260 
261             res = (char *)cl_base64_encode(tmpbuf, sz);
262             if (res)
263                 rc = CL_VIRUS; /* used as placeholder */
264             else
265                 rc = CL_EMEM;
266 
267             free(tmpbuf);
268         } else {
269             cli_errmsg("%s: Failed to allocate memory for temporary buffer\n", parent);
270             rc = CL_EMEM;
271         }
272     }
273 
274     (*ret) = rc;
275     return res;
276 }
277 
278 /*** HWPOLE2 ***/
cli_scanhwpole2(cli_ctx * ctx)279 cl_error_t cli_scanhwpole2(cli_ctx *ctx)
280 {
281     fmap_t *map = ctx->fmap;
282     uint32_t usize, asize;
283 
284     asize = (uint32_t)(map->len - sizeof(usize));
285 
286     if (fmap_readn(map, &usize, 0, sizeof(usize)) != sizeof(usize)) {
287         cli_errmsg("HWPOLE2: Failed to read uncompressed ole2 filesize\n");
288         return CL_EREAD;
289     }
290 
291     if (usize != asize)
292         cli_warnmsg("HWPOLE2: Mismatched uncompressed prefix and size: %u != %u\n", usize, asize);
293     else
294         cli_dbgmsg("HWPOLE2: Matched uncompressed prefix and size: %u == %u\n", usize, asize);
295 
296     return cli_magic_scan_nested_fmap_type(map, 4, 0, ctx, CL_TYPE_ANY, NULL);
297     //return cli_magic_scan_nested_fmap_type(map, 4, 0, ctx, CL_TYPE_OLE2);
298 }
299 
300 /*** HWP5 ***/
301 
cli_hwp5header(cli_ctx * ctx,hwp5_header_t * hwp5)302 cl_error_t cli_hwp5header(cli_ctx *ctx, hwp5_header_t *hwp5)
303 {
304     if (!ctx || !hwp5)
305         return CL_ENULLARG;
306 
307 #if HAVE_JSON
308     if (SCAN_COLLECT_METADATA) {
309         json_object *header, *flags;
310 
311         header = cli_jsonobj(ctx->wrkproperty, "Hwp5Header");
312         if (!header) {
313             cli_errmsg("HWP5.x: No memory for Hwp5Header object\n");
314             return CL_EMEM;
315         }
316 
317         /* version */
318         cli_jsonint(header, "RawVersion", hwp5->version);
319 
320         /* flags */
321         cli_jsonint(header, "RawFlags", hwp5->flags);
322 
323         flags = cli_jsonarray(header, "Flags");
324         if (!flags) {
325             cli_errmsg("HWP5.x: No memory for Hwp5Header/Flags array\n");
326             return CL_EMEM;
327         }
328 
329         if (hwp5->flags & HWP5_COMPRESSED) {
330             cli_jsonstr(flags, NULL, "HWP5_COMPRESSED");
331         }
332         if (hwp5->flags & HWP5_PASSWORD) {
333             cli_jsonstr(flags, NULL, "HWP5_PASSWORD");
334         }
335         if (hwp5->flags & HWP5_DISTRIBUTABLE) {
336             cli_jsonstr(flags, NULL, "HWP5_DISTRIBUTABLE");
337         }
338         if (hwp5->flags & HWP5_SCRIPT) {
339             cli_jsonstr(flags, NULL, "HWP5_SCRIPT");
340         }
341         if (hwp5->flags & HWP5_DRM) {
342             cli_jsonstr(flags, NULL, "HWP5_DRM");
343         }
344         if (hwp5->flags & HWP5_XMLTEMPLATE) {
345             cli_jsonstr(flags, NULL, "HWP5_XMLTEMPLATE");
346         }
347         if (hwp5->flags & HWP5_HISTORY) {
348             cli_jsonstr(flags, NULL, "HWP5_HISTORY");
349         }
350         if (hwp5->flags & HWP5_CERT_SIGNED) {
351             cli_jsonstr(flags, NULL, "HWP5_CERT_SIGNED");
352         }
353         if (hwp5->flags & HWP5_CERT_ENCRYPTED) {
354             cli_jsonstr(flags, NULL, "HWP5_CERT_ENCRYPTED");
355         }
356         if (hwp5->flags & HWP5_CERT_EXTRA) {
357             cli_jsonstr(flags, NULL, "HWP5_CERT_EXTRA");
358         }
359         if (hwp5->flags & HWP5_CERT_DRM) {
360             cli_jsonstr(flags, NULL, "HWP5_CERT_DRM");
361         }
362         if (hwp5->flags & HWP5_CCL) {
363             cli_jsonstr(flags, NULL, "HWP5_CCL");
364         }
365     }
366 #endif
367     return CL_SUCCESS;
368 }
369 
hwp5_cb(void * cbdata,int fd,const char * filepath,cli_ctx * ctx)370 static cl_error_t hwp5_cb(void *cbdata, int fd, const char *filepath, cli_ctx *ctx)
371 {
372     UNUSEDPARAM(cbdata);
373 
374     if (fd < 0 || !ctx)
375         return CL_ENULLARG;
376 
377     return cli_magic_scan_desc(fd, filepath, ctx, NULL);
378 }
379 
cli_scanhwp5_stream(cli_ctx * ctx,hwp5_header_t * hwp5,char * name,int fd,const char * filepath)380 cl_error_t cli_scanhwp5_stream(cli_ctx *ctx, hwp5_header_t *hwp5, char *name, int fd, const char *filepath)
381 {
382     hwp5_debug("HWP5.x: NAME: %s\n", name ? name : "(NULL)");
383 
384     if (fd < 0) {
385         cli_errmsg("HWP5.x: Invalid file descriptor argument\n");
386         return CL_ENULLARG;
387     }
388 
389     if (name) {
390         /* encrypted and compressed streams */
391         if (!strncmp(name, "bin", 3) || !strncmp(name, "jscriptversion", 14) ||
392             !strncmp(name, "defaultjscript", 14) || !strncmp(name, "section", 7) ||
393             !strncmp(name, "viewtext", 8) || !strncmp(name, "docinfo", 7)) {
394 
395             if (hwp5->flags & HWP5_PASSWORD) {
396                 cli_dbgmsg("HWP5.x: Password encrypted stream, scanning as-is\n");
397                 return cli_magic_scan_desc(fd, filepath, ctx, name);
398             }
399 
400             if (hwp5->flags & HWP5_COMPRESSED) {
401                 /* DocInfo JSON Handling */
402                 STATBUF statbuf;
403                 fmap_t *input;
404                 cl_error_t ret;
405 
406                 hwp5_debug("HWP5.x: Sending %s for decompress and scan\n", name);
407 
408                 /* fmap the input file for easier manipulation */
409                 if (FSTAT(fd, &statbuf) == -1) {
410                     cli_errmsg("HWP5.x: Can't stat file descriptor\n");
411                     return CL_ESTAT;
412                 }
413 
414                 input = fmap(fd, 0, statbuf.st_size, NULL);
415                 if (!input) {
416                     cli_errmsg("HWP5.x: Failed to get fmap for input stream\n");
417                     return CL_EMAP;
418                 }
419                 ret = decompress_and_callback(ctx, input, 0, 0, "HWP5.x", hwp5_cb, NULL);
420                 funmap(input);
421                 return ret;
422             }
423         }
424 
425 #if HAVE_JSON
426         /* JSON Output Summary Information */
427         if (SCAN_COLLECT_METADATA && ctx->properties != NULL) {
428             if (name && !strncmp(name, "_5_hwpsummaryinformation", 24)) {
429                 cli_dbgmsg("HWP5.x: Detected a '_5_hwpsummaryinformation' stream\n");
430                 /* JSONOLE2 - what to do if something breaks? */
431                 if (cli_ole2_summary_json(ctx, fd, 2) == CL_ETIMEOUT)
432                     return CL_ETIMEOUT;
433             }
434         }
435 
436 #endif
437     }
438 
439     /* normal streams */
440     return cli_magic_scan_desc(fd, filepath, ctx, name);
441 }
442 
443 /*** HWP3 ***/
444 
/* all fields use little endian and unicode encoding, if applicable */
446 
447 //File Identification Information - (30 total bytes)
448 #define HWP3_IDENTITY_INFO_SIZE 30
449 
450 //Document Information - (128 total bytes)
451 #define HWP3_DOCINFO_SIZE 128
452 
453 #define DI_WRITEPROT 24    /* offset 24 (4 bytes) - write protection */
454 #define DI_EXTERNAPP 28    /* offset 28 (2 bytes) - external application */
455 #define DI_PNAME 32        /* offset 32 (40 x 1 bytes) - print name */
456 #define DI_ANNOTE 72       /* offset 72 (24 x 1 bytes) - annotation */
457 #define DI_PASSWD 96       /* offset 96 (2 bytes) - password protected */
458 #define DI_COMPRESSED 124  /* offset 124 (1 byte) - compression */
459 #define DI_INFOBLKSIZE 126 /* offset 126 (2 bytes) - information block length */
/*
 * Host-order copy of the document information fields that
 * parsehwp3_docinfo() extracts from the 128-byte HWP3 docinfo block.
 */
struct hwp3_docinfo {
    uint32_t di_writeprot;   /* value at DI_WRITEPROT - write protection */
    uint16_t di_externapp;   /* value at DI_EXTERNAPP - external application */
    uint16_t di_passwd;      /* value at DI_PASSWD - password protected */
    uint8_t di_compressed;   /* value at DI_COMPRESSED - compression */
    uint16_t di_infoblksize; /* value at DI_INFOBLKSIZE - information block length */
};
467 
468 //Document Summary - (1008 total bytes)
469 #define HWP3_DOCSUMMARY_SIZE 1008
/*
 * Layout of the HWP3 document summary block: each entry pairs the byte offset
 * of a field inside the block with the JSON key it is reported under.
 */
struct hwp3_docsummary_entry {
    size_t offset;    /* byte offset of the field within the summary block */
    const char *name; /* JSON key used for the field */
} hwp3_docsummary_fields[] = {
    {0, "Title"},      /* offset 0 (56 x 2 bytes) - title */
    {112, "Subject"},  /* offset 112 (56 x 2 bytes) - subject */
    {224, "Author"},   /* offset 224 (56 x 2 bytes) - author */
    {336, "Date"},     /* offset 336 (56 x 2 bytes) - date */
    {448, "Keyword1"}, /* offset 448 (2 x 56 x 2 bytes) - keywords */
    {560, "Keyword2"},

    {672, "Etc0"}, /* offset 672 (3 x 56 x 2 bytes) - etc */
    {784, "Etc1"},
    {896, "Etc2"}};
/* Entry count of hwp3_docsummary_fields; parenthesised so it stays correct inside larger expressions. */
#define NUM_DOCSUMMARY_FIELDS (sizeof(hwp3_docsummary_fields) / sizeof(hwp3_docsummary_fields[0]))
485 
486 //Document Paragraph Information - (43 or 230 total bytes)
487 #define HWP3_PARAINFO_SIZE_S 43
488 #define HWP3_PARAINFO_SIZE_L 230
489 #define HWP3_LINEINFO_SIZE 14
490 #define HWP3_CHARSHPDATA_SIZE 31
491 
492 #define HWP3_FIELD_LENGTH 512
493 
494 #define PI_PPFS 0    /* offset 0 (1 byte)  - prior paragraph format style */
495 #define PI_NCHARS 1  /* offset 1 (2 bytes) - character count */
496 #define PI_NLINES 3  /* offset 3 (2 bytes) - line count */
497 #define PI_IFSC 5    /* offset 5 (1 byte)  - including font style of characters */
498 #define PI_FLAGS 6   /* offset 6 (1 byte)  - other flags */
499 #define PI_SPECIAL 7 /* offset 7 (4 bytes) - special characters markers */
500 #define PI_ISTYLE 11 /* offset 11 (1 byte) - paragraph style index */
501 
502 #define PLI_LOFF 0  /* offset 0 (2 bytes) - line starting offset */
503 #define PLI_LCOR 2  /* offset 2 (2 bytes) - line blank correction */
504 #define PLI_LHEI 4  /* offset 4 (2 bytes) - line max char height */
505 #define PLI_LPAG 12 /* offset 12 (2 bytes) - line pagination*/
506 
507 #define PCSD_SIZE 0  /* offset 0 (2 bytes) - size of characters */
508 #define PCSD_PROP 26 /* offset 26 (1 byte) - properties */
509 
parsehwp3_docinfo(cli_ctx * ctx,size_t offset,struct hwp3_docinfo * docinfo)510 static inline cl_error_t parsehwp3_docinfo(cli_ctx *ctx, size_t offset, struct hwp3_docinfo *docinfo)
511 {
512     const uint8_t *hwp3_ptr;
513     cl_error_t iret;
514 
515     //TODO: use fmap_readn?
516     if (!(hwp3_ptr = fmap_need_off_once(ctx->fmap, offset, HWP3_DOCINFO_SIZE))) {
517         cli_errmsg("HWP3.x: Failed to read fmap for hwp docinfo\n");
518         return CL_EMAP;
519     }
520 
521     memcpy(&(docinfo->di_writeprot), hwp3_ptr + DI_WRITEPROT, sizeof(docinfo->di_writeprot));
522     memcpy(&(docinfo->di_externapp), hwp3_ptr + DI_EXTERNAPP, sizeof(docinfo->di_externapp));
523     memcpy(&(docinfo->di_passwd), hwp3_ptr + DI_PASSWD, sizeof(docinfo->di_passwd));
524     memcpy(&(docinfo->di_compressed), hwp3_ptr + DI_COMPRESSED, sizeof(docinfo->di_compressed));
525     memcpy(&(docinfo->di_infoblksize), hwp3_ptr + DI_INFOBLKSIZE, sizeof(docinfo->di_infoblksize));
526 
527     docinfo->di_writeprot   = le32_to_host(docinfo->di_writeprot);
528     docinfo->di_externapp   = le16_to_host(docinfo->di_externapp);
529     docinfo->di_passwd      = le16_to_host(docinfo->di_passwd);
530     docinfo->di_infoblksize = le16_to_host(docinfo->di_infoblksize);
531 
532     hwp3_debug("HWP3.x: di_writeprot:   %u\n", docinfo->di_writeprot);
533     hwp3_debug("HWP3.x: di_externapp:   %u\n", docinfo->di_externapp);
534     hwp3_debug("HWP3.x: di_passwd:      %u\n", docinfo->di_passwd);
535     hwp3_debug("HWP3.x: di_compressed:  %u\n", docinfo->di_compressed);
536     hwp3_debug("HWP3.x: di_infoblksize: %u\n", docinfo->di_infoblksize);
537 
538 #if HAVE_JSON
539     if (SCAN_COLLECT_METADATA) {
540         json_object *header, *flags;
541         char *str;
542 
543         header = cli_jsonobj(ctx->wrkproperty, "Hwp3Header");
544         if (!header) {
545             cli_errmsg("HWP3.x: No memory for Hwp3Header object\n");
546             return CL_EMEM;
547         }
548 
549         flags = cli_jsonarray(header, "Flags");
550         if (!flags) {
551             cli_errmsg("HWP5.x: No memory for Hwp5Header/Flags array\n");
552             return CL_EMEM;
553         }
554 
555         if (docinfo->di_writeprot) {
556             cli_jsonstr(flags, NULL, "HWP3_WRITEPROTECTED"); /* HWP3_DISTRIBUTABLE */
557         }
558         if (docinfo->di_externapp) {
559             cli_jsonstr(flags, NULL, "HWP3_EXTERNALAPPLICATION");
560         }
561         if (docinfo->di_passwd) {
562             cli_jsonstr(flags, NULL, "HWP3_PASSWORD");
563         }
564         if (docinfo->di_compressed) {
565             cli_jsonstr(flags, NULL, "HWP3_COMPRESSED");
566         }
567 
568         /* Printed File Name */
569         str = convert_hstr_to_utf8((char *)(hwp3_ptr + DI_PNAME), 40, "HWP3.x", &iret);
570         if (!str)
571             return CL_EMEM;
572 
573         if (iret == CL_VIRUS)
574             cli_jsonbool(header, "PrintName_base64", 1);
575 
576         hwp3_debug("HWP3.x: di_pname:   %s\n", str);
577         cli_jsonstr(header, "PrintName", str);
578         free(str);
579 
580         /* Annotation */
581         str = convert_hstr_to_utf8((char *)(hwp3_ptr + DI_ANNOTE), 24, "HWP3.x", &iret);
582         if (!str)
583             return CL_EMEM;
584 
585         if (iret == CL_VIRUS)
586             cli_jsonbool(header, "Annotation_base64", 1);
587 
588         hwp3_debug("HWP3.x: di_annote:  %s\n", str);
589         cli_jsonstr(header, "Annotation", str);
590         free(str);
591     }
592 #endif
593 
594     return CL_SUCCESS;
595 }
596 
parsehwp3_docsummary(cli_ctx * ctx,size_t offset)597 static inline cl_error_t parsehwp3_docsummary(cli_ctx *ctx, size_t offset)
598 {
599 #if HAVE_JSON
600     const uint8_t *hwp3_ptr;
601     char *str;
602     size_t i;
603     cl_error_t ret, iret;
604 
605     json_object *summary;
606 
607     if (!SCAN_COLLECT_METADATA)
608         return CL_SUCCESS;
609 
610     if (!(hwp3_ptr = fmap_need_off_once(ctx->fmap, offset, HWP3_DOCSUMMARY_SIZE))) {
611         cli_errmsg("HWP3.x: Failed to read fmap for hwp docinfo\n");
612         return CL_EMAP;
613     }
614 
615     summary = cli_jsonobj(ctx->wrkproperty, "Hwp3SummaryInfo");
616     if (!summary) {
617         cli_errmsg("HWP3.x: No memory for json object\n");
618         return CL_EMEM;
619     }
620 
621     for (i = 0; i < NUM_DOCSUMMARY_FIELDS; i++) {
622         str = convert_hstr_to_utf8((char *)(hwp3_ptr + hwp3_docsummary_fields[i].offset), 112, "HWP3.x", &iret);
623         if (!str)
624             return CL_EMEM;
625 
626         if (iret == CL_VIRUS) {
627             char *b64;
628             size_t b64len = strlen(hwp3_docsummary_fields[i].name) + 8;
629             b64           = cli_calloc(1, b64len);
630             if (!b64) {
631                 cli_errmsg("HWP3.x: Failed to allocate memory for b64 boolean\n");
632                 free(str);
633                 return CL_EMEM;
634             }
635             snprintf(b64, b64len, "%s_base64", hwp3_docsummary_fields[i].name);
636             cli_jsonbool(summary, b64, 1);
637             free(b64);
638         }
639 
640         hwp3_debug("HWP3.x: %s, %s\n", hwp3_docsummary_fields[i].name, str);
641         ret = cli_jsonstr(summary, hwp3_docsummary_fields[i].name, str);
642         free(str);
643         if (ret != CL_SUCCESS)
644             return ret;
645     }
646 #else
647     UNUSEDPARAM(ctx);
648     UNUSEDPARAM(offset);
649 #endif
650     return CL_SUCCESS;
651 }
652 
/*
 * HWP3_PSPECIAL_VERIFY(map, offset, second, id, match)
 *
 * Optional consistency check for special-character blocks. With HWP3_VERIFY
 * enabled it reads a 16-bit value from `map` at `offset + second` into
 * `match`, converts it from little endian, and compares it with `id`. On a
 * read failure or a mismatch it RETURNS from the enclosing function with
 * CL_EREAD or CL_EFORMAT respectively. With HWP3_VERIFY disabled it expands
 * to nothing.
 *
 * Every argument is parenthesised so that expressions can be passed safely;
 * `match` must be an lvalue.
 */
#if HWP3_VERIFY
#define HWP3_PSPECIAL_VERIFY(map, offset, second, id, match)                                  \
    do {                                                                                      \
        if (fmap_readn((map), &(match), (offset) + (second), sizeof(match)) != sizeof(match)) \
            return CL_EREAD;                                                                  \
                                                                                              \
        (match) = le16_to_host(match);                                                        \
                                                                                              \
        if ((id) != (match)) {                                                                \
            cli_errmsg("HWP3.x: ID %u block fails verification\n", (id));                     \
            return CL_EFORMAT;                                                                \
        }                                                                                     \
    } while (0)

#else
#define HWP3_PSPECIAL_VERIFY(map, offset, second, id, match)
#endif
670 
parsehwp3_paragraph(cli_ctx * ctx,fmap_t * map,int p,uint32_t level,size_t * roffset,int * last)671 static inline cl_error_t parsehwp3_paragraph(cli_ctx *ctx, fmap_t *map, int p, uint32_t level, size_t *roffset, int *last)
672 {
673     cl_error_t ret = CL_SUCCESS;
674 
675     size_t offset = *roffset;
676     size_t new_offset;
677     uint16_t nchars, nlines, content;
678     uint8_t ppfs, ifsc, cfsb;
679     uint16_t i;
680     int c, l, sp = 0, term = 0;
681 #if HWP3_VERIFY
682     uint16_t match;
683 #endif
684 #if HWP3_DEBUG
685     /* other paragraph info */
686     uint8_t flags, istyle;
687     uint16_t fsize;
688     uint32_t special;
689 
690     /* line info */
691     uint16_t loff, lcor, lhei, lpag;
692 
693     /* char shape data */
694     uint16_t pcsd_size;
695     uint8_t pcsd_prop;
696 #endif
697 
698     hwp3_debug("HWP3.x: recursion level: %u\n", level);
699     hwp3_debug("HWP3.x: Paragraph[%u, %d] starts @ offset %zu\n", level, p, offset);
700 
701     if (level >= ctx->engine->maxrechwp3)
702         return CL_EMAXREC;
703 
704     if (fmap_readn(map, &ppfs, offset + PI_PPFS, sizeof(ppfs)) != sizeof(ppfs))
705         return CL_EREAD;
706 
707     if (fmap_readn(map, &nchars, offset + PI_NCHARS, sizeof(nchars)) != sizeof(nchars))
708         return CL_EREAD;
709 
710     nchars = le16_to_host(nchars);
711 
712     if (fmap_readn(map, &nlines, offset + PI_NLINES, sizeof(nlines)) != sizeof(nlines))
713         return CL_EREAD;
714 
715     nlines = le16_to_host(nlines);
716 
717     if (fmap_readn(map, &ifsc, offset + PI_IFSC, sizeof(ifsc)) != sizeof(ifsc))
718         return CL_EREAD;
719 
720     hwp3_debug("HWP3.x: Paragraph[%u, %d]: ppfs   %u\n", level, p, ppfs);
721     hwp3_debug("HWP3.x: Paragraph[%u, %d]: nchars %u\n", level, p, nchars);
722     hwp3_debug("HWP3.x: Paragraph[%u, %d]: nlines %u\n", level, p, nlines);
723     hwp3_debug("HWP3.x: Paragraph[%u, %d]: ifsc   %u\n", level, p, ifsc);
724 
725 #if HWP3_DEBUG
726     if (fmap_readn(map, &flags, offset + PI_FLAGS, sizeof(flags)) != sizeof(flags))
727         return CL_EREAD;
728 
729     if (fmap_readn(map, &special, offset + PI_SPECIAL, sizeof(special)) != sizeof(special))
730         return CL_EREAD;
731 
732     if (fmap_readn(map, &istyle, offset + PI_ISTYLE, sizeof(istyle)) != sizeof(istyle))
733         return CL_EREAD;
734 
735     if (fmap_readn(map, &fsize, offset + 12, sizeof(fsize)) != sizeof(fsize))
736         return CL_EREAD;
737 
738     hwp3_debug("HWP3.x: Paragraph[%u, %d]: flags  %x\n", level, p, flags);
739     hwp3_debug("HWP3.x: Paragraph[%u, %d]: spcl   %x\n", level, p, special);
740     hwp3_debug("HWP3.x: Paragraph[%u, %d]: istyle %u\n", level, p, istyle);
741     hwp3_debug("HWP3.x: Paragraph[%u, %d]: fsize  %u\n", level, p, fsize);
742 #endif
743 
744     /* detected empty paragraph marker => end-of-paragraph list */
745     if (nchars == 0) {
746         hwp3_debug("HWP3.x: Detected end-of-paragraph list @ offset %zu\n", offset);
747         hwp3_debug("HWP3.x: end recursion level: %u\n", level);
748         (*roffset) = offset + HWP3_PARAINFO_SIZE_S;
749         (*last)    = 1;
750         return CL_SUCCESS;
751     }
752 
753     if (ppfs)
754         offset += HWP3_PARAINFO_SIZE_S;
755     else
756         offset += HWP3_PARAINFO_SIZE_L;
757 
758         /* line information blocks */
759 #if HWP3_DEBUG
760     for (i = 0; (i < nlines) && (offset < map->len); i++) {
761         hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d information starts @ offset %zu\n", level, p, i, offset);
762         if (fmap_readn(map, &loff, offset + PLI_LOFF, sizeof(loff)) != sizeof(loff))
763             return CL_EREAD;
764 
765         if (fmap_readn(map, &lcor, offset + PLI_LCOR, sizeof(lcor)) != sizeof(lcor))
766             return CL_EREAD;
767 
768         if (fmap_readn(map, &lhei, offset + PLI_LHEI, sizeof(lhei)) != sizeof(lhei))
769             return CL_EREAD;
770 
771         if (fmap_readn(map, &lpag, offset + PLI_LPAG, sizeof(lpag)) != sizeof(lpag))
772             return CL_EREAD;
773 
774         loff = le16_to_host(loff);
775         lcor = le16_to_host(lcor);
776         lhei = le16_to_host(lhei);
777         lpag = le16_to_host(lpag);
778 
779         hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d: loff %u\n", level, p, i, loff);
780         hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d: lcor %x\n", level, p, i, lcor);
781         hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d: lhei %u\n", level, p, i, lhei);
782         hwp3_debug("HWP3.x: Paragraph[%u, %d]: Line %d: lpag %u\n", level, p, i, lpag);
783 
784         offset += HWP3_LINEINFO_SIZE;
785     }
786 #else
787     new_offset = offset + (nlines * HWP3_LINEINFO_SIZE);
788     if ((new_offset < offset) || (new_offset >= map->len)) {
789         cli_errmsg("HWP3.x: Paragraph[%u, %d]: nlines value is too high, invalid. %u\n", level, p, nlines);
790         return CL_EPARSE;
791     }
792     offset = new_offset;
793 #endif
794 
795     if (offset >= map->len)
796         return CL_EFORMAT;
797 
798     if (ifsc) {
799         for (i = 0, c = 0; i < nchars; i++) {
800             /* examine byte for cs data type */
801             if (fmap_readn(map, &cfsb, offset, sizeof(cfsb)) != sizeof(cfsb))
802                 return CL_EREAD;
803 
804             offset += sizeof(cfsb);
805 
806             switch (cfsb) {
807                 case 0: /* character shape block */
808                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: character font style data @ offset %zu\n", level, p, offset);
809 
810 #if HWP3_DEBUG
811                     if (fmap_readn(map, &pcsd_size, offset + PCSD_SIZE, sizeof(pcsd_size)) != sizeof(pcsd_size))
812                         return CL_EREAD;
813 
814                     if (fmap_readn(map, &pcsd_prop, offset + PCSD_PROP, sizeof(pcsd_prop)) != sizeof(pcsd_prop))
815                         return CL_EREAD;
816 
817                     pcsd_size = le16_to_host(pcsd_size);
818 
819                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: CFS %u: pcsd_size %u\n", level, p, 0, pcsd_size);
820                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: CFS %u: pcsd_prop %x\n", level, p, 0, pcsd_prop);
821 #endif
822 
823                     c++;
824                     offset += HWP3_CHARSHPDATA_SIZE;
825                     break;
826                 case 1: /* normal character - as representation of another character for previous cs block */
827                     break;
828                 default:
829                     cli_errmsg("HWP3.x: Paragraph[%u, %d]: unknown CFS type 0x%x @ offset %zu\n", level, p, cfsb, offset);
830                     cli_errmsg("HWP3.x: Paragraph parsing detected %d of %u characters\n", i, nchars);
831                     return CL_EPARSE;
832             }
833         }
834 
835         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected %d CFS block(s) and %d characters\n", level, p, c, i);
836     } else {
837         hwp3_debug("HWP3.x: Paragraph[%u, %d]: separate character font style segment not stored\n", level, p);
838     }
839 
840     if (!term)
841         hwp3_debug("HWP3.x: Paragraph[%u, %d]: content starts @ offset %zu\n", level, p, offset);
842 
843     /* scan for end-of-paragraph [0x0d00 on offset parity to current content] */
844     while ((!term) &&
845            (offset < map->len)) {
846 
847         if (fmap_readn(map, &content, offset, sizeof(content)) != sizeof(content))
848             return CL_EREAD;
849 
850         content = le16_to_host(content);
851 
852         /* special character handling */
853         if (content < 32) {
854             hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected special character %u @ offset %zu\n", level, p, content, offset);
855 
856             switch (content) {
857                 case 0:
858                 case 1:
859                 case 2:
860                 case 3:
861                 case 4:
862                 case 12:
863                 case 27: {
864                     /* reserved */
865                     uint32_t length;
866 
867                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected special character as [reserved]\n", level, p);
868 
869                     /*
870                      * offset 0 (2 bytes) - special character ID
871                      * offset 2 (4 bytes) - length of information = n
872                      * offset 6 (2 bytes) - special character ID
873                      * offset 8 (n bytes) - information
874                      */
875 
876                     /* id block verification (only on HWP3_VERIFY) */
877                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
878 
879                     if (fmap_readn(map, &length, offset + 2, sizeof(length)) != sizeof(length))
880                         return CL_EREAD;
881 
882                     length     = le32_to_host(length);
883                     new_offset = offset + (8 + length);
884                     if ((new_offset <= offset) || (new_offset > map->len)) {
885                         cli_errmsg("HWP3.x: Paragraph[%u, %d]: length value is too high, invalid. %u\n", level, p, length);
886                         return CL_EPARSE;
887                     }
888                     offset = new_offset;
889 
890 #if HWP3_DEBUG
891                     cli_errmsg("HWP3.x: Paragraph[%u, %d]: possible invalid usage of reserved special character %u\n", level, p, content);
892                     return CL_EFORMAT;
893 #endif
894                     break;
895                 }
896                 case 5: /* field codes */
897                 {
898                     uint32_t length;
899 
900                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected field code marker @ offset %zu\n", level, p, offset);
901 
902                     /*
903                      * offset 0 (2 bytes) - special character ID
904                      * offset 2 (4 bytes) - length of information = n
905                      * offset 6 (2 bytes) - special character ID
906                      * offset 8 (n bytes) - field code details
907                      */
908 
909                     /* id block verification (only on HWP3_VERIFY) */
910                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
911 
912                     if (fmap_readn(map, &length, offset + 2, sizeof(length)) != sizeof(length))
913                         return CL_EREAD;
914 
915                     length     = le32_to_host(length);
916                     new_offset = offset + (8 + length);
917                     if ((new_offset <= offset) || (new_offset > map->len)) {
918                         cli_errmsg("HWP3.x: Paragraph[%u, %d]: length value is too high, invalid. %u\n", level, p, length);
919                         return CL_EPARSE;
920                     }
921                     offset = new_offset;
922                     break;
923                 }
924                 case 6: /* bookmark */
925                 {
926 #if HWP3_VERIFY
927                     uint32_t length;
928 #endif
929 
930                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected bookmark marker @ offset %zu\n", level, p, offset);
931 
932                     /*
933                      * offset 0 (2 bytes) - special character ID
934                      * offset 2 (4 bytes) - length of information = 34
935                      * offset 6 (2 bytes) - special character ID
936                      * offset 8 (16 x 2 bytes) - bookmark name
937                      * offset 40 (2 bytes) - bookmark type
938                      * total is always 42 bytes
939                      */
940 
941 #if HWP3_VERIFY
942                     /* id block verification (only on HWP3_VERIFY) */
943                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
944 
945                     /* length check - always 34 bytes */
946                     if (fmap_readn(map, &length, offset + 2, sizeof(length)) != sizeof(length))
947                         return CL_EREAD;
948 
949                     length = le32_to_host(length);
950 
951                     if (length != 34) {
952                         cli_errmsg("HWP3.x: Bookmark has incorrect length: %u != 34)\n", length);
953                         return CL_EFORMAT;
954                     }
955 #endif
956                     offset += 42;
957                     break;
958                 }
959                 case 7: /* date format */
960                 {
961                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected date format marker @ offset %zu\n", level, p, offset);
962 
963                     /*
964                      * offset 0 (2 bytes) - special character ID
965                      * offset 2 (40 x 2 bytes) - date format as user-defined dialog
966                      * offset 82 (2 bytes) - special character ID
967                      * total is always 84 bytes
968                      */
969 
970                     /* id block verification (only on HWP3_VERIFY) */
971                     HWP3_PSPECIAL_VERIFY(map, offset, 82, content, match);
972 
973                     offset += 84;
974                     break;
975                 }
976                 case 8: /* date code */
977                 {
978                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected date code marker @ offset %zu\n", level, p, offset);
979 
980                     /*
981                      * offset 0 (2 bytes) - special character ID
982                      * offset 2 (40 x 2 bytes) - date format string
983                      * offset 82 (4 x 2 bytes) - date (year, month, day of week)
984                      * offset 90 (2 x 2 bytes) - time (hour, minute)
985                      * offset 94 (2 bytes) - special character ID
986                      * total is always 96 bytes
987                      */
988 
989                     /* id block verification (only on HWP3_VERIFY) */
990                     HWP3_PSPECIAL_VERIFY(map, offset, 94, content, match);
991 
992                     offset += 96;
993                     break;
994                 }
995                 case 9: /* tab */
996                 {
997                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected tab marker @ offset %zu\n", level, p, offset);
998 
999                     /*
1000                      * offset 0 (2 bytes) - special character ID
1001                      * offset 2 (2 bytes) - tab width
1002                      * offset 4 (2 bytes) - unknown(?)
1003                      * offset 6 (2 bytes) - special character ID
1004                      * total is always 8 bytes
1005                      */
1006 
1007                     /* id block verification (only on HWP3_VERIFY) */
1008                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1009 
1010                     offset += 8;
1011                     break;
1012                 }
1013                 case 10: /* table, test box, equation, button, hypertext */
1014                 {
1015                     uint16_t ncells;
1016 #if HWP3_DEBUG
1017                     uint16_t type;
1018 #endif
1019                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected box object marker @ offset %zu\n", level, p, offset);
1020 
1021                     /* verification (only on HWP3_VERIFY) */
1022                     /* id block verify */
1023                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1024                     /* extra data block verify */
1025                     HWP3_PSPECIAL_VERIFY(map, offset, 24, content, match);
1026 
1027                     /* ID block is 8 bytes */
1028                     offset += 8;
1029 
1030                     /* box information (84 bytes) */
1031 #if HWP3_DEBUG
1032                     /* box type located at offset 78 of box information */
1033                     if (fmap_readn(map, &type, offset + 78, sizeof(type)) != sizeof(type))
1034                         return CL_EREAD;
1035 
1036                     type = le16_to_host(type);
1037                     if (type == 0)
1038                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as table\n", level, p);
1039                     else if (type == 1)
1040                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as text box\n", level, p);
1041                     else if (type == 2)
1042                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as equation\n", level, p);
1043                     else if (type == 3)
1044                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as button\n", level, p);
1045                     else
1046                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object detected as UNKNOWN(%u)\n", level, p, type);
1047 #endif
1048 
1049                     /* ncells is located at offset 80 of box information */
1050                     if (fmap_readn(map, &ncells, offset + 80, sizeof(ncells)) != sizeof(ncells))
1051                         return CL_EREAD;
1052 
1053                     ncells = le16_to_host(ncells);
1054                     offset += 84;
1055 
1056                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: box object contains %u cell(s)\n", level, p, ncells);
1057 
1058                     /* cell information (27 bytes x ncells(offset 80 of table)) */
1059                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: box cell info array starts @ %zu\n", level, p, offset);
1060 
1061                     new_offset = offset + (27 * ncells);
1062                     if ((new_offset < offset) || (new_offset >= map->len)) {
1063                         cli_errmsg("HWP3.x: Paragraph[%u, %d]: number of box cells is too high, invalid. %u\n", level, p, ncells);
1064                         return CL_EPARSE;
1065                     }
1066                     offset = new_offset;
1067 
1068                     /* cell paragraph list */
1069                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: box cell paragraph list starts @ %zu\n", level, p, offset);
1070                     for (i = 0; i < ncells; i++) {
1071                         l = 0;
1072                         while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1073                         if (ret != CL_SUCCESS)
1074                             return ret;
1075                     }
1076 
1077                     /* box caption paragraph list */
1078                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: box cell caption paragraph list starts @ %zu\n", level, p, offset);
1079                     l = 0;
1080                     while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1081                     if (ret != CL_SUCCESS)
1082                         return ret;
1083                     break;
1084                 }
1085                 case 11: /* drawing */
1086                 {
1087                     uint32_t size;
1088                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected drawing marker @ offset %zu\n", level, p, offset);
1089 
1090                     /* verification (only on HWP3_VERIFY) */
1091                     /* id block verify */
1092                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1093                     /* extra data block verify */
1094                     HWP3_PSPECIAL_VERIFY(map, offset, 24, content, match);
1095 
1096                     /* ID block is 8 bytes */
1097                     offset += 8;
1098 
1099                     /* Drawing Info Block is 328+n bytes with n = size of image */
1100                     /* n is located at offset 0 of info block */
1101                     if (fmap_readn(map, &size, offset, sizeof(size)) != sizeof(size))
1102                         return CL_EREAD;
1103 
1104                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: drawing is %u additional bytes\n", level, p, size);
1105 
1106                     size       = le32_to_host(size);
1107                     new_offset = offset + (348 + size);
1108                     if ((new_offset <= offset) || (new_offset >= map->len)) {
1109                         cli_errmsg("HWP3.x: Paragraph[%u, %d]: image size value is too high, invalid. %u\n", level, p, size);
1110                         return CL_EPARSE;
1111                     }
1112                     offset = new_offset;
1113 
1114                     /* caption paragraph list */
1115                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: drawing caption paragraph list starts @ %zu\n", level, p, offset);
1116                     l = 0;
1117                     while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1118                     if (ret != CL_SUCCESS)
1119                         return ret;
1120                     break;
1121                 }
1122                 case 13: /* end-of-paragraph marker - treated identically as character */
1123                     hwp3_debug("HWP3.x: Detected end-of-paragraph marker @ offset %zu\n", offset);
1124                     term = 1;
1125 
1126                     offset += sizeof(content);
1127                     break;
1128                 case 14: /* line information */
1129                 {
1130                     hwp3_debug("HWP3.x: Detected line information marker @ offset %zu\n", offset);
1131 
1132                     /* verification (only on HWP3_VERIFY) */
1133                     /* id block verify */
1134                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1135                     /* extra data block verify */
1136                     HWP3_PSPECIAL_VERIFY(map, offset, 24, content, match);
1137 
1138                     /* ID block is 8 bytes + line information is always 84 bytes */
1139                     offset += 92;
1140                     break;
1141                 }
1142                 case 15: /* hidden description */
1143                 {
1144                     hwp3_debug("HWP3.x: Detected hidden description marker @ offset %zu\n", offset);
1145 
1146                     /*
1147                      * offset 0 (2 bytes) - special character ID
1148                      * offset 2 (4 bytes) - reserved
1149                      * offset 6 (2 bytes) - special character ID
1150                      * offset 8 (8 bytes) - reserved
1151                      * total is always 16 bytes
1152                      */
1153 
1154                     /* id block verification (only on HWP3_VERIFY) */
1155                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1156 
1157                     offset += 16;
1158 
1159                     /* hidden description paragraph list */
1160                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: hidden description paragraph list starts @ %zu\n", level, p, offset);
1161                     l = 0;
1162                     while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1163                     if (ret != CL_SUCCESS)
1164                         return ret;
1165                     break;
1166                 }
1167                 case 16: /* header/footer */
1168                 {
1169 #if HWP3_DEBUG
1170                     uint8_t type;
1171 #endif
1172 
1173                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected header/footer marker @ offset %zu\n", level, p, offset);
1174 
1175                     /*
1176                      * offset 0 (2 bytes) - special character ID
1177                      * offset 2 (4 bytes) - reserved
1178                      * offset 6 (2 bytes) - special character ID
1179                      * offset 8 (8 x 1 byte) - reserved
1180                      * offset 16 (1 byte) - type (header/footer)
1181                      * offset 17 (1 byte) - kind
1182                      * total is always 18 bytes
1183                      */
1184 
1185                     /* id block verification (only on HWP3_VERIFY) */
1186                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1187 
1188 #if HWP3_DEBUG
1189                     if (fmap_readn(map, &type, offset + 16, sizeof(type)) != sizeof(type))
1190                         return CL_EREAD;
1191 
1192                     if (type == 0)
1193                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected header/footer as header\n", level, p);
1194                     else if (type == 1)
1195                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected header/footer as footer\n", level, p);
1196                     else
1197                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected header/footer as UNKNOWN(%u)\n", level, p, type);
1198 #endif
1199                     offset += 18;
1200 
1201                     /* content paragraph list */
1202                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: header/footer paragraph list starts @ %zu\n", level, p, offset);
1203                     l = 0;
1204                     while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1205                     if (ret != CL_SUCCESS)
1206                         return ret;
1207                     break;
1208                 }
1209                 case 17: /* footnote/endnote */
1210                 {
1211                     hwp3_debug("HWP3.x: Detected footnote/endnote marker @ offset %zu\n", offset);
1212 
1213                     /*
1214                      * offset 0 (2 bytes) - special character ID
1215                      * offset 2 (4 bytes) - reserved
1216                      * offset 6 (2 bytes) - special character ID
1217                      * offset 8 (8 x 1 bytes) - reserved
1218                      * offset 16 (2 bytes) - number
1219                      * offset 18 (2 bytes) - type
1220                      * offset 20 (2 bytes) - alignment
1221                      * total is always 22 bytes
1222                      */
1223 
1224                     /* id block verification (only on HWP3_VERIFY) */
1225                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1226 
1227                     offset += 22;
1228 
1229                     /* content paragraph list */
1230                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: footnote/endnote paragraph list starts @ %zu\n", level, p, offset);
1231                     l = 0;
1232                     while (!l && ((ret = parsehwp3_paragraph(ctx, map, sp++, level + 1, &offset, &l)) == CL_SUCCESS)) continue;
1233                     if (ret != CL_SUCCESS)
1234                         return ret;
1235                     break;
1236                 }
1237                 case 18: /* paste code number */
1238                 {
1239 #if HWP3_DEBUG
1240                     uint8_t type;
1241 #endif
1242 
1243                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number marker @ offset %zu\n", level, p, offset);
1244 
1245                     /*
1246                      * offset 0 (2 bytes) - special character ID
1247                      * offset 2 (2 bytes) - type
1248                      * offset 4 (2 bytes) - number value
1249                      * offset 6 (2 bytes) - special character ID
1250                      * total is always 8 bytes
1251                      */
1252 
1253                     /* id block verification (only on HWP3_VERIFY) */
1254                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1255 
1256 #if HWP3_DEBUG
1257                     if (fmap_readn(map, &type, offset + 2, sizeof(type)) != sizeof(type))
1258                         return CL_EREAD;
1259 
1260                     if (type == 0)
1261                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as side\n", level, p);
1262                     else if (type == 1)
1263                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as footnote\n", level, p);
1264                     else if (type == 2)
1265                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as North America???\n", level, p);
1266                     else if (type == 3)
1267                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as drawing\n", level, p);
1268                     else if (type == 4)
1269                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as table\n", level, p);
1270                     else if (type == 5)
1271                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as equation\n", level, p);
1272                     else
1273                         hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected paste code number as UNKNOWN(%u)\n", level, p, type);
1274 #endif
1275                     offset += 8;
1276                     break;
1277                 }
1278                 case 19: /* code number change */
1279                 {
1280                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected code number change marker @ offset %zu\n", level, p, offset);
1281 
1282                     /*
1283                      * offset 0 (2 bytes) - special character ID
1284                      * offset 2 (2 bytes) - type
1285                      * offset 4 (2 bytes) - new number value
1286                      * offset 6 (2 bytes) - special character ID
1287                      * total is always 8 bytes
1288                      */
1289 
1290                     /* id block verification (only on HWP3_VERIFY) */
1291                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1292 
1293                     offset += 8;
1294                     break;
1295                 }
1296                 case 20: {
1297                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected thread page number marker @ offset %zu\n", level, p, offset);
1298 
1299                     /*
1300                      * offset 0 (2 bytes) - special character ID
1301                      * offset 2 (2 bytes) - location
1302                      * offset 4 (2 bytes) - shape
1303                      * offset 6 (2 bytes) - special character ID
1304                      * total is always 8 bytes
1305                      */
1306 
1307                     /* id block verification (only on HWP3_VERIFY) */
1308                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1309 
1310                     offset += 8;
1311                     break;
1312                 }
1313                 case 21: /* hide special */
1314                 {
1315                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected hide special marker @ offset %zu\n", level, p, offset);
1316 
1317                     /*
1318                      * offset 0 (2 bytes) - special character ID
1319                      * offset 2 (2 bytes) - type
1320                      * offset 4 (2 bytes) - target
1321                      * offset 6 (2 bytes) - special character ID
1322                      * total is always 8 bytes
1323                      */
1324 
1325                     /* id block verification (only on HWP3_VERIFY) */
1326                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1327 
1328                     offset += 8;
1329                     break;
1330                 }
1331                 case 22: /* mail merge display */
1332                 {
1333                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected mail merge display marker @ offset %zu\n", level, p, offset);
1334 
1335                     /*
1336                      * offset 0 (2 bytes) - special character ID
1337                      * offset 2 (20 x 1 bytes) - field name (in ASCII)
1338                      * offset 22 (2 bytes) - special character ID
1339                      * total is always 24 bytes
1340                      */
1341 
1342                     /* id block verification (only on HWP3_VERIFY) */
1343                     HWP3_PSPECIAL_VERIFY(map, offset, 22, content, match);
1344 
1345                     offset += 24;
1346                     break;
1347                 }
1348                 case 23: /* overlapping letters */
1349                 {
1350                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected overlapping marker @ offset %zu\n", level, p, offset);
1351 
1352                     /*
1353                      * offset 0 (2 bytes) - special character ID
1354                      * offset 2 (3 x 2 bytes) - overlapping letters
1355                      * offset 8 (2 bytes) - special character ID
1356                      * total is always 10 bytes
1357                      */
1358 
1359                     /* id block verification (only on HWP3_VERIFY) */
1360                     HWP3_PSPECIAL_VERIFY(map, offset, 8, content, match);
1361 
1362                     offset += 10;
1363                     break;
1364                 }
1365                 case 24: /* hyphen */
1366                 {
1367                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected hyphen marker @ offset %zu\n", level, p, offset);
1368 
1369                     /*
1370                      * offset 0 (2 bytes) - special character ID
1371                      * offset 2 (2 bytes) - width of hyphen
1372                      * offset 4 (2 bytes) - special character ID
1373                      * total is always 6 bytes
1374                      */
1375 
1376                     /* id block verification (only on HWP3_VERIFY) */
1377                     HWP3_PSPECIAL_VERIFY(map, offset, 4, content, match);
1378 
1379                     offset += 6;
1380                     break;
1381                 }
1382                 case 25: /* title/table/picture show times */
1383                 {
1384                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected title/table/picture show times marker @ offset %zu\n", level, p, offset);
1385 
1386                     /*
1387                      * offset 0 (2 bytes) - special character ID
1388                      * offset 2 (2 bytes) - type
1389                      * offset 4 (2 bytes) - special character ID
1390                      * total is always 6 bytes
1391                      */
1392 
1393                     /* id block verification (only on HWP3_VERIFY) */
1394                     HWP3_PSPECIAL_VERIFY(map, offset, 4, content, match);
1395 
1396                     offset += 6;
1397                     break;
1398                 }
1399                 case 26: /* browse displayed */
1400                 {
1401                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected browse displayed marker @ offset %zu\n", level, p, offset);
1402 
1403                     /*
1404                      * offset 0 (2 bytes) - special character ID
1405                      * offset 2 (60 x 2 bytes) - keyword 1
1406                      * offset 122 (60 x 2 bytes) - keyword 2
1407                      * offset 242 (2 bytes) - page number
1408                      * offset 244 (2 bytes) - special character ID
1409                      * total is always 246 bytes
1410                      */
1411 
1412                     /* id block verification (only on HWP3_VERIFY) */
1413                     HWP3_PSPECIAL_VERIFY(map, offset, 244, content, match);
1414 
1415                     offset += 246;
1416                     break;
1417                 }
1418                 case 28: /* overview shape/summary number */
1419                 {
1420                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected overview shape/summary number marker @ offset %zu\n", level, p, offset);
1421 
1422                     /*
1423                      * offset 0 (2 bytes) - special character ID
1424                      * offset 2 (2 bytes) - type
1425                      * offset 4 (1 byte)  - form
1426                      * offset 5 (1 byte)  - step
1427                      * offset 6 (7 x 2 bytes)  - summary number
1428                      * offset 20 (7 x 2 bytes) - custom
1429                      * offset 34 (2 x 7 x 2 bytes) - decorative letters
1430                      * offset 62 (2 bytes) - special character ID
1431                      * total is always 64 bytes
1432                      */
1433 
1434                     /* id block verification (only on HWP3_VERIFY) */
1435                     HWP3_PSPECIAL_VERIFY(map, offset, 62, content, match);
1436 
1437                     offset += 64;
1438                     break;
1439                 }
1440                 case 29: /* cross-reference */
1441                 {
1442                     uint32_t length;
1443 
1444                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected cross-reference marker @ offset %zu\n", level, p, offset);
1445 
1446                     /*
1447                      * offset 0 (2 bytes) - special character ID
1448                      * offset 2 (4 bytes) - length of information
1449                      * offset 6 (2 bytes) - special character ID
1450                      * offset 8 (n bytes) - ...
1451                      */
1452 
1453                     /* id block verification (only on HWP3_VERIFY) */
1454                     HWP3_PSPECIAL_VERIFY(map, offset, 6, content, match);
1455 
1456                     if (fmap_readn(map, &length, offset + 2, sizeof(length)) != sizeof(length))
1457                         return CL_EREAD;
1458 
1459                     length     = le32_to_host(length);
1460                     new_offset = offset + (8 + length);
1461                     if ((new_offset <= offset) || (new_offset > map->len)) {
1462                         cli_errmsg("HWP3.x: Paragraph[%u, %d]: length value is too high, invalid. %u\n", level, p, length);
1463                         return CL_EPARSE;
1464                     }
1465                     offset = new_offset;
1466                     break;
1467                 }
1468                 case 30: /* bundle of blanks (ON SALE for 2.99!) */
1469                 {
1470                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected title/table/picture show times marker @ offset %zu\n", level, p, offset);
1471 
1472                     /*
1473                      * offset 0 (2 bytes) - special character ID
1474                      * offset 2 (2 bytes) - special character ID
1475                      * total is always 4 bytes
1476                      */
1477 
1478                     /* id block verification (only on HWP3_VERIFY) */
1479                     HWP3_PSPECIAL_VERIFY(map, offset, 2, content, match);
1480 
1481                     offset += 4;
1482                     break;
1483                 }
1484                 case 31: /* fixed-width space */
1485                 {
1486                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected title/table/picture show times marker @ offset %zu\n", level, p, offset);
1487 
1488                     /*
1489                      * offset 0 (2 bytes) - special character ID
1490                      * offset 2 (2 bytes) - special character ID
1491                      * total is always 4 bytes
1492                      */
1493 
1494                     /* id block verification (only on HWP3_VERIFY) */
1495                     HWP3_PSPECIAL_VERIFY(map, offset, 2, content, match);
1496 
1497                     offset += 4;
1498                     break;
1499                 }
1500                 default:
1501                     hwp3_debug("HWP3.x: Paragraph[%u, %d]: detected special character as [UNKNOWN]\n", level, p);
1502                     cli_errmsg("HWP3.x: Paragraph[%u, %d]: cannot understand special character %u\n", level, p, content);
1503                     return CL_EPARSE;
1504             }
1505         } else { /* normal characters */
1506             offset += sizeof(content);
1507         }
1508     }
1509 
1510     hwp3_debug("HWP3.x: end recursion level: %d\n", level);
1511 
1512     (*roffset) = offset;
1513     return CL_SUCCESS;
1514 }
1515 
parsehwp3_infoblk_1(cli_ctx * ctx,fmap_t * dmap,size_t * offset,int * last)1516 static inline cl_error_t parsehwp3_infoblk_1(cli_ctx *ctx, fmap_t *dmap, size_t *offset, int *last)
1517 {
1518     cl_error_t ret = CL_SUCCESS;
1519 
1520     uint32_t infoid, infolen;
1521     fmap_t *map = (dmap ? dmap : ctx->fmap);
1522     int i, count;
1523     long long unsigned infoloc = (long long unsigned)(*offset);
1524 #if HWP3_DEBUG
1525     char field[HWP3_FIELD_LENGTH];
1526 #endif
1527 #if HAVE_JSON
1528     json_object *infoblk_1, *contents, *counter, *entry;
1529 #endif
1530 
1531     hwp3_debug("HWP3.x: Information Block @ offset %llu\n", infoloc);
1532 
1533 #if HAVE_JSON
1534     if (SCAN_COLLECT_METADATA) {
1535         infoblk_1 = cli_jsonobj(ctx->wrkproperty, "InfoBlk_1");
1536         if (!infoblk_1) {
1537             cli_errmsg("HWP5.x: No memory for information block object\n");
1538             return CL_EMEM;
1539         }
1540 
1541         contents = cli_jsonarray(infoblk_1, "Contents");
1542         if (!contents) {
1543             cli_errmsg("HWP5.x: No memory for information block contents array\n");
1544             return CL_EMEM;
1545         }
1546 
1547         if (!json_object_object_get_ex(infoblk_1, "Count", &counter)) { /* object not found */
1548             cli_jsonint(infoblk_1, "Count", 1);
1549         } else {
1550             int value = json_object_get_int(counter);
1551             cli_jsonint(infoblk_1, "Count", value + 1);
1552         }
1553     }
1554 #endif
1555 
1556     if (fmap_readn(map, &infoid, *offset, sizeof(infoid)) != sizeof(infoid)) {
1557         cli_errmsg("HWP3.x: Failed to read information block id @ %zu\n", *offset);
1558         return CL_EREAD;
1559     }
1560     *offset += sizeof(infoid);
1561     infoid = le32_to_host(infoid);
1562 
1563 #if HAVE_JSON
1564     if (SCAN_COLLECT_METADATA) {
1565         entry = cli_jsonobj(contents, NULL);
1566         if (!entry) {
1567             cli_errmsg("HWP5.x: No memory for information block entry object\n");
1568             return CL_EMEM;
1569         }
1570 
1571         cli_jsonint(entry, "ID", infoid);
1572     }
1573 #endif
1574     hwp3_debug("HWP3.x: Information Block[%llu]: ID:  %u\n", infoloc, infoid);
1575 
1576     /* Booking Information(5) - no length field and no content */
1577     if (infoid == 5) {
1578         hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Booking Information\n", infoloc);
1579 #if HAVE_JSON
1580         if (SCAN_COLLECT_METADATA)
1581             cli_jsonstr(entry, "Type", "Booking Information");
1582 #endif
1583         return CL_SUCCESS;
1584     }
1585 
1586     if (fmap_readn(map, &infolen, *offset, sizeof(infolen)) != sizeof(infolen)) {
1587         cli_errmsg("HWP3.x: Failed to read information block len @ %zu\n", *offset);
1588         return CL_EREAD;
1589     }
1590     *offset += sizeof(infolen);
1591     infolen = le32_to_host(infolen);
1592 
1593 #if HAVE_JSON
1594     if (SCAN_COLLECT_METADATA) {
1595         cli_jsonint64(entry, "Offset", infoloc);
1596         cli_jsonint(entry, "Length", infolen);
1597     }
1598 #endif
1599     hwp3_debug("HWP3.x: Information Block[%llu]: LEN: %u\n", infoloc, infolen);
1600 
1601     /* check information block bounds */
1602     if (*offset + infolen > map->len) {
1603         cli_errmsg("HWP3.x: Information blocks length exceeds remaining map length, %zu > %zu\n", *offset + infolen, map->len);
1604         return CL_EREAD;
1605     }
1606 
1607     /* Information Blocks */
1608     switch (infoid) {
1609         case 0: /* Terminating */
1610             if (infolen == 0) {
1611                 hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Terminating Entry\n", infoloc);
1612 #if HAVE_JSON
1613                 if (SCAN_COLLECT_METADATA)
1614                     cli_jsonstr(entry, "Type", "Terminating Entry");
1615 #endif
1616                 if (last) *last = 1;
1617                 return CL_SUCCESS;
1618             } else {
1619                 cli_errmsg("HWP3.x: Information Block[%llu]: TYPE: Invalid Terminating Entry\n", infoloc);
1620                 return CL_EFORMAT;
1621             }
1622         case 1: /* Image Data */
1623             hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Image Data\n", infoloc);
1624 #if HAVE_JSON
1625             if (SCAN_COLLECT_METADATA)
1626                 cli_jsonstr(entry, "Type", "Image Data");
1627 #endif
1628 #if HWP3_DEBUG /* additional fields can be added */
1629             memset(field, 0, HWP3_FIELD_LENGTH);
1630             if (fmap_readn(map, field, *offset, 16) != 16) {
1631                 cli_errmsg("HWP3.x: Failed to read information block field @ %zu\n", *offset);
1632                 return CL_EREAD;
1633             }
1634             hwp3_debug("HWP3.x: Information Block[%llu]: NAME: %s\n", infoloc, field);
1635 
1636             memset(field, 0, HWP3_FIELD_LENGTH);
1637             if (fmap_readn(map, field, *offset + 16, 16) != 16) {
1638                 cli_errmsg("HWP3.x: Failed to read information block field @ %zu\n", *offset);
1639                 return CL_EREAD;
1640             }
1641             hwp3_debug("HWP3.x: Information Block[%llu]: FORM: %s\n", infoloc, field);
1642 #endif
1643             /* 32 bytes for extra data fields */
1644             if (infolen > 0)
1645                 ret = cli_magic_scan_nested_fmap_type(map, *offset + 32, infolen - 32, ctx, CL_TYPE_ANY, NULL);
1646             break;
1647         case 2: /* OLE2 Data */
1648             hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: OLE2 Data\n", infoloc);
1649 #if HAVE_JSON
1650             if (SCAN_COLLECT_METADATA)
1651                 cli_jsonstr(entry, "Type", "OLE2 Data");
1652 #endif
1653             if (infolen > 0)
1654                 ret = cli_magic_scan_nested_fmap_type(map, *offset, infolen, ctx, CL_TYPE_ANY, NULL);
1655             break;
1656         case 3: /* Hypertext/Hyperlink Information */
1657             hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Hypertext/Hyperlink Information\n", infoloc);
1658             if (infolen % 617) {
1659                 cli_errmsg("HWP3.x: Information Block[%llu]: Invalid multiple of 617 => %u\n", infoloc, infolen);
1660                 return CL_EFORMAT;
1661             }
1662 
1663             count = (infolen / 617);
1664             hwp3_debug("HWP3.x: Information Block[%llu]: COUNT: %d entries\n", infoloc, count);
1665 #if HAVE_JSON
1666             if (SCAN_COLLECT_METADATA) {
1667                 cli_jsonstr(entry, "Type", "Hypertext/Hyperlink Information");
1668                 cli_jsonint(entry, "Count", count);
1669             }
1670 #endif
1671 
1672             for (i = 0; i < count; i++) {
1673 #if HWP3_DEBUG /* additional fields can be added */
1674                 memset(field, 0, HWP3_FIELD_LENGTH);
1675                 if (fmap_readn(map, field, *offset, 256) != 256) {
1676                     cli_errmsg("HWP3.x: Failed to read information block field @ %zu\n", *offset);
1677                     return CL_EREAD;
1678                 }
1679                 hwp3_debug("HWP3.x: Information Block[%llu]: %d: NAME: %s\n", infoloc, i, field);
1680 #endif
1681                 /* scanning macros - TODO - check numbers */
1682                 ret = cli_magic_scan_nested_fmap_type(map, *offset + (617 * i) + 288, 325, ctx, CL_TYPE_ANY, NULL);
1683             }
1684             break;
1685         case 4: /* Presentation Information */
1686             hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Presentation Information\n", infoloc);
1687 #if HAVE_JSON
1688             if (SCAN_COLLECT_METADATA)
1689                 cli_jsonstr(entry, "Type", "Presentation Information");
1690 #endif
1691             /* contains nothing of interest to scan */
1692             break;
1693         case 5: /* Booking Information */
1694             /* should never run this as it is short-circuited above */
1695             hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Booking Information\n", infoloc);
1696 #if HAVE_JSON
1697             if (SCAN_COLLECT_METADATA)
1698                 cli_jsonstr(entry, "Type", "Booking Information");
1699 #endif
1700             break;
1701         case 6: /* Background Image Data */
1702             hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Background Image Data\n", infoloc);
1703 #if HAVE_JSON
1704             if (SCAN_COLLECT_METADATA) {
1705                 cli_jsonstr(entry, "Type", "Background Image Data");
1706                 cli_jsonint(entry, "ImageSize", infolen - 324);
1707             }
1708 #endif
1709 #if HWP3_DEBUG /* additional fields can be added */
1710             memset(field, 0, HWP3_FIELD_LENGTH);
1711             if (fmap_readn(map, field, *offset + 24, 256) != 256) {
1712                 cli_errmsg("HWP3.x: Failed to read information block field @ %zu\n", *offset);
1713                 return CL_EREAD;
1714             }
1715             hwp3_debug("HWP3.x: Information Block[%llu]: NAME: %s\n", infoloc, field);
1716 #endif
1717             /* 324 bytes for extra data fields */
1718             if (infolen > 0)
1719                 ret = cli_magic_scan_nested_fmap_type(map, *offset + 324, infolen - 324, ctx, CL_TYPE_ANY, NULL);
1720             break;
1721         case 0x100: /* Table Extension */
1722             hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Table Extension\n", infoloc);
1723 #if HAVE_JSON
1724             if (SCAN_COLLECT_METADATA)
1725                 cli_jsonstr(entry, "Type", "Table Extension");
1726 #endif
1727             /* contains nothing of interest to scan */
1728             break;
1729         case 0x101: /* Press Frame Information Field Name */
1730             hwp3_debug("HWP3.x: Information Block[%llu]: TYPE: Press Frame Information Field Name\n", infoloc);
1731 #if HAVE_JSON
1732             if (SCAN_COLLECT_METADATA)
1733                 cli_jsonstr(entry, "Type", "Press Frame Information Field Name");
1734 #endif
1735             /* contains nothing of interest to scan */
1736             break;
1737         default:
1738             cli_warnmsg("HWP3.x: Information Block[%llu]: TYPE: UNKNOWN(%u)\n", infoloc, infoid);
1739             if (infolen > 0)
1740                 ret = cli_magic_scan_nested_fmap_type(map, *offset, infolen, ctx, CL_TYPE_ANY, NULL);
1741     }
1742 
1743     *offset += infolen;
1744     return ret;
1745 }
1746 
hwp3_cb(void * cbdata,int fd,const char * filepath,cli_ctx * ctx)1747 static cl_error_t hwp3_cb(void *cbdata, int fd, const char *filepath, cli_ctx *ctx)
1748 {
1749     cl_error_t ret = CL_SUCCESS;
1750     fmap_t *map, *dmap;
1751     size_t offset, start, new_offset;
1752     int i, p = 0, last = 0;
1753     uint16_t nstyles;
1754 #if HAVE_JSON
1755     json_object *fonts;
1756 #endif
1757 
1758     UNUSEDPARAM(filepath);
1759 
1760     offset = start = cbdata ? *(size_t *)cbdata : 0;
1761 
1762     if (offset == 0) {
1763         if (fd < 0) {
1764             cli_errmsg("HWP3.x: Invalid file descriptor argument\n");
1765             return CL_ENULLARG;
1766         } else {
1767             STATBUF statbuf;
1768 
1769             if (FSTAT(fd, &statbuf) == -1) {
1770                 cli_errmsg("HWP3.x: Can't stat file descriptor\n");
1771                 return CL_ESTAT;
1772             }
1773 
1774             map = dmap = fmap(fd, 0, statbuf.st_size, NULL);
1775             if (!map) {
1776                 cli_errmsg("HWP3.x: Failed to get fmap for uncompressed stream\n");
1777                 return CL_EMAP;
1778             }
1779         }
1780     } else {
1781         hwp3_debug("HWP3.x: Document Content Stream starts @ offset %zu\n", offset);
1782 
1783         map  = ctx->fmap;
1784         dmap = NULL;
1785     }
1786 
1787     /* Fonts - 7 entries of 2 + (n x 40) bytes where n is the first 2 bytes of the entry */
1788 #if HAVE_JSON
1789     if (SCAN_COLLECT_METADATA)
1790         fonts = cli_jsonarray(ctx->wrkproperty, "FontCounts");
1791 #endif
1792     for (i = 0; i < 7; i++) {
1793         uint16_t nfonts;
1794 
1795         if (fmap_readn(map, &nfonts, offset, sizeof(nfonts)) != sizeof(nfonts)) {
1796             if (dmap)
1797                 funmap(dmap);
1798             return CL_EREAD;
1799         }
1800         nfonts = le16_to_host(nfonts);
1801 
1802 #if HAVE_JSON
1803         if (SCAN_COLLECT_METADATA)
1804             cli_jsonint(fonts, NULL, nfonts);
1805 #endif
1806         hwp3_debug("HWP3.x: Font Entry %d with %u entries @ offset %zu\n", i + 1, nfonts, offset);
1807         new_offset = offset + (2 + nfonts * 40);
1808         if ((new_offset <= offset) || (new_offset >= map->len)) {
1809             cli_errmsg("HWP3.x: Font Entry: number of fonts is too high, invalid. %u\n", nfonts);
1810             if (dmap)
1811                 funmap(dmap);
1812             return CL_EPARSE;
1813         }
1814         offset = new_offset;
1815     }
1816 
1817     /* Styles - 2 + (n x 238) bytes where n is the first 2 bytes of the section */
1818     if (fmap_readn(map, &nstyles, offset, sizeof(nstyles)) != sizeof(nstyles)) {
1819         if (dmap)
1820             funmap(dmap);
1821         return CL_EREAD;
1822     }
1823     nstyles = le16_to_host(nstyles);
1824 
1825 #if HAVE_JSON
1826     if (SCAN_COLLECT_METADATA)
1827         cli_jsonint(ctx->wrkproperty, "StyleCount", nstyles);
1828 #endif
1829     hwp3_debug("HWP3.x: %u Styles @ offset %zu\n", nstyles, offset);
1830     new_offset = offset + (2 + nstyles * 238);
1831     if ((new_offset <= offset) || (new_offset >= map->len)) {
1832         cli_errmsg("HWP3.x: Font Entry: number of font styles is too high, invalid. %u\n", nstyles);
1833         if (dmap)
1834             funmap(dmap);
1835         return CL_EPARSE;
1836     }
1837     offset += (2 + nstyles * 238);
1838 
1839     last = 0;
1840     /* Paragraphs - variable */
1841     /* Paragraphs - are terminated with 0x0d00[13(CR) as hchar], empty paragraph marks end of section and do NOT end with 0x0d00 */
1842     while (!last && ((ret = parsehwp3_paragraph(ctx, map, p++, 0, &offset, &last)) == CL_SUCCESS)) continue;
1843     /* return is never a virus */
1844     if (ret != CL_SUCCESS) {
1845         if (dmap)
1846             funmap(dmap);
1847         return ret;
1848     }
1849 #if HAVE_JSON
1850     if (SCAN_COLLECT_METADATA)
1851         cli_jsonint(ctx->wrkproperty, "ParagraphCount", p);
1852 #endif
1853 
1854     last = 0;
1855     /* 'additional information block #1's - attachments and media */
1856     while (!last && ((ret = parsehwp3_infoblk_1(ctx, map, &offset, &last)) == CL_SUCCESS)) continue;
1857 
1858     /* scan the uncompressed stream - both compressed and uncompressed cases [ALLMATCH] */
1859     if ((ret == CL_SUCCESS) || ((SCAN_ALLMATCHES) && (ret == CL_VIRUS))) {
1860         cl_error_t subret = ret;
1861         size_t dlen       = offset - start;
1862 
1863         ret = cli_magic_scan_nested_fmap_type(map, start, dlen, ctx, CL_TYPE_ANY, NULL);
1864         //ret = cli_magic_scan_nested_fmap_type(map, 0, 0, ctx, CL_TYPE_ANY);
1865 
1866         if (ret == CL_SUCCESS)
1867             ret = subret;
1868     }
1869 
1870     if (dmap)
1871         funmap(dmap);
1872     return ret;
1873 }
1874 
cli_scanhwp3(cli_ctx * ctx)1875 cl_error_t cli_scanhwp3(cli_ctx *ctx)
1876 {
1877     cl_error_t ret = CL_SUCCESS;
1878 
1879     struct hwp3_docinfo docinfo;
1880     size_t offset = 0, new_offset = 0;
1881     fmap_t *map = ctx->fmap;
1882 
1883 #if HAVE_JSON
1884     /*
1885     // version
1886     cli_jsonint(header, "RawVersion", hwp5->version);
1887     */
1888 #endif
1889     offset += HWP3_IDENTITY_INFO_SIZE;
1890 
1891     if ((ret = parsehwp3_docinfo(ctx, offset, &docinfo)) != CL_SUCCESS)
1892         return ret;
1893 
1894     offset += HWP3_DOCINFO_SIZE;
1895 
1896     if ((ret = parsehwp3_docsummary(ctx, offset)) != CL_SUCCESS)
1897         return ret;
1898 
1899     offset += HWP3_DOCSUMMARY_SIZE;
1900 
1901     /* password-protected document - cannot parse */
1902     if (docinfo.di_passwd) {
1903         cli_dbgmsg("HWP3.x: password-protected file, skip parsing\n");
1904         return CL_SUCCESS;
1905     }
1906 
1907     if (docinfo.di_infoblksize) {
1908         /* OPTIONAL TODO: HANDLE OPTIONAL INFORMATION BLOCK #0's FOR PRECLASS */
1909         new_offset = offset + docinfo.di_infoblksize;
1910         if ((new_offset <= offset) || (new_offset >= map->len)) {
1911             cli_errmsg("HWP3.x: Doc info block size is too high, invalid. %u\n", docinfo.di_infoblksize);
1912             return CL_EPARSE;
1913         }
1914         offset = new_offset;
1915     }
1916 
1917     if (docinfo.di_compressed)
1918         ret = decompress_and_callback(ctx, ctx->fmap, offset, 0, "HWP3.x", hwp3_cb, NULL);
1919     else
1920         ret = hwp3_cb(&offset, 0, ctx->sub_filepath, ctx);
1921 
1922     if (ret != CL_SUCCESS)
1923         return ret;
1924 
1925     /* OPTIONAL TODO: HANDLE OPTIONAL ADDITIONAL INFORMATION BLOCK #2's FOR PRECLASS*/
1926 
1927     return ret;
1928 }
1929 
1930 /*** HWPML (hijacking the msxml parser) ***/
1931 #if HAVE_LIBXML2
/*
 * Element table handed to cli_msxml_parse_document() by cli_scanhwpml().
 *
 * Each entry is { element name, display name, MSXML_* flags }.
 * NOTE(review): the exact meaning of the three fields and of the individual
 * flags is defined by struct key_entry / the msxml parser, not here - the
 * reading that the first string is the XML element name as matched by the
 * parser and the second is the name used in the JSON output should be
 * confirmed against msxml_parser.h.
 */
static const struct key_entry hwpml_keys[] = {
    {"hwpml", "HWPML", MSXML_JSON_ROOT | MSXML_JSON_ATTRIB},

    /* HEAD - Document Properties */
    //{ "head",               "Head",               MSXML_JSON_WRKPTR },
    {"docsummary", "DocumentProperties", MSXML_JSON_WRKPTR},
    {"title", "Title", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"author", "Author", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"date", "Date", MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"docsetting", "DocumentSettings", MSXML_JSON_WRKPTR},
    {"beginnumber", "BeginNumber", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB},
    {"caretpos", "CaretPos", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB},
    //{ "bindatalist",        "BinDataList",        MSXML_JSON_WRKPTR },
    //{ "binitem",            "BinItem",            MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB },
    {"facenamelist", "FaceNameList", MSXML_IGNORE_ELEM},            /* fonts list */
    {"borderfilllist", "BorderFillList", MSXML_IGNORE_ELEM},        /* borders list */
    {"charshapelist", "CharShapeList", MSXML_IGNORE_ELEM},          /* character shapes */
    {"tabdeflist", "TableDefList", MSXML_IGNORE_ELEM},              /* table defs */
    {"numberinglist", "NumberingList", MSXML_IGNORE_ELEM},          /* numbering list */
    {"parashapelist", "ParagraphShapeList", MSXML_IGNORE_ELEM},     /* paragraph shapes */
    {"stylelist", "StyleList", MSXML_IGNORE_ELEM},                  /* styles */
    {"compatibledocument", "WordCompatibility", MSXML_IGNORE_ELEM}, /* word compatibility data */

    /* BODY - Document Contents */
    {"body", "Body", MSXML_IGNORE_ELEM}, /* document contents (we could build a document contents summary) */

    /* TAIL - Document Attachments */
    /* NOTE(review): MSXML_SCAN_CB entries presumably route the element's data to mxctx.scan_cb (hwpml_binary_cb) - confirm */
    //{ "tail",               "Tail",               MSXML_JSON_WRKPTR },
    {"bindatastorage", "BinaryDataStorage", MSXML_JSON_WRKPTR},
    {"bindata", "BinaryData", MSXML_SCAN_CB | MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB},
    {"scriptcode", "ScriptCodeStorage", MSXML_JSON_WRKPTR | MSXML_JSON_ATTRIB},
    {"scriptheader", "ScriptHeader", MSXML_SCAN_CB | MSXML_JSON_WRKPTR | MSXML_JSON_VALUE},
    {"scriptsource", "ScriptSource", MSXML_SCAN_CB | MSXML_JSON_WRKPTR | MSXML_JSON_VALUE}};
/* number of entries in hwpml_keys, passed alongside the table to the parser */
static size_t num_hwpml_keys = sizeof(hwpml_keys) / sizeof(struct key_entry);
1966 
/* binary streams need to be base64-decoded and then decompressed if the corresponding fields are set */
hwpml_scan_cb(void * cbdata,int fd,const char * filepath,cli_ctx * ctx)1968 static cl_error_t hwpml_scan_cb(void *cbdata, int fd, const char *filepath, cli_ctx *ctx)
1969 {
1970     UNUSEDPARAM(cbdata);
1971 
1972     if (fd < 0 || !ctx)
1973         return CL_ENULLARG;
1974 
1975     return cli_magic_scan_desc(fd, filepath, ctx, NULL);
1976 }
1977 
hwpml_binary_cb(int fd,const char * filepath,cli_ctx * ctx,int num_attribs,struct attrib_entry * attribs,void * cbdata)1978 static cl_error_t hwpml_binary_cb(int fd, const char *filepath, cli_ctx *ctx, int num_attribs, struct attrib_entry *attribs, void *cbdata)
1979 {
1980     cl_error_t ret;
1981 
1982     int i, df = 0, com = 0, enc = 0;
1983     char *tempfile;
1984 
1985     UNUSEDPARAM(cbdata);
1986 
1987     /* check attributes for compression and encoding */
1988     for (i = 0; i < num_attribs; i++) {
1989         if (!strcmp(attribs[i].key, "Compress")) {
1990             if (!strcmp(attribs[i].value, "true"))
1991                 com = 1;
1992             else if (!strcmp(attribs[i].value, "false"))
1993                 com = 0;
1994             else
1995                 com = -1;
1996         }
1997 
1998         if (!strcmp(attribs[i].key, "Encoding")) {
1999             if (!strcmp(attribs[i].value, "Base64"))
2000                 enc = 1;
2001             else
2002                 enc = -1;
2003         }
2004     }
2005 
2006     hwpml_debug("HWPML: Checking attributes: com: %d, enc: %d\n", com, enc);
2007 
2008     /* decode the binary data if needed - base64 */
2009     if (enc < 0) {
2010         cli_errmsg("HWPML: Unrecognized encoding method\n");
2011         return cli_magic_scan_desc(fd, filepath, ctx, NULL);
2012     } else if (enc == 1) {
2013         STATBUF statbuf;
2014         fmap_t *input;
2015         const char *instream;
2016         char *decoded;
2017         size_t decodedlen;
2018 
2019         hwpml_debug("HWPML: Decoding base64-encoded binary data\n");
2020 
2021         /* fmap the input file for easier manipulation */
2022         if (FSTAT(fd, &statbuf) == -1) {
2023             cli_errmsg("HWPML: Can't stat file descriptor\n");
2024             return CL_ESTAT;
2025         }
2026 
2027         if (!(input = fmap(fd, 0, statbuf.st_size, NULL))) {
2028             cli_errmsg("HWPML: Failed to get fmap for binary data\n");
2029             return CL_EMAP;
2030         }
2031 
2032         /* send data for base64 conversion - TODO: what happens with really big files? */
2033         if (!(instream = fmap_need_off_once(input, 0, input->len))) {
2034             cli_errmsg("HWPML: Failed to get input stream from binary data\n");
2035             funmap(input);
2036             return CL_EMAP;
2037         }
2038 
2039         decoded = (char *)cl_base64_decode((char *)instream, input->len, NULL, &decodedlen, 0);
2040         funmap(input);
2041         if (!decoded) {
2042             cli_errmsg("HWPML: Failed to get base64 decode binary data\n");
2043             return cli_magic_scan_desc(fd, filepath, ctx, NULL);
2044         }
2045 
2046         /* open file for writing and scanning */
2047         if ((ret = cli_gentempfd(ctx->sub_tmpdir, &tempfile, &df)) != CL_SUCCESS) {
2048             cli_warnmsg("HWPML: Failed to create temporary file for decoded stream scanning\n");
2049             return ret;
2050         }
2051 
2052         if (cli_writen(df, decoded, decodedlen) != decodedlen) {
2053             free(decoded);
2054             ret = CL_EWRITE;
2055             goto hwpml_end;
2056         }
2057         free(decoded);
2058 
2059         /* keeps the later logic simpler */
2060         fd = df;
2061 
2062         cli_dbgmsg("HWPML: Decoded binary data to %s\n", tempfile);
2063     }
2064 
2065     /* decompress the file if needed - zlib */
2066     if (com) {
2067         STATBUF statbuf;
2068         fmap_t *input;
2069 
2070         hwpml_debug("HWPML: Decompressing binary data\n");
2071 
2072         /* fmap the input file for easier manipulation */
2073         if (FSTAT(fd, &statbuf) == -1) {
2074             cli_errmsg("HWPML: Can't stat file descriptor\n");
2075             ret = CL_ESTAT;
2076             goto hwpml_end;
2077         }
2078 
2079         input = fmap(fd, 0, statbuf.st_size, NULL);
2080         if (!input) {
2081             cli_errmsg("HWPML: Failed to get fmap for binary data\n");
2082             ret = CL_EMAP;
2083             goto hwpml_end;
2084         }
2085         ret = decompress_and_callback(ctx, input, 0, 0, "HWPML", hwpml_scan_cb, NULL);
2086         funmap(input);
2087     } else {
2088         if (fd == df) { /* fd is a decoded tempfile */
2089             ret = hwpml_scan_cb(NULL, fd, tempfile, ctx);
2090         } else { /* fd is the original filepath, no decoding necessary */
2091             ret = hwpml_scan_cb(NULL, fd, filepath, ctx);
2092         }
2093     }
2094 
2095     /* close decoded file descriptor if used */
2096 hwpml_end:
2097     if (df) {
2098         close(df);
2099         if (!(ctx->engine->keeptmp))
2100             cli_unlink(tempfile);
2101         free(tempfile);
2102     }
2103     return ret;
2104 }
2105 #endif /* HAVE_LIBXML2 */
2106 
cli_scanhwpml(cli_ctx * ctx)2107 cl_error_t cli_scanhwpml(cli_ctx *ctx)
2108 {
2109     cl_error_t ret = CL_SUCCESS;
2110 
2111 #if HAVE_LIBXML2
2112     struct msxml_cbdata cbdata;
2113     struct msxml_ctx mxctx;
2114     xmlTextReaderPtr reader = NULL;
2115 
2116     cli_dbgmsg("in cli_scanhwpml()\n");
2117 
2118     if (!ctx)
2119         return CL_ENULLARG;
2120 
2121     memset(&cbdata, 0, sizeof(cbdata));
2122     cbdata.map = ctx->fmap;
2123 
2124     reader = xmlReaderForIO(msxml_read_cb, NULL, &cbdata, "hwpml.xml", NULL, CLAMAV_MIN_XMLREADER_FLAGS);
2125     if (!reader) {
2126         cli_dbgmsg("cli_scanhwpml: cannot initialize xmlReader\n");
2127 
2128 #if HAVE_JSON
2129         ret = cli_json_parse_error(ctx->wrkproperty, "HWPML_ERROR_XML_READER_IO");
2130 #endif
2131         return ret; // libxml2 failed!
2132     }
2133 
2134     memset(&mxctx, 0, sizeof(mxctx));
2135     mxctx.scan_cb = hwpml_binary_cb;
2136     ret           = cli_msxml_parse_document(ctx, reader, hwpml_keys, num_hwpml_keys, MSXML_FLAG_JSON, &mxctx);
2137 
2138     xmlTextReaderClose(reader);
2139     xmlFreeTextReader(reader);
2140 #else
2141     UNUSEDPARAM(ctx);
2142     cli_dbgmsg("in cli_scanhwpml()\n");
2143     cli_dbgmsg("cli_scanhwpml: scanning hwpml documents requires libxml2!\n");
2144 #endif
2145 
2146     return ret;
2147 }
2148