1 /* $OpenBSD: preconv.c,v 1.9 2018/12/13 11:55:14 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #include <sys/types.h> 19 20 #include <assert.h> 21 #include <stdio.h> 22 #include <string.h> 23 24 #include "mandoc.h" 25 #include "roff.h" 26 #include "mandoc_parse.h" 27 #include "libmandoc.h" 28 29 int 30 preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, 31 int *filenc) 32 { 33 const unsigned char *cu; 34 int nby; 35 unsigned int accum; 36 37 cu = (const unsigned char *)ib->buf + *ii; 38 assert(*cu & 0x80); 39 40 if ( ! (*filenc & MPARSE_UTF8)) 41 goto latin; 42 43 nby = 1; 44 while (nby < 5 && *cu & (1 << (7 - nby))) 45 nby++; 46 47 switch (nby) { 48 case 2: 49 accum = *cu & 0x1f; 50 if (accum < 0x02) /* Obfuscated ASCII. */ 51 goto latin; 52 break; 53 case 3: 54 accum = *cu & 0x0f; 55 break; 56 case 4: 57 accum = *cu & 0x07; 58 if (accum > 0x04) /* Beyond Unicode. */ 59 goto latin; 60 break; 61 default: /* Bad sequence header. */ 62 goto latin; 63 } 64 65 cu++; 66 switch (nby) { 67 case 3: 68 if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */ 69 (accum == 0x0d && *cu & 0x20)) /* Surrogates. */ 70 goto latin; 71 break; 72 case 4: 73 if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */ 74 (accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */ 75 goto latin; 76 break; 77 default: 78 break; 79 } 80 81 while (--nby) { 82 if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */ 83 goto latin; 84 accum <<= 6; 85 accum += *cu & 0x3f; 86 cu++; 87 } 88 89 assert(accum > 0x7f); 90 assert(accum < 0x110000); 91 assert(accum < 0xd800 || accum > 0xdfff); 92 93 *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum); 94 *ii = (const char *)cu - ib->buf; 95 *filenc &= ~MPARSE_LATIN1; 96 return 1; 97 98 latin: 99 if ( ! (*filenc & MPARSE_LATIN1)) 100 return 0; 101 102 *oi += snprintf(ob->buf + *oi, 11, 103 "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); 104 105 *filenc &= ~MPARSE_UTF8; 106 return 1; 107 } 108 109 int 110 preconv_cue(const struct buf *b, size_t offset) 111 { 112 const char *ln, *eoln, *eoph; 113 size_t sz, phsz; 114 115 ln = b->buf + offset; 116 sz = b->sz - offset; 117 118 /* Look for the end-of-line. */ 119 120 if (NULL == (eoln = memchr(ln, '\n', sz))) 121 eoln = ln + sz; 122 123 /* Check if we have the correct header/trailer. */ 124 125 if ((sz = (size_t)(eoln - ln)) < 10 || 126 memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) 127 return MPARSE_UTF8 | MPARSE_LATIN1; 128 129 /* Move after the header and adjust for the trailer. */ 130 131 ln += 7; 132 sz -= 10; 133 134 while (sz > 0) { 135 while (sz > 0 && ' ' == *ln) { 136 ln++; 137 sz--; 138 } 139 if (0 == sz) 140 break; 141 142 /* Find the end-of-phrase marker (or eoln). */ 143 144 if (NULL == (eoph = memchr(ln, ';', sz))) 145 eoph = eoln - 3; 146 else 147 eoph++; 148 149 /* Only account for the "coding" phrase. */ 150 151 if ((phsz = eoph - ln) < 7 || 152 strncasecmp(ln, "coding:", 7)) { 153 sz -= phsz; 154 ln += phsz; 155 continue; 156 } 157 158 sz -= 7; 159 ln += 7; 160 161 while (sz > 0 && ' ' == *ln) { 162 ln++; 163 sz--; 164 } 165 if (0 == sz) 166 return 0; 167 168 /* Check us against known encodings. */ 169 170 if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) 171 return MPARSE_UTF8; 172 if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) 173 return MPARSE_LATIN1; 174 return 0; 175 } 176 return MPARSE_UTF8 | MPARSE_LATIN1; 177 } 178