1 /* $Id: chars.c,v 1.46 2011/05/24 21:31:23 kristaps Exp $ */ 2 /* 3 * Copyright (c) 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv> 4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 #ifdef HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include <assert.h> 23 #include <ctype.h> 24 #include <stdio.h> 25 #include <stdlib.h> 26 #include <string.h> 27 28 #include "mandoc.h" 29 #include "libmandoc.h" 30 31 #define PRINT_HI 126 32 #define PRINT_LO 32 33 34 struct ln { 35 struct ln *next; 36 const char *code; 37 const char *ascii; 38 int unicode; 39 }; 40 41 #define LINES_MAX 325 42 43 #define CHAR(in, ch, code) \ 44 { NULL, (in), (ch), (code) }, 45 46 #define CHAR_TBL_START static struct ln lines[LINES_MAX] = { 47 #define CHAR_TBL_END }; 48 49 #include "chars.in" 50 51 struct mchars { 52 struct ln **htab; 53 }; 54 55 static inline int match(const struct ln *, const char *, size_t); 56 static const struct ln *find(struct mchars *, const char *, size_t); 57 58 void 59 mchars_free(struct mchars *arg) 60 { 61 62 free(arg->htab); 63 free(arg); 64 } 65 66 struct mchars * 67 mchars_alloc(void) 68 { 69 struct mchars *tab; 70 struct ln **htab; 71 struct ln *pp; 72 int i, hash; 73 74 /* 75 * Constructs a very basic chaining hashtable. The hash routine 76 * is simply the integral value of the first character. 77 * Subsequent entries are chained in the order they're processed 78 * (they're in-line re-ordered during lookup). 79 */ 80 81 tab = mandoc_malloc(sizeof(struct mchars)); 82 htab = mandoc_calloc(PRINT_HI - PRINT_LO + 1, sizeof(struct ln **)); 83 84 for (i = 0; i < LINES_MAX; i++) { 85 hash = (int)lines[i].code[0] - PRINT_LO; 86 87 if (NULL == (pp = htab[hash])) { 88 htab[hash] = &lines[i]; 89 continue; 90 } 91 92 for ( ; pp->next; pp = pp->next) 93 /* Scan ahead. */ ; 94 pp->next = &lines[i]; 95 } 96 97 tab->htab = htab; 98 return(tab); 99 } 100 101 102 /* 103 * Special character to Unicode codepoint. 104 */ 105 int 106 mchars_spec2cp(struct mchars *arg, const char *p, size_t sz) 107 { 108 const struct ln *ln; 109 110 ln = find(arg, p, sz); 111 if (NULL == ln) 112 return(-1); 113 return(ln->unicode); 114 } 115 116 /* 117 * Numbered character string to ASCII codepoint. 118 * This can only be a printable character (i.e., alnum, punct, space) so 119 * prevent the character from ruining our state (backspace, newline, and 120 * so on). 121 * If the character is illegal, returns '\0'. 122 */ 123 char 124 mchars_num2char(const char *p, size_t sz) 125 { 126 int i; 127 128 if ((i = mandoc_strntou(p, sz, 10)) < 0) 129 return('\0'); 130 return(isprint(i) ? i : '\0'); 131 } 132 133 /* 134 * Hex character string to Unicode codepoint. 135 * If the character is illegal, returns '\0'. 136 */ 137 int 138 mchars_num2uc(const char *p, size_t sz) 139 { 140 int i; 141 142 if ((i = mandoc_strntou(p, sz, 16)) < 0) 143 return('\0'); 144 /* FIXME: make sure we're not in a bogus range. */ 145 return(i > 0x80 && i <= 0x10FFFF ? i : '\0'); 146 } 147 148 /* 149 * Special character to string array. 150 */ 151 const char * 152 mchars_spec2str(struct mchars *arg, const char *p, size_t sz, size_t *rsz) 153 { 154 const struct ln *ln; 155 156 ln = find(arg, p, sz); 157 if (NULL == ln) 158 return(NULL); 159 160 *rsz = strlen(ln->ascii); 161 return(ln->ascii); 162 } 163 164 static const struct ln * 165 find(struct mchars *tab, const char *p, size_t sz) 166 { 167 struct ln *pp, *prev; 168 struct ln **htab; 169 int hash; 170 171 assert(p); 172 if (0 == sz) 173 return(NULL); 174 175 if (p[0] < PRINT_LO || p[0] > PRINT_HI) 176 return(NULL); 177 178 /* 179 * Lookup the symbol in the symbol hash. See ascii2htab for the 180 * hashtable specs. This dynamically re-orders the hash chain 181 * to optimise for repeat hits. 182 */ 183 184 hash = (int)p[0] - PRINT_LO; 185 htab = tab->htab; 186 187 if (NULL == (pp = htab[hash])) 188 return(NULL); 189 190 for (prev = NULL; pp; pp = pp->next) { 191 if ( ! match(pp, p, sz)) { 192 prev = pp; 193 continue; 194 } 195 196 if (prev) { 197 prev->next = pp->next; 198 pp->next = htab[hash]; 199 htab[hash] = pp; 200 } 201 202 return(pp); 203 } 204 205 return(NULL); 206 } 207 208 static inline int 209 match(const struct ln *ln, const char *p, size_t sz) 210 { 211 212 if (strncmp(ln->code, p, sz)) 213 return(0); 214 return('\0' == ln->code[(int)sz]); 215 } 216