1 #include <assert.h>
2 #include <stdio.h>
3 #include <string.h>
4 
5 #include "buffer.h"
6 #include "houdini.h"
7 #include "utf8.h"
8 #include "entities.inc"
9 
/* Binary search lookup code for entities, added by JGM */
11 
S_lookup(int i,int low,int hi,const unsigned char * s,int len)12 static const unsigned char *S_lookup(int i, int low, int hi,
13                                      const unsigned char *s, int len) {
14   int j;
15   int cmp =
16       strncmp((const char *)s, (const char *)cmark_entities[i].entity, len);
17   if (cmp == 0 && cmark_entities[i].entity[len] == 0) {
18     return (const unsigned char *)cmark_entities[i].bytes;
19   } else if (cmp <= 0 && i > low) {
20     j = i - ((i - low) / 2);
21     if (j == i)
22       j -= 1;
23     return S_lookup(j, low, i - 1, s, len);
24   } else if (cmp > 0 && i < hi) {
25     j = i + ((hi - i) / 2);
26     if (j == i)
27       j += 1;
28     return S_lookup(j, i + 1, hi, s, len);
29   } else {
30     return NULL;
31   }
32 }
33 
S_lookup_entity(const unsigned char * s,int len)34 static const unsigned char *S_lookup_entity(const unsigned char *s, int len) {
35   return S_lookup(CMARK_NUM_ENTITIES / 2, 0, CMARK_NUM_ENTITIES - 1, s, len);
36 }
37 
/*
 * Try to decode one character reference at the start of `src` and append
 * its expansion to `ob`.  The caller in this file passes `src` pointing
 * just past the '&'.
 *
 * Numeric form: '#' followed by decimal digits, or '#x' / '#X' followed by
 * hex digits, then ';'.  Between 1 and 8 digits are accepted.  A value of
 * 0, a value in [0xD800, 0xE000), or a value >= 0x110000 is emitted as
 * U+FFFD instead.
 *
 * Named form: a name terminated by ';' that S_lookup_entity() recognises.
 *
 * Returns the number of bytes of `src` consumed (including the ';'), or 0
 * when nothing was recognised, in which case `ob` is left untouched.
 */
bufsize_t houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src,
                               bufsize_t size) {
  bufsize_t pos = 0;
  bufsize_t limit;

  if (size >= 3 && src[0] == '#') {
    int value = 0;
    int digit_count = 0;

    if (_isdigit(src[1])) {
      pos = 1;
      while (pos < size && _isdigit(src[pos])) {
        value = (value * 10) + (src[pos] - '0');
        /* Clamp so that further digits cannot overflow the int. */
        if (value >= 0x110000) {
          value = 0x110000;
        }
        ++pos;
      }
      digit_count = pos - 1;
    } else if (src[1] == 'x' || src[1] == 'X') {
      pos = 2;
      while (pos < size && _isxdigit(src[pos])) {
        /* Folds the digit to lower case and maps 0-9 / a-f to 0..15. */
        value = (value * 16) + ((src[pos] | 32) % 39 - 9);
        /* Clamp so that further digits cannot overflow the int. */
        if (value >= 0x110000) {
          value = 0x110000;
        }
        ++pos;
      }
      digit_count = pos - 2;
    }

    /* Need 1..8 digits immediately followed by ';'. */
    if (digit_count < 1 || digit_count > 8 || pos >= size ||
        src[pos] != ';') {
      return 0;
    }

    if (value == 0 || (value >= 0xD800 && value < 0xE000) ||
        value >= 0x110000) {
      value = 0xFFFD;
    }
    cmark_utf8proc_encode_char(value, ob);
    return pos + 1;
  }

  /* Named reference: never scan further than the maximum entity length. */
  limit = size;
  if (limit > CMARK_ENTITY_MAX_LENGTH) {
    limit = CMARK_ENTITY_MAX_LENGTH;
  }

  for (pos = CMARK_ENTITY_MIN_LENGTH; pos < limit; ++pos) {
    if (src[pos] == ' ') {
      return 0; /* a space ends the candidate name */
    }

    if (src[pos] == ';') {
      const unsigned char *expansion = S_lookup_entity(src, pos);

      if (expansion == NULL) {
        return 0; /* unknown name */
      }

      cmark_strbuf_puts(ob, (const char *)expansion);
      return pos + 1;
    }
  }

  return 0;
}
107 
/*
 * Copy `size` bytes of `src` into `ob`, replacing every character reference
 * that houdini_unescape_ent() recognises with its expansion.  An '&' that
 * does not start a recognised reference is copied through unchanged.
 *
 * Returns 0, with nothing written to `ob`, when the non-empty input
 * contains no '&' at all; returns 1 otherwise.
 */
int houdini_unescape_html(cmark_strbuf *ob, const uint8_t *src,
                          bufsize_t size) {
  bufsize_t pos = 0;

  while (pos < size) {
    const bufsize_t run_start = pos;
    bufsize_t consumed;

    /* Advance over the literal run up to the next '&' or end of input. */
    for (; pos < size; pos++) {
      if (src[pos] == '&') {
        break;
      }
    }

    if (likely(pos > run_start)) {
      if (unlikely(run_start == 0)) {
        /* The very first run reached the end: there is nothing to unescape. */
        if (pos >= size) {
          return 0;
        }

        /* First write: reserve space in the output buffer up front. */
        cmark_strbuf_grow(ob, HOUDINI_UNESCAPED_SIZE(size));
      }

      cmark_strbuf_put(ob, src + run_start, pos - run_start);
    }

    if (pos >= size) {
      break;
    }

    /* Step over the '&' and try to decode what follows it. */
    pos++;
    consumed = houdini_unescape_ent(ob, src + pos, size - pos);
    pos += consumed;

    /* Not a real reference: keep the '&' as a literal character. */
    if (consumed == 0) {
      cmark_strbuf_putc(ob, '&');
    }
  }

  return 1;
}
144 
/*
 * Unescape `src` into `ob`.  When houdini_unescape_html() returns 0
 * (it wrote nothing), the input is appended to `ob` verbatim instead.
 */
void houdini_unescape_html_f(cmark_strbuf *ob, const uint8_t *src,
                             bufsize_t size) {
  const int handled = houdini_unescape_html(ob, src, size);

  if (handled == 0) {
    cmark_strbuf_put(ob, src, size);
  }
}
150