1 #include <stdio.h>
2 #include <ctype.h>
3 #include <string.h>
4 #include <stdlib.h>
5 #include "scan.h"
6 #include <assert.h>
7
8 #include "scan/scan_ulong.c"
9 #include "scan/scan_ulongn.c"
10 #include "fmt/fmt_utf8.c"
11 #include "fmt/fmt_tohex.c"
12 #include "fmt/fmt_escapecharc.c"
13
/* Scratch state used by the parsing loop in main(). */
char tmp[20];     /* receives the UTF-8 encoding of one code point (fmt_utf8 output) */
char tmp2[20];    /* only referenced by the disabled (#if 0) escaped-output code */
size_t n;         /* number of bytes fmt_utf8 wrote into tmp */
size_t m;         /* number of input characters scan_ulong consumed */
unsigned long l;  /* code point value parsed by scan_ulong */
18
/*
 * One record per entity read from the input file.
 */
struct entity {
  const char* entity;    /* entity name as stored by main(): no leading '&', no trailing ';' */
  char utf8[10];         /* NUL-terminated UTF-8 text the entity expands to */
  struct entity* next;   /* link to the next record; main() initializes it to 0 */
};

/* Head of the record list; empty (null) at program start. */
struct entity* root;

/* Location that main() stores each freshly allocated record through;
 * initially the address of root. */
struct entity** cur=&root;
24
struct letters;   /* trie node, defined below */

/*
 * One slot of a trie node.  A slot at index i is considered occupied when
 * its `c` equals the byte value i.
 */
struct letter {
  char c;                   /* byte this slot stands for */
  struct letters* weiter;   /* "onward": child node; for the end-of-word slot (byte 0)
                               it holds the payload pointer given to addword() instead */
  uint32_t marshaled;       /* lower 8 bits: char. rest: offset from start of marshaled blob */
};

/*
 * A trie node: one slot per possible byte value.
 */
struct letters {
  size_t n;                  /* number of occupied slots in liste */
  struct letter liste[256];  /* slots, indexed by byte value */
};
35
/* Root of the trie that addword() builds; null until the first word is added. */
struct letters* d;

/* Running totals maintained by addword(): */
size_t nodes;      /* number of occupied trie slots created so far */
size_t datasize;   /* bytes needed for all payload strings, terminators included */
38
addword(struct letters ** s,const char * t,void * pointer)39 void addword(struct letters** s,const char* t, void* pointer) {
40 size_t i;
41 if (!*s) {
42 *s=malloc(sizeof(**s));
43 memset(*s,0,sizeof(**s));
44 (*s)->liste[0].c='?';
45 }
46 i=(unsigned char)*t;
47 if ((*s)->liste[i].c==*t) {
48 if (!*t) {
49 datasize+=strlen((char*)pointer)+1;
50 (*s)->liste[i].weiter=pointer;
51 } else
52 addword(&(*s)->liste[i].weiter,t+1,pointer);
53 return;
54 }
55
56 ++nodes;
57 (*s)->n++;
58 (*s)->liste[i].c=*t;
59 if (!*t) {
60 datasize+=strlen((char*)pointer)+1;
61 (*s)->liste[i].weiter=pointer;
62 } else {
63 (*s)->liste[i].weiter=0;
64 addword(&(*s)->liste[i].weiter,t+1,pointer);
65 }
66 }
67
dump(struct letters * s,size_t depth)68 void dump(struct letters* s,size_t depth) {
69 size_t i,j;
70 if (!s) return;
71 for (i=0; i<256; ++i) {
72 if (s->liste[i].c!=i) continue;
73 for (j=0; j<depth; ++j) printf(" ");
74 printf("'%c' -> {\n",s->liste[i].c);
75 if (s->liste[i].c)
76 dump(s->liste[i].weiter,depth+1);
77 for (j=0; j<depth; ++j) printf(" ");
78 printf("}\n");
79 }
80 }
81
/* Output of marshal(): one allocation holding the node table followed by
 * the payload strings. */
char* heap;            /* start of the allocation */
uint32_t* marshaled;   /* node table view of heap; entry 0 holds the table length */
char* data;            /* payload area, located right after the node table */

/* Fill levels advanced by marshalhelper(). */
size_t used;           /* node table entries handed out so far */
size_t useddata;       /* payload bytes copied into data so far */
87
marshalhelper(struct letters * s)88 void marshalhelper(struct letters* s) {
89 size_t i;
90 uint32_t myindex=used;
91 if (!s) return;
92 used+=s->n;
93 assert(used<nodes+2);
94 for (i=1; i!=0; ++i) { // start at 1, go to 256, then access modulo 256; effect: sort but put 0 last
95 uint32_t x;
96 i&=0xff;
97 // printf("%c ",i);
98 if (s->liste[i].c!=i) {
99 if (i==0) return;
100 continue;
101 }
102 // printf("marshalhelper: %c\n",i);
103 x=(unsigned char)s->liste[i].c;
104 if (!x) {
105 size_t l=strlen((char*)s->liste[i].weiter)+1;
106 // puts((char*)s->liste[i].weiter);
107 x|=useddata<<8;
108 assert(useddata+l<=datasize);
109 memcpy(data+useddata,s->liste[i].weiter,l);
110 useddata+=l;
111 marshaled[++myindex]=x;
112 return;
113 } else {
114 x|=(used+1)<<8;
115 marshalhelper(s->liste[i].weiter);
116 }
117 marshaled[++myindex]=x;
118 }
119 // printf("return\n");
120 }
121
marshal(struct letters * s)122 void marshal(struct letters* s) {
123 fprintf(stderr,"nodes=%lu, datasize=%lu\n",nodes,datasize);
124 heap=malloc((nodes+1)*sizeof(uint32_t)+datasize);
125 if (!heap) return;
126 marshaled=(uint32_t*)heap;
127 marshaled[0]=nodes+1;
128 data=heap+(nodes+1)*sizeof(uint32_t);
129 marshalhelper(s);
130 fprintf(stderr,"actually used: %lu nodes, %lu bytes data\n",used,useddata);
131 }
132
/*
 * Look up the NUL-terminated key `t` in a marshaled blob.
 *
 * ds  - start of the blob: an array of uint32_t whose element 0 is the
 *       number of table entries, followed directly by the payload bytes;
 *       must be suitably aligned for uint32_t access
 * ofs - table index at which to start scanning (1 for the root)
 * t   - key to search for
 *
 * Each table entry holds a byte in its low 8 bits and a number in the
 * remaining bits: the index to continue at for a non-zero byte, or the
 * payload offset for the zero (end-of-word) byte.
 *
 * Returns a pointer to the payload inside the blob, or NULL if the scan
 * runs into an end-of-word entry that does not match or leaves the table.
 */
char* lookup(char* ds,size_t ofs,const char* t) {
  const uint32_t* table=(const uint32_t*)ds;
  const size_t limit=table[0];

  while (ofs<limit) {
    const uint32_t entry=table[ofs];
    const unsigned char key=(unsigned char)(entry&0xff);

    if (key==(unsigned char)*t) {
      if (key==0)   /* whole key consumed: payload lives behind the table */
        return ds+limit*sizeof(uint32_t)+(entry>>8);
      /* descend: continue with the next key byte at the referenced index */
      ofs=entry>>8;
      ++t;
      continue;
    }

    if (key==0)     /* end-of-word entry is the last one of a node */
      break;
    ++ofs;
  }
  return NULL;
}
149
main()150 int main() {
151 FILE* f=fopen("entities.json","r");
152 char buf[256];
153 if (!f) return 1;
154 #if 0
155 puts("struct { const char* entity; const char* utf8; } codepoints[] = {");
156 #endif
157 while (fgets(buf,sizeof(buf),f)) {
158 char* s,* entity;
159 size_t ul;
160 if (!isspace(buf[0])) continue;
161 for (s=buf; *s && *s!='"'; ++s) ; // skip whitespace
162 if (!(*s=='"')) continue;
163 ++s;
164 entity=s;
165 if (*entity!='&') continue; ++entity; ++s;
166 for (; *s && *s!='"'; ++s) ; // skip to end of entity
167 if (!(*s=='"')) continue;
168 if (s[-1]!=';') continue;
169 s[-1]=0; ++s;
170 s=strchr(s,'[');
171 if (!s) continue;
172 n=0;
173 #if 0
174 printf(" { \"%s\", \"",entity);
175 #endif
176 ++s;
177 *cur=malloc(sizeof(**cur));
178 (*cur)->next=0;
179 if (!((*cur)->entity=strdup(entity))) return 1;
180 ul=0;
181 do {
182 while (isspace(*s)) ++s;
183 m=scan_ulong(s,&l);
184 if (!m) return 2;
185 s+=n;
186 n=fmt_utf8(tmp,l);
187 if (ul+n>sizeof((*cur)->utf8)) return 3;
188 memcpy((*cur)->utf8+ul,tmp,n);
189 ul+=n;
190 #if 0
191 {
192 size_t i;
193 for (i=0; i<n; ++i) {
194 fwrite(tmp2,fmt_escapecharc(tmp2,(unsigned char)tmp[i]),1,stdout);
195 }
196 }
197 #endif
198 if (*s==']') break;
199 } while (*s==',');
200 (*cur)->utf8[ul]=0;
201 #if 0
202 puts("\" },");
203 #endif
204 addword(&d,(*cur)->entity,(*cur)->utf8);
205 }
206 fclose(f);
207 // dump(d,0);
208 marshal(d);
209 {
210 FILE* f=fopen("entities.h","w");
211 size_t i;
212 fprintf(f,"struct {\n uint32_t tab[%u];\n char data[%lu];\n} entities = {\n {",marshaled[0],datasize);
213 for (i=0; i<marshaled[0]; ++i) {
214 if (i%8 == 0) fprintf(f,"\n ");
215 fprintf(f,"0x%x,",marshaled[i]);
216 }
217 fprintf(f,"\n } , {");
218 for (i=0; i<datasize; ++i) {
219 if (i%16 == 0) fprintf(f,"\n ");
220 fprintf(f,"0x%x,",data[i]&0xff);
221 }
222 fprintf(f,"\n }\n};");
223 fclose(f);
224 }
225 // puts(lookup(heap,1,"zwnj"));
226 #if 0
227 puts("};");
228 #endif
229 return 0;
230 }
231