1 #include <stdio.h>
2 #include <ctype.h>
3 #include <string.h>
4 #include <stdlib.h>
5 #include "scan.h"
6 #include <assert.h>
7 
8 #include "scan/scan_ulong.c"
9 #include "scan/scan_ulongn.c"
10 #include "fmt/fmt_utf8.c"
11 #include "fmt/fmt_tohex.c"
12 #include "fmt/fmt_escapecharc.c"
13 
// Scratch state used by main() while converting code points.
char tmp[20];	// receives the UTF-8 bytes fmt_utf8() writes for one code point
char tmp2[20];	// only referenced by the disabled (#if 0) escaping output in main()
size_t n,m;	// n: length returned by fmt_utf8(); m: length returned by scan_ulong()
unsigned long l;	// value scan_ulong() parsed from the codepoints array
18 
// One parsed entity; main() allocates one of these for every accepted input line.
struct entity {
  const char* entity;	// strdup()ed entity name without the leading '&' and the trailing ';'
  char utf8[10];	// NUL-terminated UTF-8 bytes built from the entity's code point(s)
  struct entity* next;	// main() sets this to 0 when the node is created
}* root,** cur=&root;	// cur starts out pointing at root; main() stores each new node through *cur
24 
// One slot of a trie node.  A slot is considered occupied when c equals the
// slot's own index in struct letters' liste[] array (see addword/dump).
struct letter {
  char c;	// the character this slot stands for
  struct letters* weiter;	// child node; for the '\0' slot addword() stores the payload string pointer here instead
  uint32_t marshaled;	// lower 8 bits: char. rest: offset from start of marshaled blob (not referenced by the functions in this file)
};
30 
// One trie node: a table of slots indexed by the (unsigned char) value of the
// next character of the word.
struct letters {
  size_t n;	// number of occupied slots; incremented by addword() for every new slot
  struct letter liste[256];	// slot 0 is the end-of-word slot; addword() presets its c to '?' so it reads as empty
};
35 
struct letters* d;	// root of the trie that main() fills through addword()
size_t nodes,datasize;	// nodes: occupied slots over all trie nodes; datasize: bytes of payload strings including their NULs (both maintained by addword())
38 
addword(struct letters ** s,const char * t,void * pointer)39 void addword(struct letters** s,const char* t, void* pointer) {
40   size_t i;
41   if (!*s) {
42     *s=malloc(sizeof(**s));
43     memset(*s,0,sizeof(**s));
44     (*s)->liste[0].c='?';
45   }
46   i=(unsigned char)*t;
47   if ((*s)->liste[i].c==*t) {
48     if (!*t) {
49       datasize+=strlen((char*)pointer)+1;
50       (*s)->liste[i].weiter=pointer;
51     } else
52       addword(&(*s)->liste[i].weiter,t+1,pointer);
53     return;
54   }
55 
56   ++nodes;
57   (*s)->n++;
58   (*s)->liste[i].c=*t;
59   if (!*t) {
60     datasize+=strlen((char*)pointer)+1;
61     (*s)->liste[i].weiter=pointer;
62   } else {
63     (*s)->liste[i].weiter=0;
64     addword(&(*s)->liste[i].weiter,t+1,pointer);
65   }
66 }
67 
dump(struct letters * s,size_t depth)68 void dump(struct letters* s,size_t depth) {
69   size_t i,j;
70   if (!s) return;
71   for (i=0; i<256; ++i) {
72     if (s->liste[i].c!=i) continue;
73     for (j=0; j<depth; ++j) printf("  ");
74     printf("'%c' -> {\n",s->liste[i].c);
75     if (s->liste[i].c)
76       dump(s->liste[i].weiter,depth+1);
77     for (j=0; j<depth; ++j) printf("  ");
78     printf("}\n");
79   }
80 }
81 
// State of the flattened ("marshaled") trie built by marshal()/marshalhelper().
size_t used;	// number of table entries reserved so far
size_t useddata;	// number of payload bytes copied into data so far
char* heap;	// the single allocation made by marshal(); holds the table followed by the payload bytes
uint32_t* marshaled;	// heap viewed as the table; entry 0 is set to nodes+1 by marshal()
char* data;	// start of the payload area inside heap, right after the table
87 
marshalhelper(struct letters * s)88 void marshalhelper(struct letters* s) {
89   size_t i;
90   uint32_t myindex=used;
91   if (!s) return;
92   used+=s->n;
93   assert(used<nodes+2);
94   for (i=1; i!=0; ++i) {	// start at 1, go to 256, then access modulo 256; effect: sort but put 0 last
95     uint32_t x;
96     i&=0xff;
97 //    printf("%c ",i);
98     if (s->liste[i].c!=i) {
99       if (i==0) return;
100       continue;
101     }
102 //    printf("marshalhelper: %c\n",i);
103     x=(unsigned char)s->liste[i].c;
104     if (!x) {
105       size_t l=strlen((char*)s->liste[i].weiter)+1;
106 //      puts((char*)s->liste[i].weiter);
107       x|=useddata<<8;
108       assert(useddata+l<=datasize);
109       memcpy(data+useddata,s->liste[i].weiter,l);
110       useddata+=l;
111       marshaled[++myindex]=x;
112       return;
113     } else {
114       x|=(used+1)<<8;
115       marshalhelper(s->liste[i].weiter);
116     }
117     marshaled[++myindex]=x;
118   }
119 //  printf("return\n");
120 }
121 
marshal(struct letters * s)122 void marshal(struct letters* s) {
123   fprintf(stderr,"nodes=%lu, datasize=%lu\n",nodes,datasize);
124   heap=malloc((nodes+1)*sizeof(uint32_t)+datasize);
125   if (!heap) return;
126   marshaled=(uint32_t*)heap;
127   marshaled[0]=nodes+1;
128   data=heap+(nodes+1)*sizeof(uint32_t);
129   marshalhelper(s);
130   fprintf(stderr,"actually used: %lu nodes, %lu bytes data\n",used,useddata);
131 }
132 
/*
 * Look up the NUL-terminated word t in a flattened trie.
 *
 * ds   start of the blob: an array of uint32_t entries whose element 0 is the
 *      entry count, immediately followed by the payload bytes.
 * ofs  index of the first entry of the node to start from.
 * t    word to look up.
 *
 * Each entry holds a character in its low 8 bits; the remaining bits are the
 * index of the child node's first entry or, for character 0, the payload
 * offset.  Returns a pointer to the payload inside ds, or NULL/0 when the
 * word is not found or an index lies beyond the table.
 *
 * NOTE(review): scanning a node only stops at a character-0 entry or at the
 * end of the table -- confirm that every node reachable here ends that way.
 */
char* lookup(char* ds,size_t ofs,const char* t) {
  const uint32_t* table=(const uint32_t*)ds;
  const uint32_t limit=table[0];
  if (ofs>limit) return 0;
  while (ofs<limit) {
    const uint32_t entry=table[ofs];
    const unsigned char key=entry&0xff;
    if (key!=(unsigned char)*t) {
      /* no match: try the next entry unless this was the node's terminator */
      ++ofs;
      if (!key) break;
      continue;
    }
    if (!key)
      return ds+limit*sizeof(uint32_t)+(entry>>8);
    /* descend into the child node and continue with the next character */
    ofs=entry>>8;
    ++t;
    if (ofs>limit) return 0;
  }
  return NULL;
}
149 
main()150 int main() {
151   FILE* f=fopen("entities.json","r");
152   char buf[256];
153   if (!f) return 1;
154 #if 0
155   puts("struct { const char* entity; const char* utf8; } codepoints[] = {");
156 #endif
157   while (fgets(buf,sizeof(buf),f)) {
158     char* s,* entity;
159     size_t ul;
160     if (!isspace(buf[0])) continue;
161     for (s=buf; *s && *s!='"'; ++s) ;	// skip whitespace
162     if (!(*s=='"')) continue;
163     ++s;
164     entity=s;
165     if (*entity!='&') continue; ++entity; ++s;
166     for (; *s && *s!='"'; ++s) ;	// skip to end of entity
167     if (!(*s=='"')) continue;
168     if (s[-1]!=';') continue;
169     s[-1]=0; ++s;
170     s=strchr(s,'[');
171     if (!s) continue;
172     n=0;
173 #if 0
174     printf("  { \"%s\", \"",entity);
175 #endif
176     ++s;
177     *cur=malloc(sizeof(**cur));
178     (*cur)->next=0;
179     if (!((*cur)->entity=strdup(entity))) return 1;
180     ul=0;
181     do {
182       while (isspace(*s)) ++s;
183       m=scan_ulong(s,&l);
184       if (!m) return 2;
185       s+=n;
186       n=fmt_utf8(tmp,l);
187       if (ul+n>sizeof((*cur)->utf8)) return 3;
188       memcpy((*cur)->utf8+ul,tmp,n);
189       ul+=n;
190 #if 0
191       {
192 	size_t i;
193 	for (i=0; i<n; ++i) {
194 	  fwrite(tmp2,fmt_escapecharc(tmp2,(unsigned char)tmp[i]),1,stdout);
195 	}
196       }
197 #endif
198       if (*s==']') break;
199     } while (*s==',');
200     (*cur)->utf8[ul]=0;
201 #if 0
202     puts("\" },");
203 #endif
204     addword(&d,(*cur)->entity,(*cur)->utf8);
205   }
206   fclose(f);
207 //  dump(d,0);
208   marshal(d);
209   {
210     FILE* f=fopen("entities.h","w");
211     size_t i;
212     fprintf(f,"struct {\n  uint32_t tab[%u];\n  char data[%lu];\n} entities = {\n  {",marshaled[0],datasize);
213     for (i=0; i<marshaled[0]; ++i) {
214       if (i%8 == 0) fprintf(f,"\n    ");
215       fprintf(f,"0x%x,",marshaled[i]);
216     }
217     fprintf(f,"\n  } , {");
218     for (i=0; i<datasize; ++i) {
219       if (i%16 == 0) fprintf(f,"\n    ");
220       fprintf(f,"0x%x,",data[i]&0xff);
221     }
222     fprintf(f,"\n  }\n};");
223     fclose(f);
224   }
225 //  puts(lookup(heap,1,"zwnj"));
226 #if 0
227   puts("};");
228 #endif
229   return 0;
230 }
231