1
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <ctype.h>
5
6 #include <string>
7 #include <string.h>
8 #include "tlsh.h"
9
10 static void html_contents(std::string &htmls);
11 static void html_table(std::string &htmls, int *ntags);
12
// One weighted entry of a tag table.
struct tagdef {
    // Selection weight used by random_tags(); an entry whose weight is not
    // positive marks the end of a table.
    int relative_count;

    // Text of the opening tag, e.g. "<a href= >".  NULL in the terminator.
    const char *s;
};
17
18 struct tagdef anchor_def[] = {
19 { 165, "<a href= >" },
20 { 2, "<ahref= >" },
21 { 323, "<A href= >" },
22 { 1, "<Ahref= >" },
23 { 8, "<A HREF= >" },
24 { 2, "<a href= code= target= >" },
25 { 6, "<a href= id= >" },
26 { 8, "<a href= Id= >" },
27 { 7, "<A href= id= >" },
28 { 11, "<A href= Id= >" },
29 { 1, "<Ahref= Id= >" },
30 { 4, "<A href= id= id= >" },
31 { 14, "<a href= id= id= target= >" },
32 { 6, "<a href= ID= id= target= >" },
33 { 2, "<A href= id= id= target= >" },
34 { 19, "<a href= id= target= >" },
35 { 5, "<A href= Id= target= >" },
36 { 67, "<a href= target= >" },
37 { 2, "<ahref= target= >" },
38 { 7, "<A href= target= >" },
39 { 7, "<A href= target= rel= >" },
40 { 1, "<a href= Type= Id= >" },
41 { 6, "<a href= Type= Id= target= >" },
42 { 2, "<a target= href= >" },
43 { 2, "<A title= href= target= >" },
44 { 0, NULL }
45 };
46
random_tags(struct tagdef * tag_def)47 static int random_tags(struct tagdef *tag_def)
48 {
49 int count = 0;
50 int total = 0;
51 while (tag_def[count].relative_count > 0) {
52 total = total + tag_def[count].relative_count;
53 count ++;
54 }
55 // printf("total=%d count=%d\n", total, count);
56 int x = abs((int) random()) % total;
57 int idx = 0;
58 for (int ti=0; ti<count; ti++) {
59 // printf("x=%d ti=%d\n", x, ti);
60 x = x - tag_def[ti].relative_count;
61 if (x <= 0) {
62 idx = ti;
63 break;
64 }
65 }
66 return(idx);
67 }
68
// Append the closing tag matching an opening tag to htmls.
//
// htmls  output buffer; receives "</" + tag name + ">".
// tag    opening tag text such as "<a href= >".  The tag name is everything
//        after the first character up to the first space, tab, '>' or the end
//        of the string.  The string is only read, so it is taken as const.
//
// A NULL or empty tag yields "</>" instead of reading past the end of the
// string (the name scan starts at index 1).
static void endtag(std::string &htmls, const char *tag)
{
    htmls += "</";
    if (tag != NULL && tag[0] != '\0') {
        // Skip the leading '<' and copy the tag name only.
        for (const char *p = tag + 1; *p != '\0'; ++p) {
            if (*p == ' ' || *p == '\t' || *p == '>')
                break;
            htmls += *p;
        }
    }
    htmls += '>';
}
83
anchor(std::string & htmls)84 static void anchor(std::string &htmls)
85 {
86 int anchor_tag = random_tags(anchor_def);
87 char *anchor_tag_str = (char *) anchor_def[anchor_tag].s;
88 htmls += anchor_tag_str;
89 htmls += '\n';
90 endtag(htmls, anchor_tag_str);
91 htmls += '\n';
92 }
93
94 struct tagdef body_def[] = {
95 { 83, "<body>" },
96 { 38, "<BODY>" },
97 { 17, "<BODY bgcolor= >" },
98 { 46, "<BODY bgColor= >" },
99 { 24, "<body class= >" },
100 { 1, "<BODY lang= style= vLink= link= bgColor= >" },
101 { 1, "<BODY style= >" },
102 { 5, "<BODY style= bgColor= background= >" },
103 { 1, "<BODY style= text= bgColor= background= COLOR= >" },
104 { 1, "<BODY style= text= vLink= aLink= link= bgColor= background= COLOR= >" },
105 { 0, NULL }
106 };
107
108 struct tagdef meta_def[] = {
109 { 9, "<meta content= charset= http-equiv= >" },
110 { 1, "<META content= charset= http-equiv= >" },
111 { 90, "<META content= name= >" },
112 { 9, "<meta http-equiv= content= >" },
113 { 22, "<META http-equiv= content= >" },
114 { 23, "<meta http-equiv= content= charset= >" },
115 { 88, "<META http-equiv= content= charset= >" },
116 { 2, "<META HTTP-EQUIV= CONTENT= charset= >" },
117 { 27, "<meta name= content= >" },
118 { 2, "<META NAME= CONTENT= >" },
119 { 0, NULL }
120 };
121
122 struct tagdef head_def[] = {
123 { 64, "<head>" },
124 { 10, "<Head>" },
125 { 110, "<HEAD>" },
126 { 0, NULL }
127 };
128
head_meta(std::string & htmls)129 static void head_meta(std::string &htmls)
130 {
131 // <HEAD><META http-equiv= content= charset= ><META content= name= ><STYLE><TEXT></STYLE><style><TEXT></style><!-->--></HEAD>
132 int head_tag = random_tags(head_def);
133 char *head_tag_str = (char *) head_def[head_tag].s;
134 htmls += head_tag_str;
135 htmls += '\n';
136
137 // meta #1
138 int meta_tag = random_tags(meta_def);
139 char *meta_tag_str = (char *) meta_def[meta_tag].s;
140 htmls += meta_tag_str;
141 htmls += '\n';
142
143 // meta #2
144 meta_tag = random_tags(meta_def);
145 meta_tag_str = (char *) meta_def[meta_tag].s;
146 htmls += meta_tag_str;
147 htmls += '\n';
148
149 endtag(htmls, head_tag_str);
150 htmls += '\n';
151 }
152
153
154 struct tagdef html_def[] = {
155 { 111, "<html>" },
156 { 111, "<HTML>" },
157 { 1, "<HTML >" },
158 { 0, NULL }
159 };
160
html_tags(std::string & htmls,bool verbose)161 static void html_tags(std::string &htmls, bool verbose)
162 {
163 bool bodyFlag = true;
164 if (random() % 6 == 1)
165 bodyFlag = false;
166
167 if (random() % 20 == 1) {
168 if (random() % 10 == 1)
169 htmls += "<!doctype >";
170 else
171 htmls += "<!DOCTYPE >";
172 }
173
174 int html_tag = random_tags(html_def);
175 char *html_tag_str = (char *) html_def[html_tag].s;
176 htmls += html_tag_str;
177 htmls += '\n';
178
179 if (random() % 10 == 1) {
180 head_meta(htmls);
181 }
182
183 char *body_tag_str;
184 if (bodyFlag) {
185 int body_tag = random_tags(body_def);
186 body_tag_str = (char *) body_def[body_tag].s;
187 htmls += body_tag_str;
188 htmls += '\n';
189 }
190 if (verbose)
191 printf("BEFORE html_contents: %s\n", htmls.c_str() );
192 html_contents(htmls);
193 if (verbose)
194 printf("AFTER html_contents: %s\n", htmls.c_str() );
195 if (bodyFlag) {
196 endtag(htmls, body_tag_str);
197 htmls += '\n';
198 }
199 endtag(htmls, html_tag_str);
200 htmls += '\n';
201 }
202
203 #define MIN_TLSH_LEN 512
204
html(unsigned int seed,bool show_lsh,char * dir)205 static void html(unsigned int seed, bool show_lsh, char *dir)
206 {
207 std::string htmls;
208 Tlsh n;
209 bool verbose = false;
210 int showvers = 0;
211 srandom(seed);
212 // if (seed == 4628)
213 // verbose = true;
214 html_tags(htmls, verbose);
215 // printf("seed=%d len=%d\n", seed, htmls.length());
216 if (htmls.length() <= MIN_TLSH_LEN)
217 return;
218 n.final((unsigned char *)htmls.c_str(), htmls.length());
219 const char *tlsh_str = n.getHash(showvers);
220 if (tlsh_str == NULL)
221 return;
222 if (show_lsh) {
223 printf("%s %d\n", tlsh_str, seed);
224 } else {
225 if (dir == NULL) {
226 printf("=== seed=%d ===\n", seed);
227 printf("%s", htmls.c_str() );
228 } else {
229 char fname[1000];
230 snprintf(fname, 1000, "%s/tags.%d", dir, seed);
231 FILE *f;
232 f = fopen(fname, "w");
233 if (f == NULL) {
234 printf("error: cannot open to write %s\n", fname);
235 exit(1);
236 }
237 fprintf(f, "%s", htmls.c_str() );
238 fclose(f);
239 }
240 }
241 }
242
243 struct tagdef random_def[] = {
244 { 13, "<big>" },
245 { 10, "<big style= >" },
246 { 1, "<BLOCKQUOTE>" },
247 { 12, "<blockquote class= style= >" },
248 { 2, "<BLOCKQUOTE class= style= >" },
249 { 9, "<blockquote style= >" },
250 { 4, "<BLOCKQUOTE style= >" },
251 { 1, "<blockquote type= class= cite= >" },
252 { 1, "<center>" },
253 { 2, "<cite>" },
254 { 1, "<colgroup>" },
255 { 41, "<div>" },
256 { 1863, "<DIV>" }, // orig 18632
257 { 23, "<div align= >" },
258 { 7, "<DIV align= >" },
259 { 21, "<div class= >" },
260 { 2, "<DIV class= >" },
261 { 1, "<DIV class= lang= dir= align= >" },
262 { 11, "<DIV dir= align= >" },
263 { 28, "<DIV dir= style= >" },
264 { 2, "<div id= >" },
265 { 13, "<DIV id= >" },
266 { 4, "<div id= class= >" },
267 { 7, "<DIV id= dir= >" },
268 { 2, "<div id= style= >" },
269 { 25, "<div style= >" },
270 { 11, "<DIV style= >" },
271 { 7, "<DL>" },
272 { 35, "<DT>" },
273 { 4, "<EM>" },
274 { 122, "<font color= >" },
275 { 46, "<FONT color= >" },
276 { 1, "<FONT COLOR= >" },
277 { 2, "<font color= face= >" },
278 { 36, "<FONT color= size= >" },
279 { 40, "<font color= size= face= >" },
280 { 2, "<font color= target= >" },
281 { 11, "<font face= >" },
282 { 1, "<FONT face= >" },
283 { 12, "<FONT FACE= >" },
284 { 2, "<FONT face= color= >" },
285 { 11, "<FONT face= color= size= >" },
286 { 1, "<font face= size= >" },
287 { 52, "<FONT face= size= >" },
288 { 47, "<font size= >" },
289 { 93, "<FONT size= >" },
290 { 5, "<FONT SIZE= >" },
291 { 4, "<font size= color= >" },
292 { 17, "<font size= color= face= >" },
293 { 17, "<FONT size= color= face= >" },
294 { 109, "<font size= face= >" },
295 { 2, "<FONT size= face= >" },
296 { 1, "<FONT SIZE= SIZE= FACE= LANG= >" },
297 { 4, "<FONT style= >" },
298 { 3, "<h1>" },
299 { 8, "<i>" },
300 { 40, "<I>" },
301 { 205, "<p>" },
302 { 1796, "<P>" },
303 { 3, "<p align= >" },
304 { 14, "<P align= >" },
305 { 1, "<P ALIGN= >" },
306 { 19, "<pre>" },
307 { 1, "<PRE>" },
308 { 2, "<pre style= >" },
309 { 2, "<p style= >" },
310 { 2, "<P style= >" },
311 { 1, "<small>" },
312 { 1, "<span class= >" },
313 { 14, "<SPAN class= >" },
314 { 10, "<span dir= >" },
315 { 4, "<SPAN id= >" },
316 { 4, "<SPAN name= border= >" },
317 { 40, "<span style= >" },
318 { 32, "<SPAN style= >" },
319 { 11, "<SPAN STYLE= >" },
320 { 1, "<strong>" },
321 { 5, "<STRONG>" },
322 { 32, "<style>" },
323 { 55, "<STYLE>" },
324 { 8, "<style type= >" },
325 { 1, "<STYLE type= >" },
326 { 25, "<tt>" },
327 { 3, "<u>" },
328 { 2, "<U>" },
329 { 4, "<ul>" },
330 { 0, NULL }
331 };
332
333 struct tagdef oneoff_def[] = {
334 { 2225, "<br>" }, // originally 22256
335 { 17, "<br >" },
336 { 494, "<BR>" }, // originally 4941
337 { 6, "<br clear= >" },
338 { 10, "<br style= >" },
339 { 2, "<hr>" },
340 { 17, "<hr >" },
341 { 2, "<HR>" },
342 { 1, "<HR ALIGN= SIZE= WIDTH= >" },
343 { 1, "<hr align= width= SIZE= >" },
344 { 2, "<hr id= >" },
345 { 2, "<HR id= >" },
346 { 2, "<hr noshade>" },
347 { 10, "<hr size= >" },
348 { 3, "<HR SIZE= >" },
349 { 4, "<HR style= >" },
350 { 1, "<HR tabIndex= >" },
351 { 19, "<IMG alt= hspace= src= align= border= >" },
352 { 1, "<IMG alt= src= border= >" },
353 { 2, "<IMG alt= src=\"cid:border= >" },
354 { 2, "<IMG alt= src= id= border= >" },
355 { 2, "<IMG alt= src= src=\"cid:border= >" },
356 { 2, "<IMG src= >" },
357 { 1, "<img src= alt= height= width= >" },
358 { 2, "<img src= border= width= height= >" },
359 { 1, "<IMG src=\"cid:border= >" },
360 { 1, "<img src=\"cid:width= height= >" },
361 { 2, "<IMG src= src=\"cid:border= >" },
362 { 8, "<img width= height= src=\"cid:border= alt= >" },
363 { 1, "<img width= height= src= src=\"cid:border= alt= >" },
364 { 4186, "<TEXT>" }, // originally 41862
365 { 0, NULL }
366 };
367
rhtml_contents(std::string & htmls,int * ntags,int * ndistinct_tags)368 static void rhtml_contents(std::string &htmls, int *ntags, int *ndistinct_tags)
369 {
370 // original code
371 // ((*ntags <= 0) && ( ndistinct_tags <= 0))
372 // bad - should not do comparison on pointer value
373 //
374 // intention
375 // ((*ntags <= 0) && (*ndistinct_tags <= 0))
376 // have the == NULL test to be consistent - pass regression tests
377 if ((*ntags <= 0) && (ndistinct_tags == NULL))
378 return;
379 if (random() % 10 == 1) {
380 anchor(htmls);
381 *ntags = *ntags - 2;
382 *ndistinct_tags = *ndistinct_tags - 1;
383 } else if (random() % 20 == 1) {
384 html_table(htmls, ntags);
385 *ndistinct_tags = *ndistinct_tags - 1;
386 } else if (random() % 3 == 1) {
387 int oneoff_tag = random_tags(oneoff_def);
388 char *oneoff_tag_str = (char *) oneoff_def[oneoff_tag].s;
389 htmls += oneoff_tag_str;
390 htmls += '\n';
391 *ntags = *ntags - 1;
392 *ndistinct_tags = *ndistinct_tags - 1;
393 } else if (random() % 3 == 1) {
394 return;
395 } else {
396 int rtag = random_tags(random_def);
397 char *rtag_str = (char *) random_def[rtag].s;
398 htmls += rtag_str;
399 htmls += '\n';
400 *ntags = *ntags - 2;
401 *ndistinct_tags = *ndistinct_tags - 1;
402 rhtml_contents(htmls, ntags, ndistinct_tags);
403 endtag(htmls, rtag_str);
404 htmls += '\n';
405 }
406 }
407
html_contents(std::string & htmls)408 static void html_contents(std::string &htmls)
409 {
410 int ntags = random() % 32;
411 int loop;
412 for (loop=0; loop<8; loop++) {
413 if (random() % 2 == 1)
414 break;
415 ntags = ntags * 2;
416 }
417 ntags = ntags + 5;
418 // printf("ntags=%d loop=%d\n", ntags, loop);
419
420 int ndistinct_tags = 3;
421 while ((ntags > 0) || (ndistinct_tags > 0))
422 rhtml_contents(htmls, &ntags, &ndistinct_tags);
423 }
424
425 struct tagdef table_def[] = {
426 { 7, "<table>" },
427 { 4, "<table >" },
428 { 1, "<table align= border= cellspacing= width= >" },
429 { 1, "<TABLE bgColor= border= Color= cellPadding= cellSpacing= height= width= >" },
430 { 1, "<table border= cellspacing= cellpadding= >" },
431 { 2, "<table cellpadding= cellspacing= border= style= >" },
432 { 11, "<table cellspacing= cellpadding= border= >" },
433 { 3, "<TABLE cellSpacing= cellPadding= border= >" },
434 { 7, "<TABLE cellSpacing= cellPadding= width= >" },
435 { 2, "<TABLE cellSpacing= width= align= border= >" },
436 { 1, "<TABLE class= width= >" },
437 { 7, "<TABLE Color= height= width= bgColor= border= >" },
438 { 7, "<TABLE Color= height= width= border= >" },
439 { 7, "<TABLE id= cellSpacing= cellPadding= width= border= >" },
440 { 1, "<table style= cellspacing= cellpadding= width= border= >" },
441 { 14, "<table width= >" },
442 { 1, "<table width= border= >" },
443 { 4, "<table width= border= cellspacing= cellpadding= >" },
444 { 0, NULL }
445 };
446
447 struct tagdef td_def[] = {
448 { 59, "<td>" },
449 { 2, "<TD align= bgColor= height= width= >" },
450 { 8, "<td bgcolor= >" },
451 { 1, "<td bgcolor= height= >" },
452 { 2, "<TD bgColor= height= >" },
453 { 1, "<td class= style= width= height= >" },
454 { 7, "<TD Color= width= bgColor= height= >" },
455 { 1, "<td height= >" },
456 { 2, "<TD height= >" },
457 { 1, "<TD id= dir= style= width= >" },
458 { 1, "<TD id= style= vAlign= width= >" },
459 { 5, "<TD id= style= width= >" },
460 { 9, "<TD id= vAlign= align= >" },
461 { 7, "<TD id= width= >" },
462 { 1, "<td style= width= >" },
463 { 4, "<td valign= >" },
464 { 6, "<TD vAlign= >" },
465 { 1, "<TD vAlign= align= >" },
466 { 1, "<TD vAlign= colSpan= >" },
467 { 10, "<td valign= style= >" },
468 { 25, "<td width= >" },
469 { 7, "<TD width= >" },
470 { 7, "<TD width= bgColor= height= >" },
471 { 1, "<td width= bgcolor= valign= >" },
472 { 14, "<TD width= height= >" },
473 { 0, NULL }
474 };
475
476 struct tagdef tr_def[] = {
477 { 20, "<tr>" },
478 { 58, "<TR>" },
479 { 3, "<TR class= >" },
480 { 1, "<tr style= height= >" },
481 { 47, "<tr valign= >" },
482 { 0, NULL }
483 };
484
485 #define MAX_COL 10
486 #define MAX_ROW 10
487
html_table(std::string & htmls,int * ntags)488 static void html_table(std::string &htmls, int *ntags)
489 {
490 char * col_tag_str[MAX_COL];
491 char * row_tag_str;
492 int table_tag = random_tags(table_def);
493 char *table_tag_str = (char *) table_def[table_tag].s;
494 htmls += table_tag_str;
495 htmls += '\n';
496 *ntags = *ntags - 1;
497
498 int nrow = random() % MAX_ROW;
499 int ncol = random() % MAX_COL;
500 if (random() % 2 == 1)
501 nrow = nrow / 2;
502 if (random() % 2 == 1)
503 nrow = nrow / 2;
504 if (random() % 2 == 1)
505 ncol = ncol / 2;
506 if (random() % 2 == 1)
507 ncol = ncol / 2;
508 if (nrow <= 0)
509 nrow = 1;
510 if (ncol <= 0)
511 ncol = 1;
512 // printf("nrow = %d\n", nrow);
513 // printf("ncol = %d\n", ncol);
514
515 int row_tag = random_tags( tr_def );
516 row_tag_str = (char *) tr_def[row_tag].s;
517 for (int ci=0; ci<ncol; ci++) {
518 int col_tag = random_tags( td_def );
519 col_tag_str[ci] = (char *) td_def[col_tag].s;
520 }
521
522 for (int ri=0; ri<nrow; ri++) {
523 htmls += row_tag_str;
524 for (int ci=0; ci<ncol; ci++) {
525 htmls += col_tag_str[ci];
526 endtag(htmls, col_tag_str[ci]);
527 }
528 endtag(htmls, row_tag_str);
529 htmls += '\n';
530
531 *ntags = *ntags - (ncol + 2);
532 if (*ntags <= 0)
533 break;
534 }
535
536 endtag(htmls, table_tag_str);
537 htmls += '\n';
538 *ntags = *ntags - 1;
539 }
540
541 // struct tagdef meta_def[] = {
542 // { 0, NULL }
543 // };
544
545 #ifdef UNUSED
546
547
548
549 { 2, "<tbody>" },
550 { 35, "<TBODY>" },
551
552 { 31, "</title>" },
553 { 31, "<title>" },
554 { 43, "</TITLE>" },
555 { 43, "<TITLE>" },
556
557
558 #endif
559
// Print the command-line help to stdout and terminate with exit status 1.
// Never returns.
//
// The text lists the -d option, which main() accepts.
// NOTE(review): -v1/-v2 are listed here but main() does not parse them;
// passing either one just prints this help.
static void usage()
{
    printf("rand_tags [-start start] [-end end] [-d dir] [-tlsh] [-v1|v2]\n");
    printf(" generate random tag structure\n");
    printf(" start = starting seed\n");
    printf(" end = ending seed\n");
    printf(" -d dir write each document to dir/tags.<seed> instead of stdout\n");
    printf(" -tlsh output tlsh value instead of HTML\n");
    printf(" -v1 output nilsimsa\n");
    printf(" -v2 output TLSH * default value *\n");
    exit(1);
}
571
main(int argc,char * argv[])572 int main(int argc, char *argv[])
573 {
574 int start_seed = 1;
575 int end_seed = -1;
576 bool show_lsh = false;
577 char *dir = NULL;
578
579 int argIdx = 1;
580 if (argc == 1)
581 usage();
582 while (argc > argIdx) {
583 if (strcmp(argv[argIdx], "-start") == 0) {
584 if (argIdx+1 <= argc && isdigit((unsigned char)argv[argIdx+1][0])) {
585 start_seed = atoi(argv[argIdx+1]);
586 }
587 argIdx = argIdx+2;
588 } else if (strcmp(argv[argIdx], "-end") == 0) {
589 if (argIdx+1 <= argc && isdigit((unsigned char)argv[argIdx+1][0])) {
590 end_seed = atoi(argv[argIdx+1]);
591 }
592 argIdx = argIdx+2;
593 } else if (strcmp(argv[argIdx], "-d") == 0) {
594 dir = argv[argIdx+1];
595 argIdx = argIdx+2;
596 } else if (strcmp(argv[argIdx], "-tlsh") == 0) {
597 show_lsh = true;
598 argIdx = argIdx+1;
599 } else {
600 usage();
601 }
602 }
603 if (end_seed == -1) {
604 html(start_seed, show_lsh, dir);
605 } else {
606 for (int seed=start_seed; seed<=end_seed; seed++) {
607 html(seed, show_lsh, dir);
608 }
609 }
610 }
611