1 #include "cpp_compress.hh"
2 
3 #include <vector>
4 #include <map>
5 #include <set>
6 #include <iostream>
7 
8 #include <stdio.h>
9 
10 namespace
11 {
12     std::string macro_prefix_chars = "qghdowam";
13     bool verbose = false;
14 
15     /* ^List of characters that are _least_ likely to form
16      *  a legitimate identifier name in the input code when
17      *  a [0-9A-Z] (case sensitive) is appended to it.
18      */
19 
20     std::set<std::string> parametric_macro_list;
21 
reset_parametric_macro_list()22     void reset_parametric_macro_list()
23     {
24         static const char* const list[] =
25         {
26             /* IMPORTANT NOTE: HARDCODED LIST OF ALL PREPROCESSOR
27              * MACROS THAT TAKE PARAMETERS. ANY EXTERNAL MACROS
28              * NOT LISTED HERE WILL BE BROKEN AFTER COMPRESSION.
29              * MACROS DEFINED WITHIN THE PROCESSED INPUT ARE FINE.
30              */
31             "assert", "getc", "putc",
32             "FP_TRACE_OPCODENAME",
33             "FP_TRACE_BYTECODE_OPTIMIZATION",
34             "FP_TRACE_BYTECODE_ADD",
35             "N","P", "DBL_ONLY", "LNG_ONLY",
36             "incStackPtr", "TryCompilePowi","findName"
37         };
38         parametric_macro_list.clear();
39         for(unsigned p=0; p<sizeof(list)/sizeof(*list); ++p)
40             parametric_macro_list.insert(list[p]);
41     }
42 
43     struct length_rec
44     {
45         unsigned begin_index;
46         unsigned num_tokens;
47         unsigned num_occurrences;
48     };
49 
isnamechar(char c)50     inline bool isnamechar(char c)
51     {
52         switch(c)
53         {
54             case '0':case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':
55             case '1':case 'g':case 'h':case 'i':case 'j':case 'k':case 'l':
56             case '2':case 'm':case 'n':case 'o':case 'p':case 'q':case 'r':
57             case '3':case 's':case 't':case 'u':case 'v':case 'w':case 'x':
58             case '4':case 'y':case 'z':case '_':
59             case '5':case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':
60             case '6':case 'G':case 'H':case 'I':case 'J':case 'K':case 'L':
61             case '7':case 'M':case 'N':case 'O':case 'P':case 'Q':case 'R':
62             case '8':case 'S':case 'T':case 'U':case 'V':case 'W':case 'X':
63             case '9':case 'Y':case 'Z': return true;
64         }
65         return false;
66     }
67 
68     class StringSeq: private std::string
69     {
70     private:
71         std::vector<bool> sticky;
72     public:
StringSeq()73         StringSeq()
74             : std::string(), sticky() { }
75 
StringSeq(const std::string & s)76         StringSeq(const std::string& s)
77             : std::string(s), sticky() { }
78 
GetString() const79         const std::string& GetString() const { return *this; }
80 
81         template<typename T>
operator +=(const T & k)82         void operator += (const T& k)
83         {
84             std::string::operator+=(k);
85         }
operator [](size_t pos) const86         char operator[] (size_t pos) const
87         {
88             return std::string::operator[] (pos);
89         }
90 
SetSticky(size_t n_last_chars)91         void SetSticky(size_t n_last_chars)
92         {
93             if(sticky.size() < size()) sticky.resize(size());
94             for(size_t p = size() - n_last_chars; p < size(); ++p)
95                 sticky[p] = true;
96         }
IsSticky(size_t index) const97         bool IsSticky(size_t index) const
98         {
99             return index < sticky.size() && sticky[index];
100         }
HasSticky() const101         bool HasSticky() const { return !sticky.empty(); }
102 
substr(size_t begin,size_t count=std::string::npos) const103         StringSeq substr(size_t begin, size_t count = std::string::npos) const
104         {
105             StringSeq result;
106             result.std::string::assign(*this, begin,count);
107             result.sticky.assign(
108                 begin < sticky.size()
109                     ? sticky.begin() + begin
110                     : sticky.end(),
111                 (count != std::string::npos && begin+count < sticky.size())
112                     ? sticky.begin() + begin + count
113                     : sticky.end());
114             while(!result.sticky.empty() && !result.sticky.back())
115                 result.sticky.pop_back();
116             return result;
117         }
118 
119         using std::string::empty;
120         using std::string::size;
121         using std::string::append;
122     };
123 
124     struct token
125     {
126         std::string value;
127         struct token_meta
128         {
129             unsigned    hash;
130             int         balance;
131             unsigned    comma;
132             bool        preproc;
133         } meta;
134 
token__anond5476afc0111::token135         token(const std::string& v) : value(v)
136         {
137             Rehash();
138         }
139 
token__anond5476afc0111::token140         token(const StringSeq& v) : value(v.GetString())
141         {
142             Rehash();
143             if(v.HasSticky())
144                 meta.preproc = true;
145         }
146 
operator =__anond5476afc0111::token147         void operator=(const std::string& v) { value=v; Rehash(); }
148 
operator ==__anond5476afc0111::token149         bool operator==(const token& b) const
150         {
151             return meta.hash == b.meta.hash
152                 && value == b.value
153                 && meta.preproc == b.meta.preproc;
154         }
155 
operator !=__anond5476afc0111::token156         bool operator!=(const token& b) const
157             { return !operator==(b); }
158 
Rehash__anond5476afc0111::token159         void Rehash()
160         {
161             const char* v = value.data();
162             meta.preproc = (v[0] == '#' || v[0] == '\n');
163             meta.balance = 0;
164             meta.comma   = 0;
165             meta.hash    = 0;
166             for(size_t a=0; a<value.size(); ++a)
167             {
168                 //meta.hash = meta.hash*0x8088405u + v[a];
169                 meta.hash = ((meta.hash << (1)) | (meta.hash >> (32-1))) - v[a];
170                 switch(v[a])
171                 {
172                     case '(': ++meta.balance; break;
173                     case ')': --meta.balance; break;
174                     case ',': ++meta.comma; break;
175                 }
176             }
177         }
swap__anond5476afc0111::token178         void swap(token& b)
179         {
180             value.swap(b.value);
181             std::swap(meta, b.meta);
182         }
183     };
184     bool Debug = false;
GetSeq(std::vector<token>::const_iterator begin,size_t n,bool NewLines)185     StringSeq GetSeq(std::vector<token>::const_iterator begin,
186                      size_t n, bool NewLines)
187     {
188         bool InDefineMode = false;
189 
190         /* Resequence the input */
191         StringSeq result;
192         int quotemode = 0;
193         while(n-- > 0)
194         {
195             const token&       tok = *begin;
196             const std::string& value = tok.value; ++begin;
197 
198             if(Debug)
199                 result += tok.meta.preproc
200                            ? (InDefineMode ? "\xBF" : "\xA1")  // upside-down ? and !
201                            : (InDefineMode ? "\x3F" : "\x21"); // ?, !
202             else if (!result.empty() && result[result.size()-1] != '\n')
203             {
204                 if (!InDefineMode && NewLines && value[0] == '#') result += '\n';
205                 else if (isnamechar(value[0])
206                      && (isnamechar(result[result.size()-1])
207                        || result[result.size()-1] == '"'
208                        || result[result.size()-1] == '\''
209                             //     || result[result.size()-1]==')'
210                             /* An identifier preceded by an identifier character
211                              * requires a separator. Also, an identifier
212                              * preceded by a macro that expands to something
213                              * that ends in an identifier character requires a
214                              * separator when using Microsoft C++, thus you may
215                              * need to include the ")" check above sometimes.
216                              * Also, in C++11 you can't tack an identifier right
217                              * after a string or character constant or there will
218                              * be an error.
219                              */
220                         ))
221                 {
222                     if (!NewLines || InDefineMode)
223                         result += ' ';
224                     else
225                         result += '\n';
226                 }
227             }
228             if (value[0] == '#')
229             {
230                 if (value.substr(0,7) == "#define") InDefineMode = true;
231                 if (value.substr(0,8) == "#include") InDefineMode = true;
232             }
233             if (value == "\n")
234             {
235                 if (InDefineMode) { result += "\n"; InDefineMode = false; }
236                 continue;
237             }
238 
239             switch(quotemode)
240             {
241                 case 0: // prev wasn't a quote, or this is not a quote
242                     if (value[0] == '"'
243                     && (n>0 && begin->value[0] == '"')) // this and next are quotes
244                         { quotemode = 1;
245                           result.append(value, 0, value.size()-1);
246                           //result += value.substr(0, value.size()-1);
247                           continue;
248                         }
249                     else
250                     {
251                         result += value;
252                         if(tok.meta.preproc || (value[0] == '(' && value.size() > 1))
253                         {
254                             /* A macro parameter list is an undivisible entity */
255                             result.SetSticky(value.size());
256                         }
257                     }
258                     break;
259                 case 1: // prev was a quote, skip this quote
260                     if (n>0 && begin->value[0] == '"')
261                         { //result += value.substr(1, value.size()-2);
262                           result.append(value, 1, value.size()-2);
263                           continue;
264                         }
265                     else
266                         { quotemode = 0;
267                           //result += value.substr(1);
268                           result.append(value, 1, value.size()-1);
269                         }
270                     break;
271             }
272             if (!Debug)
273             {
274                 if (NewLines && !InDefineMode)
275                 {
276                     if (value[0] == '#'
277                      || value[0] == '}'
278                      || value[0] == '"'
279                       )
280                     {
281                         result += '\n';
282                     }
283                 }
284                 if (n > 0 && (value.size() == 1 &&
285                                (value[0] == '<'  // < < is not <<
286                              || value[0] == '>'  // > > is not >>
287                              || value[0] == '+'  // + + is not ++
288                              || value[0] == '-'  // - - is not --
289                              || value[0] == '&') // & & is not &&
290                                ) && value == begin->value)
291                 {
292                     result += ' ';
293                 }
294             }
295         }
296         return result;
297     }
298 
299     struct DefineParsingMode
300     {
301         bool Active;
302         std::set<std::string> ParamList;
303 
DefineParsingMode__anond5476afc0111::DefineParsingMode304         DefineParsingMode() : Active(false), ParamList() { }
305 
Activate__anond5476afc0111::DefineParsingMode306         void Activate() { Active=true; }
Deactivate__anond5476afc0111::DefineParsingMode307         void Deactivate() { Active=false; ParamList.clear(); }
308     };
309 
310     std::vector<token>
Tokenize(const StringSeq & input,DefineParsingMode & defmode,bool SplitMacros,bool SplitStrings)311         Tokenize(const StringSeq& input,
312                  DefineParsingMode& defmode,
313                  bool SplitMacros,
314                  bool SplitStrings)
315     {
316         std::vector<token> result;
317         size_t a=0, b=input.size();
318         while(a < b)
319         {
320             /*if(input.IsSticky(a))
321             {
322                 size_t eat = 1;
323                 while(input.IsSticky(a+eat)) ++eat;
324                 result.push_back( input.substr(a, eat) );
325                 a += eat;
326                 continue;
327             }*/
328             if (defmode.Active && input[a]=='\\' && input[a+1] == '\n')
329             {
330                 a += 2;
331                 continue;
332             }
333             if (defmode.Active && input[a]=='\n')
334             {
335                 defmode.Deactivate();
336                 result.push_back( token("\n") );
337                 ++a;
338                 continue;
339             }
340             if (input[a]==' ' || input[a]=='\t'
341             || input[a]=='\n' || input[a]=='\r') { ++a; continue; }
342 
343             if (input[a]=='/' && input[a+1]=='*')
344             {
345                 a += 2;
346                 while(a < b && (input[a-2]!='*' || input[a-1]!='/')) ++a;
347                 continue;
348             }
349             if (input[a]=='/' && input[a+1]=='/')
350             {
351                 while(a < b && input[a]!='\n') ++a;
352                 continue;
353             }
354             if (input[a]=='_' || (input[a]>='a' && input[a]<='z')
355                               || (input[a]>='A' && input[a]<='Z'))
356             {
357                 size_t name_begin = a;
358                 while(++a < b)
359                 {
360                     if (isnamechar(input[a])) continue;
361                     break;
362                 }
363 
364                 std::string name(input.GetString(), name_begin, a-name_begin);
365                 result.push_back(name);
366 
367                 if (defmode.Active && defmode.ParamList.find(name)
368                                    != defmode.ParamList.end())
369                 {
370                     // Define params are immutable.
371                     result.back().meta.preproc = true;
372                 }
373 
374                 if (input[a] == '('
375                 && parametric_macro_list.find(name)
376                 != parametric_macro_list.end())
377                 {
378                     std::vector<token> remains = Tokenize(input.substr(a), defmode, SplitMacros, SplitStrings);
379                     int balance = 1;
380                     size_t eat = 1;
381                     for(; eat < remains.size() && balance != 0; ++eat)
382                         balance += remains[eat].meta.balance;
383                     if(SplitMacros)
384                     {
385                         for(size_t c=0; c<eat; ++c)
386                             if(remains[c].meta.balance != 0
387                             || remains[c].meta.comma != 0)
388                                 remains[c].meta.preproc = true;
389                         result.insert(result.end(), remains.begin(), remains.end());
390                     }
391                     else
392                     {
393                         //result.push_back( GetSeq(remains.begin(), eat, false) );
394                         StringSeq tmp = GetSeq(remains.begin(), eat, false);
395                         result.back() = result.back().value + tmp.GetString();
396                         result.insert(result.end(), remains.begin()+eat, remains.end());
397                     }
398                     a = b; // done
399                 }
400                 continue;
401             }
402             if (std::isdigit(input[a]) ||
403                (input[a] == '.' && std::isdigit(input[a+1])))
404             {
405                 size_t value_begin = a;
406                 while(++a < b)
407                 {
408                     if ((input[a]>='0' && input[a]<='9')
409                     || input[a]=='.' || input[a]=='+' || input[a]=='-'
410                     || input[a]=='x' || (input[a]>='a' && input[a]<='f')
411                     || input[a]=='p' || (input[a]>='A' && input[a]<='F')
412                     || input[a]=='u' || input[a]=='U'
413                     || input[a]=='l' || input[a]=='f'
414                     || input[a]=='L' || input[a]=='F') continue;
415                     break;
416                 }
417                 StringSeq s = input.substr(value_begin, a-value_begin );
418                 /* TODO: Convert hex to decimal */
419                 result.push_back( s );
420                 continue;
421             }
422             if (a+1 < b && input[a] == '>' && input[a+1] == '>')
423                 { int n = (a+2 < b && input[a+2] == '=') ? 3 : 2;
424                   result.push_back(input.substr(a, n)); a += n; continue; }
425             if (a+1 < b && input[a] == '<' && input[a+1] == '<')
426                 { int n = (a+2 < b && input[a+2] == '=') ? 3 : 2;
427                   result.push_back(input.substr(a, n)); a += n; continue; }
428             if (a+1 < b && input[a] == '+' && input[a+1] == '+')
429                 { result.push_back(input.substr(a, 2)); a += 2; continue; }
430             if (a+1 < b && input[a] == '-' && input[a+1] == '-')
431                 { result.push_back(input.substr(a, 2)); a += 2; continue; }
432             if (a+1 < b && input[a] == '-' && input[a+1] == '>')
433                 { result.push_back(input.substr(a, 2)); a += 2; continue; }
434             if (a+1 < b && input[a] == '#' && input[a+1] == '#')
435                 { result.push_back(input.substr(a, 2)); a += 2; continue; }
436             if (a+1 < b && input[a] == '&' && input[a+1] == '&')
437                 { result.push_back(input.substr(a, 2)); a += 2; continue; }
438             if (a+1 < b && input[a] == '|' && input[a+1] == '|')
439                 { result.push_back(input.substr(a, 2)); a += 2; continue; }
440             if (a+1 < b && (input[a] == '>' || input[a] == '<'
441                         || input[a] == '!' || input[a] == '='
442                         || input[a] == '+' || input[a] == '-'
443                         || input[a] == '*' || input[a] == '/'
444                         || input[a] == '&' || input[a] == '|'))
445                 if (input[a+1] == '=')
446                     { result.push_back(input.substr(a, 2)); a += 2; continue; }
447             if (a+1 < b && (input[a] == ':' && input[a+1] == ':'))
448                     { result.push_back(input.substr(a, 2)); a += 2; continue; }
449             if (!defmode.Active && input[a] == '#')
450             {
451                 if (input.substr(a,8).GetString() == "#include")
452                 {
453                     size_t p = a;
454                     while(p < b && input[p] != '\n') ++p;
455                     result.push_back( input.substr(a, p-a) );
456                     result.back().meta.preproc = true;
457                     result.push_back( token("\n") );
458                     a = p;
459                     continue;
460                 }
461                 if (input.substr(a,7).GetString() == "#define")
462                 {
463                     size_t p = a+7;
464                     while(p < b && std::isspace(input[p])) ++p; // skip blank
465                     size_t term_begin = p;
466                     while(p < b && isnamechar(input[p])) ++p; // skip term
467                     if (input[p] != '(')
468                     {
469                         defmode.Activate();
470                         std::string def = input.substr(a, p-a).GetString();
471                         if(input[p] != '\n') def += ' ';
472                         result.push_back(def); /* #define, term name and a space */
473                         a = p;
474                         continue;
475                     }
476                     else
477                     {
478                         std::string macro(input.GetString(), term_begin, p-term_begin);
479                         if(parametric_macro_list.find(macro)
480                         == parametric_macro_list.end())
481                         {
482                             if(verbose) std::cerr << "Detected parametric macro: " << macro << " (ok)\n";
483                             parametric_macro_list.insert(macro);
484                         }
485 
486                         size_t /*param_list_begin = p,*/ param_begin = p+1;
487                         int balance = 1;
488                         for (++p; true; ++p)
489                         {
490                             if(input[p]=='(') ++balance;
491                             if(input[p]==',' || input[p]==')')
492                             {
493                                 std::string paramname = input.substr(param_begin,p-param_begin).GetString();
494                                 //std::cerr << "Param name<" << paramname << ">\n";
495                                 defmode.ParamList.insert(paramname);
496                                 param_begin = p+1;
497                             }
498                             if(input[p]==')') { if(--balance == 0) { ++p; break; } }
499                         }
500                         size_t param_list_end = p;
501                         while(input[p] != '\n' && std::isspace(input[p])) ++p;
502 
503                         defmode.Activate();
504                         std::string def = input.substr(a, param_list_end-a).GetString();
505                         if(input[p] != '\n') def += ' ';
506                         result.push_back(def); /* #define, term name, params and a space */
507                         a = p;
508                         continue;
509                     }
510                 }
511                 size_t preproc_begin = a;
512                 bool in_quotes = false;
513                 while(++a < b)
514                 {
515                     if (!in_quotes && input[a]=='"')
516                         { in_quotes=true; continue; }
517                     if (in_quotes && input[a]=='"' && input[a-1]!='\\')
518                         { in_quotes=false; continue; }
519                     if (input[a]=='\\' && input[a+1]=='\n') { ++a; continue; }
520                     if (input[a]=='\n') break;
521                 }
522                 std::string stmt(input.GetString(), preproc_begin, a-preproc_begin);
523                 if (stmt.substr(0,5) != "#line")
524                     result.push_back(stmt);
525                 result.push_back( token("\n") );
526                 continue;
527             }
528             if (input[a] == '"')
529             {
530                 size_t string_begin = a;
531                 while(++a < b)
532                     if (input[a]=='"' &&
533                       (input[a-1] != '\\'
534                      || input[a-2]=='\\')) { ++a; break; }
535                 if(input.GetString() == "\"\"") continue; // Don't add an empty string token
536 
537                 std::string stringconst(input.GetString(), string_begin+1, a-string_begin-2);
538                 if (SplitMacros)
539                     while (true)
540                     {
541                         size_t p = stringconst.find_first_of(" ,+-", 0, 1);
542                         if(p == stringconst.npos) break;
543                         if(p > 0)
544                             result.push_back( "\""+std::string(stringconst,0,p)+"\"" );
545                         result.push_back( "\""+std::string(stringconst,p,1)+"\"" );
546                         stringconst.erase(0, p+1);
547                     }
548                 if(!stringconst.empty()) result.push_back("\""+stringconst+"\"");
549                 continue;
550             }
551             if (input[a] == '\'')
552             {
553                 size_t char_begin = a; a += 3;
554                 if (input[a-2] == '\\') ++a;
555                 result.push_back( std::string(input.GetString(), char_begin, a-char_begin) );
556                 continue;
557             }
558 
559             result.push_back( input.substr(a++, 1) );
560         }
561         return result;
562     }
563 
564     static const char cbuf[] =
565     "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_";
566 
567     unsigned macro_counter = 0;
568 
GenerateMacroName()569     std::string GenerateMacroName()
570     {
571         std::string result;
572         unsigned p = macro_counter++;
573         result += macro_prefix_chars[(p/36)%macro_prefix_chars.size()];
574         result += cbuf[p%36];
575         p /= unsigned(macro_prefix_chars.size()*36); // 0-9A-Z
576         for(; p != 0; p /= 63)
577             result += cbuf[p%63];
578         return result;
579     }
580 
CompressWithNonparametricMacros(std::vector<token> & tokens,const std::string & seq_name_buf)581     bool CompressWithNonparametricMacros
582         (std::vector<token>& tokens,
583          const std::string& seq_name_buf)
584     {
585         size_t seq_name_length = seq_name_buf.size();
586 
587         /* Find a sub-sequence of tokens for which
588          * the occurrence-count times total length is
589          * largest and the balance of parentheses is even.
590          */
591         std::map<unsigned, length_rec> hash_results;
592         long best_score=0;
593         //size_t best_score_length=0;
594         unsigned best_hash=0;
595 
596         if(verbose) std::cerr << tokens.size() << " tokens\n";
597 
598         std::vector<bool> donttest(tokens.size(), false);
599 
600         const size_t lookahead_depth = (140*100000) / tokens.size();
601         // ^Lookahead limit. The value chosen is an arbitrary value
602         //  to shield against exponential runtime. A larger value
603         //  yields better compression, but is slower.
604 
605         for(size_t a=0; a<tokens.size(); ++a)
606         {
607             if (donttest[a]) continue;
608 
609             //std::cerr << a << '\t' << best_score << '\t' << best_score_length << '\r' << std::flush;
610             size_t cap = a+lookahead_depth;
611             for(size_t b=a+1; b<tokens.size() && b<cap; ++b)
612             {
613                 size_t max_match_len = std::min(tokens.size()-b, b-a);
614                 size_t match_len = 0;
615                 unsigned hash = 0;
616                 //int balance = 0;
617 
618                 while(match_len < max_match_len
619                    && tokens[a+match_len] == tokens[b+match_len]
620                    && tokens[a+match_len].meta.preproc == false)
621                 {
622                     const token& word = tokens[a+match_len];
623 
624                     //balance += word.meta.balance;
625                     //if (preserve_params
626                     // && (balance < 0 || (word.meta.comma && balance==0))) break;
627 
628                     ++match_len;
629                     hash = ((hash << (1+0*(match_len&31)))
630                           ^ (hash >> (31-0*(match_len&31))))
631                           ^ word.meta.hash;
632                     //hash = ~hash*0x8088405u + word.meta.hash;
633 
634                     donttest[b] = true;
635 
636                     //if  (!balance == 0)
637                     {
638                         std::map<unsigned, length_rec>::iterator i
639                             = hash_results.lower_bound(hash);
640                         if (i == hash_results.end() || i->first != hash)
641                         {
642                             length_rec rec;
643                             rec.begin_index = (unsigned)a;
644                             rec.num_tokens  = (unsigned)match_len;
645                             rec.num_occurrences = 1;
646                             hash_results.insert(i, std::make_pair(hash,rec));
647                             cap = std::max(cap, b+match_len+lookahead_depth);
648                         }
649                         else if (i->second.begin_index == (unsigned)a)
650                         {
651                             if (std::equal(
652                                 tokens.begin()+a, tokens.begin()+a+match_len,
653                                 tokens.begin() + i->second.begin_index))
654                             {
655                                 long string_len = GetSeq(tokens.begin()+a, match_len, false).size();
656                                 long n = (i->second.num_occurrences += 1);
657                                 long define_length = seq_name_length + 9 - long(string_len);
658                                 long replace_length = long(string_len) - (long(seq_name_length)+1);
659                                 long score = replace_length * n - define_length;
660                                 if (score > best_score)
661                                 {
662                                     best_score        = score;
663                                     //best_score_length = string_len;
664                                     best_hash         = hash;
665                                 }
666                             }
667                             cap = std::max(cap, b+match_len+lookahead_depth);
668                         }
669                     }
670                 }
671             }
672         }
673         if (best_score > 0)
674         {
675             const length_rec& rec = hash_results[best_hash];
676             if (rec.num_occurrences > 0)
677             {
678                 /* Found a practical saving */
679                 std::vector<token> sequence
680                     (tokens.begin()+rec.begin_index,
681                      tokens.begin()+rec.begin_index+rec.num_tokens);
682                 if(verbose) std::cerr << "#define " << seq_name_buf << " " <<
683                     GetSeq(sequence.begin(), sequence.size(), false).GetString()
684                         //<< " /* " << rec.num_occurrences
685                         //<< " occurrences */"
686                           << std::endl;
687 
688                 /* Replace all occurrences of the sequence with the sequence name */
689                 std::vector<bool> deletemap(tokens.size(), false);
690                 for(size_t a=0;//rec.begin_index+rec.num_tokens;
691                            a+rec.num_tokens<=tokens.size();
692                            ++a)
693                 {
694                     if (std::equal(sequence.begin(),
695                                   sequence.end(),
696                                   tokens.begin()+a))
697                     {
698                         tokens[a] = seq_name_buf;
699                         for(size_t b=1; b<rec.num_tokens; ++b)
700                             deletemap[++a] = true;
701                     }
702                 }
703                 size_t tgt=0, src=0;
704                 for(; src < tokens.size(); ++src)
705                     if (!deletemap[src])
706                         tokens[tgt++].swap(tokens[src]);
707                 tokens.erase(tokens.begin()+tgt, tokens.end());
708 
709                 sequence.insert(sequence.begin(),
710                     token("#define " + seq_name_buf + " "));
711                 sequence.push_back( token("\n") );
712 
713                 tokens.insert(tokens.begin(),
714                     sequence.begin(), sequence.end());
715 
716                 /* Find more repetitions */
717                 return true;
718             }
719         }
720         return false;
721     }
722 
CompressWithParametricMacros(std::vector<token> &,const std::string &)723     bool CompressWithParametricMacros(
724         std::vector<token>& /*tokens*/,
725         const std::string& /*seq_name_buf*/)
726     {
727         /* TODO: Someone invent an algorithm for this?
728          *
729          * Find e.g.
730          *
731          * Cmpeq_i qI x == y ; } q1
732          * Cmpge_d qI x >= y ; } q1
733          * Cmpge_i qI x >= y ; } q1
734          * Cmpgt_d qI x >  y ; } q1
735          * Cmpgt_i qI x >  y ; } q1
736          * Cmple_d qI x <= y ; } q1
737          * Cmple_i qI x <= y ; } q1
738          * Cmplt_d qI x <  y ; } q1
739          * Cmplt_i qI x <  y ; } q1
740          * Cmpne_d qI x != y ; } q1
741          * Cmpne_i qI x != y ; } q1
742          *
743          * Substitute with:
744          *
745          * #define Zzz(A,B) A qI x B y ; } q1
746          * Zzz(Cmpeq_i,==)
747          * Zzz(Cmpge_d,>=)
748          * Zzz(Cmpge_i,>=)
749          * Zzz(Cmpgt_d,>)
750          * Zzz(Cmpgt_i,>)
751          * Zzz(Cmple_d,<=)
752          * Zzz(Cmple_i,<=)
753          * Zzz(Cmplt_d,<)
754          * Zzz(Cmplt_i,<)
755          * Zzz(Cmpne_d,!=)
756          * Zzz(Cmpne_i,!=)
757          *
758          * Which can be further turned into (well, theoretically):
759          *
760          * #define Zzz(A,B) A qI x B y ; } q1
761          * #define Zxy(A,B) Zzz(Cmp##A##_d,B) Zzz(Cmp##A##_i,B)
762          * Zzz(Cmpeq_i,==)
763          * Zxy(ge,>=)
764          * Zxy(gt,>)
765          * Zxy(le,<=)
766          * Zxy(lt,<)
767          * Zxy(ne,!=)
768          */
769         return false;
770     }
771 }
772 
Compress(const std::string & input)773 std::string CPPcompressor::Compress(const std::string& input)
774 {
775     FILE* fp = fopen("cpp_compress_disable", "r");
776     if(fp) { fclose(fp); return input; }
777 
778     reset_parametric_macro_list();
779     macro_counter = 0;
780 
781     DefineParsingMode defmode;
782     std::vector<token> tokens = Tokenize(input, defmode, false, false);
783 
784     int tried_retoken_rounds = 0;
785     std::string seq_name_buf = GenerateMacroName();
786 
787     //bool preserve_parens = false;
788 
789     StringSeq result;
790     while (true)
791     {
792         if (CompressWithNonparametricMacros(tokens, seq_name_buf))
793         {
794             tried_retoken_rounds = 0;
795             seq_name_buf = GenerateMacroName();
796         }
797         else if (CompressWithParametricMacros(tokens, seq_name_buf))
798         {
799             tried_retoken_rounds = 0;
800             seq_name_buf = GenerateMacroName();
801         }
802         else
803         {
804             if (tried_retoken_rounds >= 4) break;
805             //preserve_parens = true;
806 
807             if(verbose) std::cerr << "Retokenizing\n";
808             //static int counter=0; ++counter;
809             //if(counter>=1) {Debug=true;}
810             result = GetSeq(tokens.begin(), tokens.size(), true);
811             //if(counter>=1) break;
812 
813             DefineParsingMode defmode;
814             tokens = Tokenize(result, defmode, tried_retoken_rounds&1, tried_retoken_rounds&2);
815             ++tried_retoken_rounds;
816         }
817     }
818     return result.GetString();
819 }
820 
Compress(const std::string & input,const std::string & m)821 std::string CPPcompressor::Compress
822     (const std::string& input,
823      const std::string& m)
824 {
825     if(!m.empty()) macro_prefix_chars = m;
826     return Compress(input);
827 }
828