xref: /reactos/sdk/tools/asmpp/asmpp.cpp (revision 2245dd78)
1 /*
2  * PROJECT:     ReactOS host tools
3  * LICENSE:     MIT (https://spdx.org/licenses/MIT)
4  * PURPOSE:     ASM preprocessor
5  * COPYRIGHT:   Copyright 2021 Timo Kreuzer <timo.kreuzer@reactos.org>
6  */
7 
8 // Optimize even on debug builds, because otherwise it's ridiculously slow
9 #ifdef _MSC_VER
10 #pragma optimize("gst", on)
11 #pragma auto_inline(on)
12 #else
13 #pragma GCC optimize("O3,inline")
14 #endif
15 
16 #include "tokenizer.hpp"
17 #include <cstdlib>
18 #include <cstdio>
19 #include <sstream>
20 #include <ctime>
21 
22 #define PROFILING_ENABLED 0
23 
24 using namespace std;
25 
26 time_t search_time;
27 
// Token categories used by the token table (g_TokenList) and by the
// translate_* functions. This is an unscoped enum: values convert implicitly
// to int, which the code relies on when comparing against Token::type().
enum TOKEN_TYPE
{
    // Generic lexical categories (Invalid is -1, so Eof is 0)
    Invalid = -1,
    Eof,
    WhiteSpace,
    NewLine,
    Comment,
    DecNumber,
    HexNumber,
    String,

    // Punctuation
    BraceOpen,
    BraceClose,
    MemRefStart,
    MemRefEnd,
    Colon,
    Operator,
    StringDef,

    // Directive keywords
    KW_include,
    KW_const,
    KW_code,
    KW_endprolog,
    KW_ALIGN,
    KW_EXTERN,
    KW_PUBLIC,
    KW_ENDM,
    KW_END,
    KW_if,
    KW_ifdef,
    KW_ifndef,
    KW_else,
    KW_endif,

    // Dot-prefixed keywords (.allocstack, .savereg, .savexmm128)
    KW_allocstack,
    KW_savereg,
    KW_savexmm128,

    // Data definition and declaration keywords
    KW_DB,
    KW_DW,
    KW_DD,
    KW_DQ,
    KW_EQU,
    KW_TEXTEQU,
    KW_MACRO,
    KW_PROC,
    KW_FRAME,
    KW_ENDP,
    KW_RECORD,

    KW_MASK,
    KW_ERRDEF,

    // File names, instruction mnemonics, registers and size overrides
    Filename,
    Instruction,
    Reg8,
    Reg16,
    Reg32,
    Reg64,
    RegXmm,
    BYTE_PTR,
    WORD_PTR,
    DWORD_PTR,
    QWORD_PTR,
    XMMWORD_PTR,

    // Names
    LabelName,
    Identifier
};
97 
// Drop-in replacement for printf() that produces no output and always
// reports zero characters written. It can be enabled through the
// (commented-out) "#define printf fake_printf" below this function.
int fake_printf(const char* format, ...)
{
    (void)format; // arguments are intentionally ignored
    return 0;
}
102 
103 //#define printf fake_printf
104 
// FOLLOWED_BY(x) appends the pattern x for the character(s) that must follow
// a token. The look-ahead variant (which would keep those characters out of
// the match) is disabled; the active definition pastes x in as-is, so x is
// part of the overall match but sits outside the token's capture group.
//#define FOLLOWED_BY(x) R"((?=)" x R"())"
#define FOLLOWED_BY(x) x

// Reusable regex fragments (raw string literals).
// ANY_CHAR:   any single character, including a newline
// WHITESPACE: a run of spaces and/or tabs
// NEWLINE:    a single line feed
// WS_OR_NL:   one or more whitespace runs or newlines
// SEPARATOR:  characters allowed directly after an identifier
#define ANY_CHAR R"((?:.|\n))"
#define WHITESPACE R"((?:[ \t]++))"
#define NEWLINE R"([\n])"
#define WS_OR_NL R"((?:)" WHITESPACE "|" NEWLINE R"()+)"
#define SEPARATOR R"([\s,\=\+\-\*\/\:\~\[\]])"
114 
// Regex alternation of instruction mnemonics; it is wrapped in a capture
// group to form the TOKEN_TYPE::Instruction pattern in g_TokenList.
// NOTE(review): entries such as CMOVcc, FCMOVcc, Jcc, LOOPcc, SETcc and
// PREFETCHh are spelled with their condition-code/hint placeholders as
// literal text, so they presumably do not match concrete mnemonics such as
// JNE or SETZ - confirm against the tokenizer's matching rules.
#define INSTRUCTION \
    "AAA|AAD|AAM|AAS|ADC|ADCX|ADD|ADDPD|ADDPS|ADDSD|ADDSS|ADDSUBPD|ADDSUBPS|" \
    "ADOX|AESDEC|AESDECLAST|AESENC|AESENCLAST|AESIMC|AESKEYGENASSIST|AND|ANDN|" \
    "ANDNPD|ANDNPS|ANDPD|ANDPS|ARPL|BEXTR|BLENDPD|BLENDPS|BLENDVPD|BLENDVPS|" \
    "BLSI|BLSMSK|BLSR|BNDCL|BNDCN|BNDCU|BNDLDX|BNDMK|BNDMOV|BNDSTX|BOUND|BSF|" \
    "BSR|BSWAP|BT|BTC|BTR|BTS|BZHI|CALL|CBW|CDQ|CDQE|CLAC|CLC|CLD|CLDEMOTE|" \
    "CLFLUSH|CLFLUSHOPT|CLI|CLTS|CLWB|CMC|CMOVcc|CMP|CMPPD|CMPPS|CMPS|CMPSB|" \
    "CMPSD|CMPSQ|CMPSS|CMPSW|CMPXCHG|CMPXCHG16B|CMPXCHG8B|COMISD|COMISS|CPUID|" \
    "CQO|CRC32|CVTDQ2PD|CVTDQ2PS|CVTPD2DQ|CVTPD2PI|CVTPD2PS|CVTPI2PD|CVTPI2PS|" \
    "CVTPS2DQ|CVTPS2PD|CVTPS2PI|CVTSD2SI|CVTSD2SS|CVTSI2SD|CVTSI2SS|CVTSS2SD|" \
    "CVTSS2SI|CVTTPD2DQ|CVTTPD2PI|CVTTPS2DQ|CVTTPS2PI|CVTTSD2SI|CVTTSS2SI|CWD|" \
    "CWDE|DAA|DAS|DEC|DIV|DIVPD|DIVPS|DIVSD|DIVSS|DPPD|DPPS|EMMS|ENTER|" \
    "EXTRACTPS|F2XM1|FABS|FADD|FADDP|FBLD|FBSTP|FCHS|FCLEX|FCMOVcc|FCOM|FCOMI|" \
    "FCOMIP|FCOMP|FCOMPP|FCOS|FDECSTP|FDIV|FDIVP|FDIVR|FDIVRP|FFREE|FIADD|" \
    "FICOM|FICOMP|FIDIV|FIDIVR|FILD|FIMUL|FINCSTP|FINIT|FIST|FISTP|FISTTP|" \
    "FISUB|FISUBR|FLD|FLD1|FLDCW|FLDENV|FLDL2E|FLDL2T|FLDLG2|FLDLN2|FLDPI|" \
    "FLDZ|FMUL|FMULP|FNCLEX|FNINIT|FNOP|FNSAVE|FNSTCW|FNSTENV|FNSTSW|FPATAN|" \
    "FPREM|FPREM1|FPTAN|FRNDINT|FRSTOR|FSAVE|FSCALE|FSIN|FSINCOS|FSQRT|FST|" \
    "FSTCW|FSTENV|FSTP|FSTSW|FSUB|FSUBP|FSUBR|FSUBRP|FTST|FUCOM|FUCOMI|" \
    "FUCOMIP|FUCOMP|FUCOMPP|FWAIT|FXAM|FXCH|FXRSTOR|FXSAVE|FXTRACT|FYL2X|" \
    "FYL2XP1|GF2P8AFFINEINVQB|GF2P8AFFINEQB|GF2P8MULB|HADDPD|HADDPS|HLT|" \
    "HSUBPD|HSUBPS|IDIV|IMUL|IN|INC|INS|INSB|INSD|INSERTPS|INSW|INT|INT1|INT3|" \
    "INTO|INVD|INVLPG|INVPCID|IRET|IRETD|JMP|Jcc|KADDB|KADDD|KADDQ|KADDW|" \
    "KANDB|KANDD|KANDNB|KANDND|KANDNQ|KANDNW|KANDQ|KANDW|KMOVB|KMOVD|KMOVQ|" \
    "KMOVW|KNOTB|KNOTD|KNOTQ|KNOTW|KORB|KORD|KORQ|KORTESTB|KORTESTD|KORTESTQ|" \
    "KORTESTW|KORW|KSHIFTLB|KSHIFTLD|KSHIFTLQ|KSHIFTLW|KSHIFTRB|KSHIFTRD|" \
    "KSHIFTRQ|KSHIFTRW|KTESTB|KTESTD|KTESTQ|KTESTW|KUNPCKBW|KUNPCKDQ|KUNPCKWD|" \
    "KXNORB|KXNORD|KXNORQ|KXNORW|KXORB|KXORD|KXORQ|KXORW|LAHF|LAR|LDDQU|" \
    "LDMXCSR|LDS|LEA|LEAVE|LES|LFENCE|LFS|LGDT|LGS|LIDT|LLDT|LMSW|LOCK|LODS|" \
    "LODSB|LODSD|LODSQ|LODSW|LOOP|LOOPcc|LSL|LSS|LTR|LZCNT|MASKMOVDQU|MASKMOVQ|" \
    "MAXPD|MAXPS|MAXSD|MAXSS|MFENCE|MINPD|MINPS|MINSD|MINSS|MONITOR|MOV|MOVAPD|" \
    "MOVAPS|MOVBE|MOVD|MOVDDUP|MOVDIR64B|MOVDIRI|MOVDQ2Q|MOVDQA|MOVDQU|MOVHLPS|" \
    "MOVHPD|MOVHPS|MOVLHPS|MOVLPD|MOVLPS|MOVMSKPD|MOVMSKPS|MOVNTDQ|MOVNTDQA|" \
    "MOVNTI|MOVNTPD|MOVNTPS|MOVNTQ|MOVQ|MOVQ2DQ|MOVS|MOVSB|MOVSD|MOVSHDUP|" \
    "MOVSLDUP|MOVSQ|MOVSS|MOVSW|MOVSX|MOVSXD|MOVUPD|MOVUPS|MOVZX|MPSADBW|MUL|" \
    "MULPD|MULPS|MULSD|MULSS|MULX|MWAIT|NEG|NOP|NOT|OR|ORPD|ORPS|OUT|OUTS|" \
    "OUTSB|OUTSD|OUTSW|PABSB|PABSD|PABSQ|PABSW|PACKSSDW|PACKSSWB|PACKUSDW|" \
    "PACKUSWB|PADDB|PADDD|PADDQ|PADDSB|PADDSW|PADDUSB|PADDUSW|PADDW|PALIGNR|" \
    "PAND|PANDN|PAUSE|PAVGB|PAVGW|PBLENDVB|PBLENDW|PCLMULQDQ|PCMPEQB|PCMPEQD|" \
    "PCMPEQQ|PCMPEQW|PCMPESTRI|PCMPESTRM|PCMPGTB|PCMPGTD|PCMPGTQ|PCMPGTW|" \
    "PCMPISTRI|PCMPISTRM|PDEP|PEXT|PEXTRB|PEXTRD|PEXTRQ|PEXTRW|PHADDD|PHADDSW|" \
    "PHADDW|PHMINPOSUW|PHSUBD|PHSUBSW|PHSUBW|PINSRB|PINSRD|PINSRQ|PINSRW|" \
    "PMADDUBSW|PMADDWD|PMAXSB|PMAXSD|PMAXSQ|PMAXSW|PMAXUB|PMAXUD|PMAXUQ|PMAXUW|" \
    "PMINSB|PMINSD|PMINSQ|PMINSW|PMINUB|PMINUD|PMINUQ|PMINUW|PMOVMSKB|PMOVSX|" \
    "PMOVZX|PMULDQ|PMULHRSW|PMULHUW|PMULHW|PMULLD|PMULLQ|PMULLW|PMULUDQ|POP|" \
    "POPA|POPAD|POPCNT|POPF|POPFD|POPFQ|POR|PREFETCHW|PREFETCHh|PSADBW|PSHUFB|" \
    "PSHUFD|PSHUFHW|PSHUFLW|PSHUFW|PSIGNB|PSIGND|PSIGNW|PSLLD|PSLLDQ|PSLLQ|" \
    "PSLLW|PSRAD|PSRAQ|PSRAW|PSRLD|PSRLDQ|PSRLQ|PSRLW|PSUBB|PSUBD|PSUBQ|PSUBSB|" \
    "PSUBSW|PSUBUSB|PSUBUSW|PSUBW|PTEST|PTWRITE|PUNPCKHBW|PUNPCKHDQ|PUNPCKHQDQ|" \
    "PUNPCKHWD|PUNPCKLBW|PUNPCKLDQ|PUNPCKLQDQ|PUNPCKLWD|PUSH|PUSHA|PUSHAD|" \
    "PUSHF|PUSHFD|PUSHFQ|PXOR|RCL|RCPPS|RCPSS|RCR|RDFSBASE|RDGSBASE|RDMSR|" \
    "RDPID|RDPKRU|RDPMC|RDRAND|RDSEED|RDTSC|RDTSCP|REP|REPE|REPNE|REPNZ|REPZ|" \
    "RET|ROL|ROR|RORX|ROUNDPD|ROUNDPS|ROUNDSD|ROUNDSS|RSM|RSQRTPS|RSQRTSS|SAHF|" \
    "SAL|SAR|SARX|SBB|SCAS|SCASB|SCASD|SCASW|SETcc|SFENCE|SGDT|SHA1MSG1|" \
    "SHA1MSG2|SHA1NEXTE|SHA1RNDS4|SHA256MSG1|SHA256MSG2|SHA256RNDS2|SHL|SHLD|" \
    "SHLX|SHR|SHRD|SHRX|SHUFPD|SHUFPS|SIDT|SLDT|SMSW|SQRTPD|SQRTPS|SQRTSD|" \
    "SQRTSS|STAC|STC|STD|STI|STMXCSR|STOS|STOSB|STOSD|STOSQ|STOSW|STR|SUB|" \
    "SUBPD|SUBPS|SUBSD|SUBSS|SWAPGS|SYSCALL|SYSENTER|SYSEXIT|SYSRET|TEST|" \
    "TPAUSE|TZCNT|UCOMISD|UCOMISS|UD|UMONITOR|UMWAIT|UNPCKHPD|UNPCKHPS|" \
    "UNPCKLPD|UNPCKLPS|VALIGND|VALIGNQ|VBLENDMPD|VBLENDMPS|VBROADCAST|" \
    "VCOMPRESSPD|VCOMPRESSPS|VCVTPD2QQ|VCVTPD2UDQ|VCVTPD2UQQ|VCVTPH2PS|" \
    "VCVTPS2PH|VCVTPS2QQ|VCVTPS2UDQ|VCVTPS2UQQ|VCVTQQ2PD|VCVTQQ2PS|VCVTSD2USI|" \
    "VCVTSS2USI|VCVTTPD2QQ|VCVTTPD2UDQ|VCVTTPD2UQQ|VCVTTPS2QQ|VCVTTPS2UDQ|" \
    "VCVTTPS2UQQ|VCVTTSD2USI|VCVTTSS2USI|VCVTUDQ2PD|VCVTUDQ2PS|VCVTUQQ2PD|" \
    "VCVTUQQ2PS|VCVTUSI2SD|VCVTUSI2SS|VDBPSADBW|VERR|VERW|VEXPANDPD|VEXPANDPS|" \
    "VEXTRACTF128|VEXTRACTF32x4|VEXTRACTF32x8|VEXTRACTF64x2|VEXTRACTF64x4|" \
    "VEXTRACTI128|VEXTRACTI32x4|VEXTRACTI32x8|VEXTRACTI64x2|VEXTRACTI64x4|" \
    "VFIXUPIMMPD|VFIXUPIMMPS|VFIXUPIMMSD|VFIXUPIMMSS|VFMADD132PD|VFMADD132PS|" \
    "VFMADD132SD|VFMADD132SS|VFMADD213PD|VFMADD213PS|VFMADD213SD|VFMADD213SS|" \
    "VFMADD231PD|VFMADD231PS|VFMADD231SD|VFMADD231SS|VFMADDSUB132PD|" \
    "VFMADDSUB132PS|VFMADDSUB213PD|VFMADDSUB213PS|VFMADDSUB231PD|" \
    "VFMADDSUB231PS|VFMSUB132PD|VFMSUB132PS|VFMSUB132SD|VFMSUB132SS|" \
    "VFMSUB213PD|VFMSUB213PS|VFMSUB213SD|VFMSUB213SS|VFMSUB231PD|VFMSUB231PS|" \
    "VFMSUB231SD|VFMSUB231SS|VFMSUBADD132PD|VFMSUBADD132PS|VFMSUBADD213PD|" \
    "VFMSUBADD213PS|VFMSUBADD231PD|VFMSUBADD231PS|VFNMADD132PD|VFNMADD132PS|" \
    "VFNMADD132SD|VFNMADD132SS|VFNMADD213PD|VFNMADD213PS|VFNMADD213SD|" \
    "VFNMADD213SS|VFNMADD231PD|VFNMADD231PS|VFNMADD231SD|VFNMADD231SS|" \
    "VFNMSUB132PD|VFNMSUB132PS|VFNMSUB132SD|VFNMSUB132SS|VFNMSUB213PD|" \
    "VFNMSUB213PS|VFNMSUB213SD|VFNMSUB213SS|VFNMSUB231PD|VFNMSUB231PS|" \
    "VFNMSUB231SD|VFNMSUB231SS|VFPCLASSPD|VFPCLASSPS|VFPCLASSSD|VFPCLASSSS|" \
    "VGATHERDPD|VGATHERDPS|VGATHERQPD|VGATHERQPS|VGETEXPPD|VGETEXPPS|VGETEXPSD|" \
    "VGETEXPSS|VGETMANTPD|VGETMANTPS|VGETMANTSD|VGETMANTSS|VINSERTF128|" \
    "VINSERTF32x4|VINSERTF32x8|VINSERTF64x2|VINSERTF64x4|VINSERTI128|" \
    "VINSERTI32x4|VINSERTI32x8|VINSERTI64x2|VINSERTI64x4|VMASKMOV|VMOVDQA32|" \
    "VMOVDQA64|VMOVDQU16|VMOVDQU32|VMOVDQU64|VMOVDQU8|VPBLENDD|VPBLENDMB|" \
    "VPBLENDMD|VPBLENDMQ|VPBLENDMW|VPBROADCAST|VPBROADCASTB|VPBROADCASTD|" \
    "VPBROADCASTM|VPBROADCASTQ|VPBROADCASTW|VPCMPB|VPCMPD|VPCMPQ|VPCMPUB|" \
    "VPCMPUD|VPCMPUQ|VPCMPUW|VPCMPW|VPCOMPRESSD|VPCOMPRESSQ|VPCONFLICTD|" \
    "VPCONFLICTQ|VPERM2F128|VPERM2I128|VPERMB|VPERMD|VPERMI2B|VPERMI2D|" \
    "VPERMI2PD|VPERMI2PS|VPERMI2Q|VPERMI2W|VPERMILPD|VPERMILPS|VPERMPD|VPERMPS|" \
    "VPERMQ|VPERMT2B|VPERMT2D|VPERMT2PD|VPERMT2PS|VPERMT2Q|VPERMT2W|VPERMW|" \
    "VPEXPANDD|VPEXPANDQ|VPGATHERDD|VPGATHERDQ|VPGATHERQD|VPGATHERQQ|VPLZCNTD|" \
    "VPLZCNTQ|VPMADD52HUQ|VPMADD52LUQ|VPMASKMOV|VPMOVB2M|VPMOVD2M|VPMOVDB|" \
    "VPMOVDW|VPMOVM2B|VPMOVM2D|VPMOVM2Q|VPMOVM2W|VPMOVQ2M|VPMOVQB|VPMOVQD|" \
    "VPMOVQW|VPMOVSDB|VPMOVSDW|VPMOVSQB|VPMOVSQD|VPMOVSQW|VPMOVSWB|VPMOVUSDB|" \
    "VPMOVUSDW|VPMOVUSQB|VPMOVUSQD|VPMOVUSQW|VPMOVUSWB|VPMOVW2M|VPMOVWB|" \
    "VPMULTISHIFTQB|VPROLD|VPROLQ|VPROLVD|VPROLVQ|VPRORD|VPRORQ|VPRORVD|" \
    "VPRORVQ|VPSCATTERDD|VPSCATTERDQ|VPSCATTERQD|VPSCATTERQQ|VPSLLVD|VPSLLVQ|" \
    "VPSLLVW|VPSRAVD|VPSRAVQ|VPSRAVW|VPSRLVD|VPSRLVQ|VPSRLVW|VPTERNLOGD|" \
    "VPTERNLOGQ|VPTESTMB|VPTESTMD|VPTESTMQ|VPTESTMW|VPTESTNMB|VPTESTNMD|" \
    "VPTESTNMQ|VPTESTNMW|VRANGEPD|VRANGEPS|VRANGESD|VRANGESS|VRCP14PD|VRCP14PS|" \
    "VRCP14SD|VRCP14SS|VREDUCEPD|VREDUCEPS|VREDUCESD|VREDUCESS|VRNDSCALEPD|" \
    "VRNDSCALEPS|VRNDSCALESD|VRNDSCALESS|VRSQRT14PD|VRSQRT14PS|VRSQRT14SD|" \
    "VRSQRT14SS|VSCALEFPD|VSCALEFPS|VSCALEFSD|VSCALEFSS|VSCATTERDPD|" \
    "VSCATTERDPS|VSCATTERQPD|VSCATTERQPS|VSHUFF32x4|VSHUFF64x2|VSHUFI32x4|" \
    "VSHUFI64x2|VTESTPD|VTESTPS|VZEROALL|VZEROUPPER|WAIT|WBINVD|WRFSBASE|" \
    "WRGSBASE|WRMSR|WRPKRU|XABORT|XACQUIRE|XADD|XBEGIN|XCHG|XEND|XGETBV|XLAT|" \
    "XLATB|XOR|XORPD|XORPS|XRELEASE|XRSTOR|XRSTORS|XSAVE|XSAVEC|XSAVEOPT|" \
    "XSAVES|XSETBV|XTEST"
224 
225 vector<TOKEN_DEF> g_TokenList =
226 {
227     //{ TOKEN_TYPE::WhiteSpace, R"((\s+))" },
228     { TOKEN_TYPE::WhiteSpace, R"(([ \t]+))" },
229     { TOKEN_TYPE::NewLine, R"((\n))" },
230     { TOKEN_TYPE::Comment, R"((;.*\n))" },
231     { TOKEN_TYPE::HexNumber, R"(([0-9][0-9a-f]*h))" FOLLOWED_BY(R"([\s\n\+\-\*\/,=!\]\(\)])") },
232     { TOKEN_TYPE::DecNumber, R"(([0-9]+))" FOLLOWED_BY(R"([\s\n\+\-\*\/,=!\]\(\)])") },
233     { TOKEN_TYPE::String, R"((\".*\"))" },
234 
235     { TOKEN_TYPE::BraceOpen, R"((\())"},
236     { TOKEN_TYPE::BraceClose, R"((\)))"},
237     { TOKEN_TYPE::MemRefStart, R"((\[))"},
238     { TOKEN_TYPE::MemRefEnd, R"((\]))"},
239     { TOKEN_TYPE::Colon, R"((\:))"},
240     { TOKEN_TYPE::Operator, R"(([,\+\-\*\/\:]))"},
241     { TOKEN_TYPE::StringDef, R"((<.+>))" },
242 
243     { TOKEN_TYPE::KW_include, R"((include))" FOLLOWED_BY(R"([\s])") },
244     { TOKEN_TYPE::KW_const, R"((\.const))" FOLLOWED_BY(R"([\s])") },
245     { TOKEN_TYPE::KW_code, R"((\.code))" FOLLOWED_BY(R"([\s])") },
246     { TOKEN_TYPE::KW_endprolog, R"((\.endprolog))" FOLLOWED_BY(R"([\s])") },
247     { TOKEN_TYPE::KW_ALIGN, R"((ALIGN))" FOLLOWED_BY(R"([\s])") },
248     { TOKEN_TYPE::KW_EXTERN, R"((EXTERN))" FOLLOWED_BY(R"([\s])") },
249     { TOKEN_TYPE::KW_EXTERN, R"((EXTRN))" FOLLOWED_BY(R"([\s])") },
250     { TOKEN_TYPE::KW_PUBLIC, R"((PUBLIC))" FOLLOWED_BY(R"([\s])") },
251     { TOKEN_TYPE::KW_ENDM, R"((ENDM))" FOLLOWED_BY(R"([\s\;])") },
252     { TOKEN_TYPE::KW_END, R"((END))" FOLLOWED_BY(R"([\s])") },
253     { TOKEN_TYPE::KW_if, R"((if))" FOLLOWED_BY(R"([\s])") },
254     { TOKEN_TYPE::KW_ifdef, R"((ifdef))" FOLLOWED_BY(R"([\s])")},
255     { TOKEN_TYPE::KW_ifndef, R"((ifndef))" FOLLOWED_BY(R"([\s])")},
256     { TOKEN_TYPE::KW_else, R"((else))" FOLLOWED_BY(R"([\s])")},
257     { TOKEN_TYPE::KW_endif, R"((endif))" FOLLOWED_BY(R"([\s])")},
258 
259     { TOKEN_TYPE::KW_allocstack, R"((.allocstack))" FOLLOWED_BY(R"([\s])") },
260     { TOKEN_TYPE::KW_savereg, R"((.savereg))" FOLLOWED_BY(R"([\s])") },
261     { TOKEN_TYPE::KW_savexmm128, R"((.savexmm128))" FOLLOWED_BY(R"([\s])") },
262 
263     { TOKEN_TYPE::KW_DB, R"((DB))" FOLLOWED_BY(R"([\s])") },
264     { TOKEN_TYPE::KW_DW, R"((DW))" FOLLOWED_BY(R"([\s])") },
265     { TOKEN_TYPE::KW_DD, R"((DD))" FOLLOWED_BY(R"([\s])") },
266     { TOKEN_TYPE::KW_DQ, R"((DQ))" FOLLOWED_BY(R"([\s])") },
267     { TOKEN_TYPE::KW_EQU, R"((EQU))" FOLLOWED_BY(R"([\s])") },
268     { TOKEN_TYPE::KW_TEXTEQU, R"((TEXTEQU))" FOLLOWED_BY(R"([\s])") },
269     { TOKEN_TYPE::KW_MACRO, R"((MACRO))" FOLLOWED_BY(R"([\s\;])") },
270     { TOKEN_TYPE::KW_PROC, R"((PROC))" FOLLOWED_BY(R"([\s\;])") },
271     { TOKEN_TYPE::KW_FRAME, R"((FRAME))" FOLLOWED_BY(R"([\s\;])") },
272     { TOKEN_TYPE::KW_ENDP, R"((ENDP))" FOLLOWED_BY(R"([\s\;])") },
273     { TOKEN_TYPE::KW_RECORD, R"((RECORD))" FOLLOWED_BY(R"([\s\;])") },
274     { TOKEN_TYPE::KW_MASK, R"((MASK))" FOLLOWED_BY(R"([\s\;])")},
275     { TOKEN_TYPE::KW_ERRDEF, R"((\.ERRDEF))" FOLLOWED_BY(R"([\s\;])")},
276 
277     { TOKEN_TYPE::Filename, R"(([a-z_][a-z0-9_]*\.inc))" FOLLOWED_BY(R"([\s])") },
278     { TOKEN_TYPE::Instruction, "(" INSTRUCTION ")" FOLLOWED_BY(R"([\s])") },
279     { TOKEN_TYPE::Reg8, R"((al|ah|bl|bh|cl|ch|dl|dh|sil|dil|bpl|spl|r8b|r9b|r10b|r11b|r12b|r13b|r14b|r15b))" FOLLOWED_BY(R"([\s\,])") },
280     { TOKEN_TYPE::Reg16, R"((ax|bx|cx|dx|si|di|bp|sp|r8w|r9w|r10w|r11w|r12w|r13w|r14w|r15w))" FOLLOWED_BY(R"([\s\,])") },
281     { TOKEN_TYPE::Reg32, R"((eax|ebx|ecx|edx|esi|edi|ebp|esp|r8d|r9d|r10d|r11d|r12d|r13d|r14d|r15d))" FOLLOWED_BY(R"([\s\,])") },
282     { TOKEN_TYPE::Reg64, R"((rax|rbx|rcx|rdx|rsi|rdi|rbp|rsp|r8|r9|r10|r11|r12|r13|r14|r15))" FOLLOWED_BY(R"([\s\,])") },
283     { TOKEN_TYPE::RegXmm, R"((xmm0|xmm1|xmm2|xmm3|xmm4|xmm5|xmm6|xmm7|xmm8|xmm9|xmm10|xmm11|xmm12|xmm13|xmm14|xmm15))" FOLLOWED_BY(R"([\s\,])") },
284     { TOKEN_TYPE::BYTE_PTR, R"((BYTE[\s]+PTR))" FOLLOWED_BY(R"([\s\[])") },
285     { TOKEN_TYPE::WORD_PTR, R"((WORD[\s]+PTR))" FOLLOWED_BY(R"([\s\[])") },
286     { TOKEN_TYPE::DWORD_PTR, R"((DWORD[\s]+PTR))" FOLLOWED_BY(R"([\s\[])") },
287     { TOKEN_TYPE::QWORD_PTR, R"((QWORD[\s]+PTR))" FOLLOWED_BY(R"([\s\[])") },
288     { TOKEN_TYPE::XMMWORD_PTR, R"((XMMWORD[\s]+PTR))" FOLLOWED_BY(R"([\s\[])") },
289 
290     { TOKEN_TYPE::Identifier, R"((@@))" FOLLOWED_BY(SEPARATOR)},
291     { TOKEN_TYPE::Identifier, R"((@[a-z_][a-z0-9_]*))" FOLLOWED_BY(SEPARATOR)},
292     { TOKEN_TYPE::Identifier, R"(([a-z_][a-z0-9_]*))" FOLLOWED_BY(SEPARATOR)},
293 
294 };
295 
// FIXME: use context?
// Running label counter, starting at 0.
unsigned int g_label_number = 0;

// Set by translate_instruction() while the operands of a jump/call style
// instruction are being translated; translate_instruction_param() reads it
// to suppress the "[rip]" suffix on identifiers.
bool g_processing_jmp = false;
300 
// Classification stored with every registered identifier (see IDENTIFIER).
// Within this part of the file only Memory is assigned and tested
// (add_mem_id / is_mem_id).
enum class IDTYPE
{
    Memory,
    Register,
    Label,
    Constant,
    Macro,
    Instruction,
    String,
    Unknown
};
312 
// A registered identifier: its text as it appeared in the source and the
// category it was registered under.
struct IDENTIFIER
{
    string Name;
    IDTYPE Type;
};

// All identifiers registered so far, in registration order. Entries are only
// appended (add_identifier) and searched linearly (is_mem_id).
vector<IDENTIFIER> g_identifiers;
320 
321 static
322 void
add_identifier(Token & tok,IDTYPE type)323 add_identifier(Token& tok, IDTYPE type)
324 {
325     g_identifiers.push_back(IDENTIFIER{ tok.str(), type });
326     //fprintf(stderr, "Added id: '%s'\n", tok.str().c_str());
327 }
328 
329 void
add_mem_id(Token & tok)330 add_mem_id(Token& tok)
331 {
332     add_identifier(tok, IDTYPE::Memory);
333 }
334 
335 bool
is_mem_id(Token & tok)336 is_mem_id(Token& tok)
337 {
338     for (IDENTIFIER& identifier : g_identifiers)
339     {
340         if (identifier.Name == tok.str())
341         {
342             return identifier.Type == IDTYPE::Memory;
343         }
344     }
345 
346     return true;
347 }
348 
// Case-insensitive string comparison.
// Returns true when 'a' and 'b' have the same length and every character
// pair is equal after tolower(). Characters are converted to unsigned char
// first, because passing a negative char value to tolower() is undefined.
bool
iequals(const string &a, const string &b)
{
    const size_t sz = a.size();
    if (b.size() != sz)
        return false;

    for (size_t i = 0; i < sz; ++i)
    {
        const int lhs = tolower(static_cast<unsigned char>(a[i]));
        const int rhs = tolower(static_cast<unsigned char>(b[i]));
        if (lhs != rhs)
            return false;
    }

    return true;
}
360 
361 Token
get_expected_token(Token && tok,TOKEN_TYPE type)362 get_expected_token(Token&& tok, TOKEN_TYPE type)
363 {
364     if (tok.type() != type)
365     {
366         throw "Not white space after identifier!\n";
367     }
368 
369     return tok;
370 }
371 
get_ws(Token && tok)372 Token get_ws(Token&& tok)
373 {
374     int type = tok.type();
375     if (type != TOKEN_TYPE::WhiteSpace)
376     {
377         throw "Not white space after identifier!\n";
378     }
379 
380     return tok;
381 }
382 
get_ws_or_nl(Token && tok)383 Token get_ws_or_nl(Token&& tok)
384 {
385     int type = tok.type();
386     if ((type != TOKEN_TYPE::WhiteSpace) &&
387         (type != TOKEN_TYPE::NewLine))
388     {
389         throw "Not white space after identifier!\n";
390     }
391 
392     return tok;
393 }
394 
// Returns true when 'str' is exactly equal (case-sensitive) to one of the
// entries of 'list'. Both arguments are taken by const reference so the
// list is not copied on every lookup.
bool is_string_in_list(const vector<string>& list, const string& str)
{
    for (const string &candidate : list)
    {
        if (candidate == str)
        {
            return true;
        }
    }

    return false;
}
407 
408 size_t
translate_token(TokenList & tokens,size_t index,const vector<string> & macro_params)409 translate_token(TokenList& tokens, size_t index, const vector<string> &macro_params)
410 {
411     Token tok = tokens[index];
412     switch (tok.type())
413     {
414         case TOKEN_TYPE::Comment:
415             printf("//%s", tok.str().c_str() + 1);
416             break;
417 
418         case TOKEN_TYPE::DecNumber:
419         {
420             unsigned long long num = stoull(tok.str(), nullptr, 10);
421             printf("%llu", num);
422             break;
423         }
424 
425         case TOKEN_TYPE::HexNumber:
426         {
427             string number = tok.str();
428             printf("0x%s", number.substr(0, number.size() - 1).c_str());
429             break;
430         }
431 
432         case TOKEN_TYPE::Identifier:
433             if (is_string_in_list(macro_params, tok.str()))
434             {
435                 printf("\\");
436             }
437             printf("%s", tok.str().c_str());
438             break;
439 
440         // We migt want to improve these
441         case TOKEN_TYPE::BYTE_PTR:
442         case TOKEN_TYPE::WORD_PTR:
443         case TOKEN_TYPE::DWORD_PTR:
444         case TOKEN_TYPE::QWORD_PTR:
445         case TOKEN_TYPE::XMMWORD_PTR:
446 
447         // Check these. valid only in instructions?
448         case TOKEN_TYPE::Reg8:
449         case TOKEN_TYPE::Reg16:
450         case TOKEN_TYPE::Reg32:
451         case TOKEN_TYPE::Reg64:
452         case TOKEN_TYPE::RegXmm:
453         case TOKEN_TYPE::Instruction:
454 
455         case TOKEN_TYPE::WhiteSpace:
456         case TOKEN_TYPE::NewLine:
457         case TOKEN_TYPE::Operator:
458             printf("%s", tok.str().c_str());
459             break;
460 
461         default:
462             printf("%s", tok.str().c_str());
463             break;
464     }
465 
466     return index + 1;
467 }
468 
complete_line(TokenList & tokens,size_t index,const vector<string> & macro_params)469 size_t complete_line(TokenList &tokens, size_t index, const vector<string> &macro_params)
470 {
471     while (index < tokens.size())
472     {
473         Token tok = tokens[index];
474         index = translate_token(tokens, index, macro_params);
475         if ((tok.type() == TOKEN_TYPE::NewLine) ||
476             (tok.type() == TOKEN_TYPE::Comment))
477         {
478             break;
479         }
480     }
481 
482     return index;
483 }
484 
485 size_t
translate_expression(TokenList & tokens,size_t index,const vector<string> & macro_params)486 translate_expression(TokenList &tokens, size_t index, const vector<string> &macro_params)
487 {
488     while (index < tokens.size())
489     {
490         Token tok = tokens[index];
491         switch (tok.type())
492         {
493             case TOKEN_TYPE::NewLine:
494             case TOKEN_TYPE::Comment:
495                 return index;
496 
497             case TOKEN_TYPE::KW_MASK:
498                 printf("MASK_");
499                 index += 2;
500                 break;
501 
502             case TOKEN_TYPE::Instruction:
503                 if (iequals(tok.str(), "and"))
504                 {
505                     printf("&");
506                     index += 1;
507                 }
508                 else if (iequals(tok.str(), "or"))
509                 {
510                     printf("|");
511                     index += 1;
512                 }
513                 else if (iequals(tok.str(), "shl"))
514                 {
515                     printf("<<");
516                     index += 1;
517                 }
518                 else if (iequals(tok.str(), "not"))
519                 {
520                     printf("!");
521                     index += 1;
522                 }
523                 else
524                 {
525                     throw "Invalid expression";
526                 }
527                 break;
528 
529             case TOKEN_TYPE::Operator:
530                 if (tok.str() == ",")
531                 {
532                     return index;
533                 }
534             case TOKEN_TYPE::WhiteSpace:
535             case TOKEN_TYPE::BraceOpen:
536             case TOKEN_TYPE::BraceClose:
537             case TOKEN_TYPE::DecNumber:
538             case TOKEN_TYPE::HexNumber:
539             case TOKEN_TYPE::Identifier:
540                 index = translate_token(tokens, index, macro_params);
541                 break;
542 
543             default:
544                 index = translate_token(tokens, index, macro_params);
545         }
546     }
547 
548     return index;
549 }
550 
translate_mem_ref(TokenList & tokens,size_t index,const vector<string> & macro_params)551 size_t translate_mem_ref(TokenList& tokens, size_t index, const vector<string>& macro_params)
552 {
553     unsigned int offset = 0;
554 
555     Token tok = tokens[index];
556 
557     if ((tok.type() == TOKEN_TYPE::DecNumber) ||
558         (tok.type() == TOKEN_TYPE::HexNumber))
559     {
560         offset = stoi(tok.str(), nullptr, 0);
561         index += 2;
562     }
563 
564     index = translate_token(tokens, index, macro_params);
565 
566     while (index < tokens.size())
567     {
568         Token tok = tokens[index];
569         index = translate_token(tokens, index, macro_params);
570         if (tok.type() == TOKEN_TYPE::MemRefEnd)
571         {
572             if (offset != 0)
573             {
574                 printf(" + %u", offset);
575             }
576             return index;
577         }
578     }
579 
580     throw "Failed to translate memory ref";
581     return index;
582 }
583 
translate_instruction_param(TokenList & tokens,size_t index,const vector<string> & macro_params)584 size_t translate_instruction_param(TokenList& tokens, size_t index, const vector<string>& macro_params)
585 {
586     switch (tokens[index].type())
587     {
588         case TOKEN_TYPE::BYTE_PTR:
589         case TOKEN_TYPE::WORD_PTR:
590         case TOKEN_TYPE::DWORD_PTR:
591         case TOKEN_TYPE::QWORD_PTR:
592         case TOKEN_TYPE::XMMWORD_PTR:
593             index = translate_token(tokens, index, macro_params);
594 
595             // Optional whitespace
596             if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
597             {
598                 index = translate_token(tokens, index, macro_params);
599             }
600     }
601 
602     while (index < tokens.size())
603     {
604         Token tok = tokens[index];
605         switch (tok.type())
606         {
607             case TOKEN_TYPE::MemRefStart:
608                 return translate_mem_ref(tokens, index, macro_params);
609 
610             case TOKEN_TYPE::NewLine:
611             case TOKEN_TYPE::Comment:
612                 return index;
613 
614             case TOKEN_TYPE::Operator:
615                 if (tok.str() == ",")
616                     return index;
617                 return translate_token(tokens, index, macro_params);
618 
619             case TOKEN_TYPE::Identifier:
620                 index = translate_token(tokens, index, macro_params);
621                 if (is_mem_id(tok) &&
622                     !is_string_in_list(macro_params, tok.str()) &&
623                     !g_processing_jmp)
624                 {
625                     printf("[rip]");
626                 }
627                 break;
628 
629             default:
630                 index = translate_expression(tokens, index, macro_params);
631         }
632     }
633 
634     return index;
635 }
636 
637 static
638 bool
is_jmp_or_call(const Token & tok)639 is_jmp_or_call(const Token& tok)
640 {
641     const char* inst_list[] = {
642         "jmp", "call", "ja", "jae", "jb", "jbe", "jc", "jcxz", "je", "jecxz", "jg", "jge",
643         "jl", "jle", "jna", "jnae", "jnb", "jnbe", "jnc", "jne", "jng", "jnge", "jnl", "jnle",
644         "jno", "jnp", "jns", "jnz", "jo", "jp", "jpe", "jpo", "jrcxz", "js", "jz", "loop", "loope",
645         "loopne", "loopnz", "loopz"
646     };
647 
648     for (const char* inst : inst_list)
649     {
650         if (iequals(tok.str(), inst))
651         {
652             return true;
653         }
654     }
655 
656     return false;
657 }
658 
translate_instruction(TokenList & tokens,size_t index,const vector<string> & macro_params)659 size_t translate_instruction(TokenList& tokens, size_t index, const vector<string>& macro_params)
660 {
661     // Check for jump/call instructions
662     if (is_jmp_or_call(tokens[index]))
663     {
664         g_processing_jmp = true;
665     }
666 
667     // Translate the instruction itself
668     index = translate_token(tokens, index, macro_params);
669 
670     // Handle instruction parameters
671     while (index < tokens.size())
672     {
673         // Optional whitespace
674         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
675         {
676             index = translate_token(tokens, index, macro_params);
677         }
678 
679         // Check for parameters
680         Token tok = tokens[index];
681         switch (tok.type())
682         {
683             case TOKEN_TYPE::Comment:
684             case TOKEN_TYPE::NewLine:
685                 g_processing_jmp = false;
686                 return index;
687 
688             case TOKEN_TYPE::WhiteSpace:
689             case TOKEN_TYPE::Operator:
690                 index = translate_token(tokens, index, macro_params);
691                 break;
692 
693             default:
694                 index = translate_instruction_param(tokens, index, macro_params);
695                 break;
696         }
697     }
698 
699     g_processing_jmp = false;
700     return index;
701 }
702 
translate_item(TokenList & tokens,size_t index,const vector<string> & macro_params)703 size_t translate_item(TokenList& tokens, size_t index, const vector<string> &macro_params)
704 {
705     switch (tokens[index].type())
706     {
707         case TOKEN_TYPE::DecNumber:
708         case TOKEN_TYPE::HexNumber:
709         case TOKEN_TYPE::String:
710         case TOKEN_TYPE::WhiteSpace:
711             return translate_token(tokens, index, macro_params);
712     }
713 
714     throw "Failed to translate item";
715     return -1;
716 }
717 
translate_list(TokenList & tokens,size_t index,const vector<string> & macro_params)718 size_t translate_list(TokenList& tokens, size_t index, const vector<string> &macro_params)
719 {
720     while (index < tokens.size())
721     {
722         // The item itself
723         index = translate_item(tokens, index, macro_params);
724 
725         // Optional white space
726         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
727         {
728             index = translate_token(tokens, index, macro_params);
729         }
730 
731         // End of list?
732         if ((tokens[index].type() == TOKEN_TYPE::Comment) ||
733             (tokens[index].type() == TOKEN_TYPE::NewLine))
734         {
735             return index;
736         }
737 
738         // We expect a comma here
739         if ((tokens[index].type() != TOKEN_TYPE::Operator) ||
740             (tokens[index].str() != ","))
741         {
742             throw "Unexpected end of list";
743         }
744 
745         index = translate_token(tokens, index, macro_params);
746         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
747         {
748             index = translate_token(tokens, index, macro_params);
749         }
750     }
751 
752     throw "Failed to translate list";
753     return -1;
754 }
755 
756 size_t
translate_data_def(TokenList & tokens,size_t index,const vector<string> & macro_params)757 translate_data_def(TokenList& tokens, size_t index, const vector<string>& macro_params)
758 {
759     Token tok = tokens[index];
760     Token tok1 = get_ws(tokens[index + 1]);
761     string directive, need, have ="";
762 
763     switch (tok.type())
764     {
765         case TOKEN_TYPE::KW_DB:
766             directive = ".byte";
767             break;
768 
769         case TOKEN_TYPE::KW_DW:
770             directive = ".short";
771             break;
772 
773         case TOKEN_TYPE::KW_DD:
774             directive = ".long";
775             break;
776 
777         case TOKEN_TYPE::KW_DQ:
778             directive = ".quad";
779             break;
780     }
781 
782     index += 2;
783 
784     while (index < tokens.size())
785     {
786         // Check if we need '.ascii' for ASCII strings
787         if (tokens[index].str()[0] == '\"')
788         {
789             need = ".ascii";
790         }
791         else
792         {
793             need = directive;
794         }
795 
796         // Output the directive we need (or a comma)
797         if (have == "")
798         {
799             printf("%s ", need.c_str());
800         }
801         else if (have != need)
802         {
803             printf("\n%s ", need.c_str());
804         }
805         else
806         {
807             printf(", ");
808         }
809 
810         have = need;
811 
812         // The item itself
813         index = translate_item(tokens, index, macro_params);
814 
815         // Optional white space
816         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
817         {
818             index = translate_token(tokens, index, macro_params);
819         }
820 
821         // End of list?
822         if ((tokens[index].type() == TOKEN_TYPE::Comment) ||
823             (tokens[index].type() == TOKEN_TYPE::NewLine))
824         {
825             return index;
826         }
827 
828         // We expect a comma here
829         if ((tokens[index].type() != TOKEN_TYPE::Operator) ||
830             (tokens[index].str() != ","))
831         {
832             throw "Unexpected end of list";
833         }
834 
835         // Skip comma and optional white-space
836         index++;
837         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
838         {
839             index++;
840         }
841     }
842 
843     throw "Failed to translate list";
844     return -1;
845 }
846 
847 size_t
translate_construct_one_param(string translated,TokenList & tokens,size_t index,const vector<string> & macro_params)848 translate_construct_one_param(string translated, TokenList& tokens, size_t index, const vector<string>& macro_params)
849 {
850     // The next token should be white space
851     Token tok1 = get_ws(tokens[index + 1]);
852 
853     printf("%s%s", translated.c_str(), tok1.str().c_str());
854     return translate_expression(tokens, index + 2, macro_params);
855 }
856 
857 size_t
translate_record(TokenList & tokens,size_t index,const vector<string> & macro_params)858 translate_record(TokenList &tokens, size_t index, const vector<string> &macro_params)
859 {
860     unsigned int bits, bitpos = 0;
861     unsigned long long oldmask = 0, mask = 0;
862 
863     Token tok_name = get_expected_token(tokens[index], TOKEN_TYPE::Identifier);
864     index += 4;
865     while (index < tokens.size())
866     {
867         Token tok_member = get_expected_token(tokens[index++], TOKEN_TYPE::Identifier);
868 
869         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
870         {
871             index++;
872         }
873 
874         if (tokens[index++].str() != ":")
875         {
876             throw "Unexpected token";
877         }
878 
879         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
880         {
881             index++;
882         }
883 
884         Token tok_bits = tokens[index++];
885         if ((tok_bits.type() != TOKEN_TYPE::DecNumber) &&
886             (tok_bits.type() != TOKEN_TYPE::HexNumber))
887         {
888             throw "Unexpected token";
889         }
890 
891         bits = stoi(tok_bits.str(), nullptr, 0);
892 
893         printf("%s = %u\n", tok_member.str().c_str(), bitpos);
894 
895         oldmask = (1ULL << bitpos) - 1;
896         bitpos += bits;
897         mask = (1ULL << bitpos) - 1 - oldmask;
898         printf("MASK_%s = 0x%llx\n", tok_member.str().c_str(), mask);
899 
900         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
901         {
902             index++;
903         }
904 
905         if ((tokens[index].type() == TOKEN_TYPE::NewLine) ||
906             (tokens[index].type() == TOKEN_TYPE::Comment))
907         {
908             break;
909         }
910 
911         if (tokens[index].str() != ",")
912         {
913             throw "unexpected token";
914         }
915 
916         index++;
917         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
918         {
919             index++;
920         }
921 
922         if ((tokens[index].type() == TOKEN_TYPE::NewLine) ||
923             (tokens[index].type() == TOKEN_TYPE::Comment))
924         {
925             index++;
926         }
927 
928         if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
929         {
930             index++;
931         }
932     }
933 
934     return index;
935 }
936 
937 size_t
translate_identifier_construct(TokenList & tokens,size_t index,const vector<string> & macro_params)938 translate_identifier_construct(TokenList& tokens, size_t index, const vector<string> &macro_params)
939 {
940     Token tok = tokens[index];
941     Token tok1 = tokens[index + 1];
942 
943     if (tok1.type() == TOKEN_TYPE::Colon)
944     {
945         if (tok.str() == "@@")
946         {
947             g_label_number++;
948             printf("%u:", g_label_number);
949         }
950         else
951         {
952             printf("%s:", tok.str().c_str());
953         }
954         add_identifier(tok, IDTYPE::Label);
955         return index + 2;
956     }
957 
958     Token tok2 = tokens[index + 2];
959 
960     switch (tok2.type())
961     {
962         case TOKEN_TYPE::KW_MACRO:
963             throw "Cannot have a nested macro!";
964 
965         case TOKEN_TYPE::KW_DB:
966         case TOKEN_TYPE::KW_DW:
967         case TOKEN_TYPE::KW_DD:
968         case TOKEN_TYPE::KW_DQ:
969             printf("%s:%s", tok.str().c_str(), tok1.str().c_str());
970             add_mem_id(tok);
971             return translate_data_def(tokens, index + 2, macro_params);
972 
973         case TOKEN_TYPE::KW_EQU:
974             //printf("%s%s", tok.str().c_str(), tok1.str().c_str());
975             printf("#define %s ", tok.str().c_str());
976             add_identifier(tok, IDTYPE::Constant);
977             return translate_expression(tokens, index + 3, macro_params);
978 
979         case TOKEN_TYPE::KW_TEXTEQU:
980         {
981             Token tok3 = get_ws(tokens[index + 3]);
982             Token tok4 = get_expected_token(tokens[index + 4], TOKEN_TYPE::StringDef);
983 
984             string textdef = tok4.str();
985             printf("#define %s %s", tok.str().c_str(), textdef.substr(1, textdef.size() - 2).c_str());
986             add_identifier(tok, IDTYPE::Constant);
987             return index + 5;
988         }
989 
990         case TOKEN_TYPE::KW_PROC:
991         {
992             printf(".func %s\n", tok.str().c_str());
993             printf("%s:", tok.str().c_str());
994             index += 3;
995 
996             if ((tokens[index].type() == TOKEN_TYPE::WhiteSpace) &&
997                 (tokens[index + 1].type() == TOKEN_TYPE::KW_FRAME))
998             {
999 #ifdef TARGET_amd64
1000                 printf("\n.seh_proc %s\n", tok.str().c_str());
1001 #else
1002                 printf("\n.cfi_startproc\n");
1003 #endif
1004                 index += 2;
1005             }
1006             add_identifier(tok, IDTYPE::Label);
1007             break;
1008         }
1009 
1010         case TOKEN_TYPE::KW_ENDP:
1011         {
1012             printf(".seh_endproc\n.endfunc");
1013             index += 3;
1014             break;
1015         }
1016 
1017         case TOKEN_TYPE::KW_RECORD:
1018             index = translate_record(tokens, index, macro_params);
1019             break;
1020 
1021         default:
1022             // We don't know what it is, assume it's a macro and treat it like an instruction
1023             index = translate_instruction(tokens, index, macro_params);
1024             break;
1025     }
1026 
1027     return index;
1028 }
1029 
1030 size_t
translate_construct(TokenList & tokens,size_t index,const vector<string> & macro_params)1031 translate_construct(TokenList& tokens, size_t index, const vector<string> &macro_params)
1032 {
1033     Token tok = tokens[index];
1034 
1035     switch (tok.type())
1036     {
1037         case TOKEN_TYPE::WhiteSpace:
1038         case TOKEN_TYPE::NewLine:
1039         case TOKEN_TYPE::Comment:
1040             return translate_token(tokens, index, macro_params);
1041 
1042         case TOKEN_TYPE::Identifier:
1043             return translate_identifier_construct(tokens, index, macro_params);
1044 
1045         case TOKEN_TYPE::KW_ALIGN:
1046             index = translate_construct_one_param(".align", tokens, index, macro_params);
1047             break;
1048 
1049         case TOKEN_TYPE::KW_allocstack:
1050             index = translate_construct_one_param(".seh_stackalloc", tokens, index, macro_params);
1051             break;
1052 
1053         case TOKEN_TYPE::KW_code:
1054 #ifdef TARGET_amd64
1055             printf(".code64");
1056 #else
1057             printf(".code");
1058 #endif
1059             printf(" .intel_syntax noprefix");
1060             index++;
1061             break;
1062 
1063         case TOKEN_TYPE::KW_const:
1064             printf(".section .rdata");
1065             index++;
1066             break;
1067 
1068         case TOKEN_TYPE::KW_DB:
1069         case TOKEN_TYPE::KW_DW:
1070         case TOKEN_TYPE::KW_DD:
1071         case TOKEN_TYPE::KW_DQ:
1072             return translate_data_def(tokens, index, macro_params);
1073 
1074         case TOKEN_TYPE::KW_END:
1075             printf("// END\n");
1076             return tokens.size();
1077 
1078         case TOKEN_TYPE::KW_endprolog:
1079             printf(".seh_endprologue");
1080             index++;
1081             break;
1082 
1083         case TOKEN_TYPE::KW_EXTERN:
1084         {
1085             Token tok1 = get_ws_or_nl(tokens[index + 1]);
1086             Token tok2 = get_expected_token(tokens[index + 2], TOKEN_TYPE::Identifier);
1087             add_mem_id(tok2);
1088             printf("//");
1089             return complete_line(tokens, index, macro_params);
1090         }
1091 
1092         case TOKEN_TYPE::KW_if:
1093         case TOKEN_TYPE::KW_ifdef:
1094         case TOKEN_TYPE::KW_ifndef:
1095         case TOKEN_TYPE::KW_else:
1096         case TOKEN_TYPE::KW_endif:
1097             // TODO: handle parameter differences between "if" and ".if" etc.
1098             printf(".");
1099             return complete_line(tokens, index, macro_params);
1100 
1101         case TOKEN_TYPE::KW_include:
1102         {
1103             // The next token should be white space
1104             Token tok1 = get_ws_or_nl(tokens[index + 1]);
1105             Token tok2 = get_expected_token(tokens[index + 2], TOKEN_TYPE::Filename);
1106             printf("#include \"%s.h\"", tok2.str().c_str());
1107             index += 3;
1108             break;
1109         }
1110 
1111         case TOKEN_TYPE::KW_PUBLIC:
1112             index = translate_construct_one_param(".global", tokens, index, macro_params);
1113             break;
1114 
1115         case TOKEN_TYPE::KW_savereg:
1116             printf(".seh_savereg");
1117             return complete_line(tokens, index + 1, macro_params);
1118 
1119         case TOKEN_TYPE::KW_savexmm128:
1120             printf(".seh_savexmm");
1121             return complete_line(tokens, index + 1, macro_params);
1122 
1123         case TOKEN_TYPE::Instruction:
1124             index = translate_instruction(tokens, index, macro_params);
1125             break;
1126 
1127         case TOKEN_TYPE::KW_ERRDEF:
1128             printf("//");
1129             return complete_line(tokens, index, macro_params);
1130 
1131         default:
1132             throw "failed to translate construct";
1133     }
1134 
1135     // Skip optional white-space
1136     if (tokens[index].type() == TOKEN_TYPE::WhiteSpace)
1137     {
1138         index++;
1139     }
1140 
1141     // Line should end here!
1142     Token end = tokens[index];
1143     if ((end.type() != TOKEN_TYPE::Comment) &&
1144         (end.type() != TOKEN_TYPE::NewLine))
1145     {
1146         throw "unexpected tokens";
1147     }
1148 
1149     return index;
1150 }
1151 
1152 size_t
translate_macro(TokenList & tokens,size_t index)1153 translate_macro(TokenList& tokens, size_t index)
1154 {
1155     vector<string> macro_params;
1156 
1157     printf(".macro %s", tokens[index].str().c_str());
1158 
1159     // Parse marameters
1160     index += 3;
1161     while (index < tokens.size())
1162     {
1163         Token tok = tokens[index];
1164         switch (tok.type())
1165         {
1166             case TOKEN_TYPE::NewLine:
1167             case TOKEN_TYPE::Comment:
1168                 index = translate_token(tokens, index, macro_params);
1169                 break;
1170 
1171             case TOKEN_TYPE::Identifier:
1172                 macro_params.push_back(tok.str());
1173                 printf("%s", tok.str().c_str());
1174                 index++;
1175                 continue;
1176 
1177             case TOKEN_TYPE::WhiteSpace:
1178             case TOKEN_TYPE::Operator:
1179                 index = translate_token(tokens, index, macro_params);
1180                 continue;
1181         }
1182 
1183         break;
1184     }
1185 
1186     // Parse content
1187     while (index < tokens.size())
1188     {
1189         Token tok = tokens[index];
1190         switch (tok.type())
1191         {
1192             case TOKEN_TYPE::KW_ENDM:
1193                 printf(".endm");
1194                 return index + 1;
1195 
1196             default:
1197                 index = translate_construct(tokens, index, macro_params);
1198         }
1199     }
1200 
1201     throw "Failed to translate macro";
1202     return -1;
1203 }
1204 
1205 void
translate(TokenList & tokens)1206 translate(TokenList &tokens)
1207 {
1208     size_t index = 0;
1209     size_t size = tokens.size();
1210     vector<string> empty_macro_params;
1211 
1212     while (index < size)
1213     {
1214         // Macros are special
1215         if ((tokens[index].type() == TOKEN_TYPE::Identifier) &&
1216             (tokens[index + 1].type() == TOKEN_TYPE::WhiteSpace) &&
1217             (tokens[index + 2].type() == TOKEN_TYPE::KW_MACRO))
1218         {
1219             index = translate_macro(tokens, index);
1220         }
1221         else
1222         {
1223             index = translate_construct(tokens, index, empty_macro_params);
1224         }
1225     }
1226 }
1227 
main(int argc,char * argv[])1228 int main(int argc, char* argv[])
1229 {
1230     if (argc < 2)
1231     {
1232         fprintf(stderr, "Invalid parameter!\n");
1233         return -1;
1234     }
1235 
1236 #if PROFILING_ENABLED
1237     time_t start_time = time(NULL);
1238 #endif
1239 
1240     try
1241     {
1242         // Open and read the input file
1243         string filename(argv[1]);
1244         ifstream file(filename);
1245         stringstream buffer;
1246         buffer << file.rdbuf();
1247         string text = buffer.str();
1248 
1249         // Create the tokenizer
1250         Tokenizer tokenizer(g_TokenList);
1251 
1252         // Get a token list
1253         TokenList toklist(tokenizer, text);
1254 
1255         // Now translate the tokens
1256         translate(toklist);
1257     }
1258     catch (const char* message)
1259     {
1260         fprintf(stderr, "Exception caught: '%s'\n", message);
1261         return -2;
1262     }
1263 
1264 #if PROFILING_ENABLED
1265     time_t total_time = time(NULL) + 1 - start_time;
1266     fprintf(stderr, "total_time = %llu\n", total_time);
1267     fprintf(stderr, "search_time = %llu\n", search_time);
1268     fprintf(stderr, "search: %llu %%\n", search_time * 100 / total_time);
1269 #endif
1270 
1271     return 0;
1272 }
1273