; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
; version for AMD64 on Windows using Microsoft C compiler
;
; inffasx64.asm was automatically converted from the AMD64 portion of inffas86.c
; inffasx64.asm is called by inffas8664.c, which contains more info.
;
; Syntax: MASM (ml64), Intel operand order.  ABI: Microsoft x64.
;
; to compile this file, I use option
;   ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
;   with Microsoft Macro Assembler (x64) for AMD64
;
;   ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK
;
;   (you can get Windows 2003 server DDK with ml64 and cl.exe for AMD64 from
;      http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)
;

;-----------------------------------------------------------------------
; Byte offsets into the argument structure ("ar") whose address is
; passed in rcx.  The names follow the per-field comments of the code
; that uses them; the layout itself is defined by the C caller and must
; be kept in sync with it (NOTE(review): verify against inffas8664.c).
;-----------------------------------------------------------------------
AR_SAVED_RSP    EQU     0       ; qword: caller's rsp at entry (written here)
AR_SAVED_RBP    EQU     8       ; qword: caller's rbp at entry (written here)
AR_IN           EQU     16      ; qword: in    - input read pointer
AR_LAST         EQU     24      ; qword: last  - input limit for the fast loop
AR_OUT          EQU     32      ; qword: out   - output write pointer
AR_BEG          EQU     40      ; qword: beg   - start of output for this call
AR_END          EQU     48      ; qword: end   - output limit for the fast loop
AR_WINDOW       EQU     56      ; qword: window base address
AR_LCODE        EQU     64      ; qword: literal/length code table
AR_DCODE        EQU     72      ; qword: distance code table
AR_HOLD         EQU     80      ; qword: bit accumulator
AR_BITS         EQU     88      ; dword: number of valid bits in hold
AR_WSIZE        EQU     92      ; dword: window size
AR_WRITE        EQU     96      ; dword: window write index
AR_LMASK        EQU     100     ; dword: mask for first-level lcode index
AR_DMASK        EQU     104     ; dword: mask for first-level dcode index
AR_STATUS       EQU     116     ; dword: exit status written by this routine

; Values stored to AR_STATUS on exit.
ST_LOOP_DONE        EQU 0       ; in/out limit reached, nothing special
ST_END_OF_BLOCK     EQU 1       ; length code had op bit 32 set
ST_BAD_LITLEN       EQU 2       ; length code had op bit 64 set without bit 32
ST_BAD_DIST         EQU 3       ; distance code had op bit 64 set
ST_DIST_TOO_FAR     EQU 4       ; distance larger than the window size

.code

;-----------------------------------------------------------------------
; inffas8664fnc
;
; In:    rcx = address of the argument structure (see AR_* above)
; Out:   in, out, hold, bits and status are written back to the structure
; Saves: rbp, rbx, rsi, rdi, r12-r15 (pushed, declared via .pushreg)
; Clobb: rax, rcx, rdx, r8, r9, r10, r11, flags; DF is cleared
;
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
;
; Windows x64 has no red zone: nothing may be kept below rsp, and rsp
; must stay a valid stack pointer.  The non-volatile registers are
; therefore pushed (with unwind data), and the structure is addressed
; through r10 instead of through rsp.  Because r10 is taken, the "end"
; limit is compared directly from the structure.
;
; Register roles inside the loop:
;   rsi  = in        rdi  = out       r9   = last      r10 = &ar
;   rbp  = lcode     r11  = dcode     rdx  = hold      bl  = bits
;   r12d = lmask     r13d = dmask     r14d = len       r15d = dist
;   r8   = scratch (table index / saved in)   rax, rcx = scratch
;-----------------------------------------------------------------------
inffas8664fnc PROC FRAME

        push    rbp
        .pushreg rbp
        push    rbx
        .pushreg rbx
        push    rsi
        .pushreg rsi
        push    rdi
        .pushreg rdi
        push    r12
        .pushreg r12
        push    r13
        .pushreg r13
        push    r14
        .pushreg r14
        push    r15
        .pushreg r15
        .endprolog

        mov     r10, rcx                        ; r10 = &ar for the whole routine

        ; Earlier revisions parked the caller's rsp/rbp in the first two
        ; slots of ar.  Keep storing the same values so the structure
        ; contents seen by the caller do not change.
        lea     rax, [rsp+8*8]                  ; rsp as it was at entry (8 pushes)
        mov     [r10+AR_SAVED_RBP], rbp         ; rbp still holds the caller's value
        mov     [r10+AR_SAVED_RSP], rax

        mov     rsi, [r10+AR_IN]                ; rsi  = in
        mov     rdi, [r10+AR_OUT]               ; rdi  = out
        mov     r9, [r10+AR_LAST]               ; r9   = last
        mov     rbp, [r10+AR_LCODE]             ; rbp  = lcode
        mov     r11, [r10+AR_DCODE]             ; r11  = dcode
        mov     rdx, [r10+AR_HOLD]              ; rdx  = hold
        mov     ebx, dword ptr [r10+AR_BITS]    ; ebx  = bits
        mov     r12d, dword ptr [r10+AR_LMASK]  ; r12d = lmask
        mov     r13d, dword ptr [r10+AR_DMASK]  ; r13d = dmask
                                                ; r14d = len  (set later)
                                                ; r15d = dist (set later)

        cld                                     ; string ops must move forward
        cmp     qword ptr [r10+AR_END], rdi
        je      L_one_time                      ; if only one decode left
        cmp     r9, rsi
        jne     L_do_loop

; Entered when out == end or in == last: decode a single symbol, then
; let L_while_test terminate the loop.
L_one_time:
        mov     r8, r12                         ; r8 = lmask
        cmp     bl, 32
        ja      L_get_length_code_one_time      ; enough bits already

        lodsd                                   ; eax = *(uint *)in++
        mov     cl, bl                          ; cl = bits, needed for the shift
        add     bl, 32                          ; bits += 32
        shl     rax, cl
        or      rdx, rax                        ; hold |= new 32 bits << old bits
        jmp     L_get_length_code_one_time

ALIGN 4
L_while_test:
        cmp     qword ptr [r10+AR_END], rdi
        jbe     L_break_loop                    ; stop when end <= out
        cmp     r9, rsi
        jbe     L_break_loop                    ; stop when last <= in

L_do_loop:
        mov     r8, r12                         ; r8 = lmask
        cmp     bl, 32
        ja      L_get_length_code               ; if (32 < bits)

        lodsd                                   ; eax = *(uint *)in++
        mov     cl, bl                          ; cl = bits, needed for the shift
        add     bl, 32                          ; bits += 32
        shl     rax, cl
        or      rdx, rax                        ; hold |= new 32 bits << old bits

; Table entries are dwords: al = op, ah = bits, upper 16 bits = val.
L_get_length_code:
        and     r8, rdx                         ; r8 = hold & lmask
        mov     eax, [rbp+r8*4]                 ; eax = lcode[hold & lmask]

        mov     cl, ah                          ; cl = this.bits
        sub     bl, ah                          ; bits -= this.bits
        shr     rdx, cl                         ; hold >>= this.bits

        test    al, al
        jnz     L_test_for_length_base          ; if (op != 0) 45.7%

        mov     r8, r12                         ; r8 = lmask, for the next lookup
        shr     eax, 16                         ; al = this.val (literal byte)
        stosb                                   ; *out++ = literal

; Second literal/length lookup without refilling hold or re-testing limits.
L_get_length_code_one_time:
        and     r8, rdx                         ; r8 = hold & lmask
        mov     eax, [rbp+r8*4]                 ; eax = lcode[hold & lmask]

L_dolen:
        mov     cl, ah                          ; cl = this.bits
        sub     bl, ah                          ; bits -= this.bits
        shr     rdx, cl                         ; hold >>= this.bits

        test    al, al
        jnz     L_test_for_length_base          ; if (op != 0) 45.7%

        shr     eax, 16                         ; al = this.val (literal byte)
        stosb                                   ; *out++ = literal
        jmp     L_while_test

ALIGN 4
L_test_for_length_base:
        mov     r14d, eax                       ; len = this
        shr     r14d, 16                        ; len = this.val
        mov     cl, al                          ; cl = op

        test    al, 16
        jz      L_test_for_second_level_length  ; if ((op & 16) == 0) 8%
        and     cl, 15                          ; op &= 15 (extra bit count)
        jz      L_decode_distance               ; if (!op) no extra bits

L_add_bits_to_len:
        sub     bl, cl                          ; bits -= op
        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                             ; eax = (1 << op) - 1
        and     eax, edx                        ; eax = hold & mask
        shr     rdx, cl                         ; hold >>= op
        add     r14d, eax                       ; len += hold & mask[op]

L_decode_distance:
        mov     r8, r13                         ; r8 = dmask
        cmp     bl, 32
        ja      L_get_distance_code             ; if (32 < bits)

        lodsd                                   ; eax = *(uint *)in++
        mov     cl, bl                          ; cl = bits, needed for the shift
        add     bl, 32                          ; bits += 32
        shl     rax, cl
        or      rdx, rax                        ; hold |= new 32 bits << old bits

L_get_distance_code:
        and     r8, rdx                         ; r8 = hold & dmask
        mov     eax, [r11+r8*4]                 ; eax = dcode[hold & dmask]

L_dodist:
        mov     r15d, eax                       ; dist = this
        shr     r15d, 16                        ; dist = this.val
        mov     cl, ah
        sub     bl, ah                          ; bits -= this.bits
        shr     rdx, cl                         ; hold >>= this.bits
        mov     cl, al                          ; cl = this.op

        test    al, 16                          ; if ((op & 16) == 0)
        jz      L_test_for_second_level_dist
        and     cl, 15                          ; op &= 15 (extra bit count)
        jz      L_check_dist_one                ; no extra bits: maybe dist == 1

L_add_bits_to_dist:
        sub     bl, cl                          ; bits -= op
        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                             ; (1 << op) - 1
        and     eax, edx                        ; eax = hold & mask
        shr     rdx, cl                         ; hold >>= op
        add     r15d, eax                       ; dist += hold & ((1 << op) - 1)

L_check_window:
        mov     r8, rsi                         ; park in; rsi becomes "from"
        mov     rax, rdi
        sub     rax, [r10+AR_BEG]               ; nbytes = out - beg

        cmp     eax, r15d
        jb      L_clip_window                   ; if (dist > nbytes) 4.2%

        mov     ecx, r14d                       ; ecx = len
        mov     rsi, rdi
        sub     rsi, r15                        ; from = out - dist

        sar     ecx, 1                          ; word count; CF = odd length
        jnc     L_copy_two                      ; if len % 2 == 0

        rep movsw
        mov     al, [rsi]                       ; copy the odd trailing byte
        mov     [rdi], al
        inc     rdi

        mov     rsi, r8                         ; restore in, drop from
        jmp     L_while_test

L_copy_two:
        rep movsw
        mov     rsi, r8                         ; restore in, drop from
        jmp     L_while_test

ALIGN 4
L_check_dist_one:
        cmp     r15d, 1                         ; dist == 1 is a memset
        jne     L_check_window
        cmp     [r10+AR_BEG], rdi               ; out == beg: byte is in window
        je      L_check_window

        mov     ecx, r14d                       ; ecx = len
        mov     al, [rdi-1]                     ; byte to replicate
        mov     ah, al                          ; ax = byte:byte for stosw

        sar     ecx, 1                          ; word count; CF = odd length
        jnc     L_set_two
        mov     [rdi], al                       ; odd length: one byte first
        inc     rdi

L_set_two:
        rep stosw
        jmp     L_while_test

ALIGN 4
L_test_for_second_level_length:
        test    al, 64
        jnz     L_test_for_end_of_block         ; if ((op & 64) != 0)

        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                             ; eax = (1 << op) - 1
        and     eax, edx                        ; eax = hold & mask
        add     eax, r14d                       ; eax += len (this.val)
        mov     eax, [rbp+rax*4]                ; eax = lcode[val+(hold&mask[op])]
        jmp     L_dolen

ALIGN 4
L_test_for_second_level_dist:
        test    al, 64
        jnz     L_invalid_distance_code         ; if ((op & 64) != 0)

        xor     eax, eax
        inc     eax
        shl     eax, cl
        dec     eax                             ; eax = (1 << op) - 1
        and     eax, edx                        ; eax = hold & mask
        add     eax, r15d                       ; eax += dist (this.val)
        mov     eax, [r11+rax*4]                ; eax = dcode[val+(hold&mask[op])]
        jmp     L_dodist

; Distance reaches back before beg: part (or all) of the copy comes
; from the window.  On entry eax = out - beg, r8 = saved in.
ALIGN 4
L_clip_window:
        mov     ecx, eax                        ; ecx = out - beg
        mov     eax, dword ptr [r10+AR_WSIZE]   ; eax = wsize, for the dist compare
        neg     ecx                             ; ecx = -(out - beg)

        cmp     eax, r15d
        jb      L_invalid_distance_too_far      ; if (dist > wsize)

        add     ecx, r15d                       ; nbytes = dist - (out - beg)
        cmp     dword ptr [r10+AR_WRITE], 0
        jne     L_wrap_around_window            ; if (write != 0)

        mov     rsi, [r10+AR_WINDOW]            ; from = window
        sub     eax, ecx                        ; eax = wsize - nbytes
        add     rsi, rax                        ; from += wsize - nbytes

        mov     eax, r14d                       ; eax = len
        cmp     r14d, ecx
        jbe     L_do_copy                       ; if (nbytes >= len)

        sub     eax, ecx                        ; len -= nbytes
        rep movsb                               ; copy nbytes from the window
        mov     rsi, rdi
        sub     rsi, r15                        ; from = &out[-dist]
        jmp     L_do_copy

ALIGN 4
L_wrap_around_window:
        mov     eax, dword ptr [r10+AR_WRITE]   ; eax = write
        cmp     ecx, eax
        jbe     L_contiguous_in_window          ; if (write >= nbytes)

        mov     esi, dword ptr [r10+AR_WSIZE]   ; from = wsize
        add     rsi, [r10+AR_WINDOW]            ; from += window
        add     rsi, rax                        ; from += write
        sub     rsi, rcx                        ; from -= nbytes
        sub     ecx, eax                        ; nbytes -= write

        mov     eax, r14d                       ; eax = len
        cmp     eax, ecx
        jbe     L_do_copy                       ; if (nbytes >= len)

        sub     eax, ecx                        ; len -= nbytes
        rep movsb                               ; tail of the window
        mov     rsi, [r10+AR_WINDOW]            ; from = window
        mov     ecx, dword ptr [r10+AR_WRITE]   ; nbytes = write
        cmp     eax, ecx
        jbe     L_do_copy                       ; if (nbytes >= len)

        sub     eax, ecx                        ; len -= nbytes
        rep movsb                               ; head of the window
        mov     rsi, rdi
        sub     rsi, r15                        ; from = out - dist
        jmp     L_do_copy

ALIGN 4
L_contiguous_in_window:
        mov     rsi, [r10+AR_WINDOW]            ; from = window
        add     rsi, rax
        sub     rsi, rcx                        ; from += write - nbytes

        mov     eax, r14d                       ; eax = len
        cmp     eax, ecx
        jbe     L_do_copy                       ; if (nbytes >= len)

        sub     eax, ecx                        ; len -= nbytes
        rep movsb
        mov     rsi, rdi
        sub     rsi, r15                        ; from = out - dist
        jmp     L_do_copy

; Copy the remaining eax bytes from rsi, then resume decoding.
ALIGN 4
L_do_copy:
        mov     ecx, eax                        ; ecx = len
        rep movsb

        mov     rsi, r8                         ; restore in, drop from
        jmp     L_while_test

L_test_for_end_of_block:
        test    al, 32
        jz      L_invalid_literal_length_code
        mov     dword ptr [r10+AR_STATUS], ST_END_OF_BLOCK
        jmp     L_break_loop_with_status

L_invalid_literal_length_code:
        mov     dword ptr [r10+AR_STATUS], ST_BAD_LITLEN
        jmp     L_break_loop_with_status

L_invalid_distance_code:
        mov     dword ptr [r10+AR_STATUS], ST_BAD_DIST
        jmp     L_break_loop_with_status

L_invalid_distance_too_far:
        mov     dword ptr [r10+AR_STATUS], ST_DIST_TOO_FAR
        jmp     L_break_loop_with_status

L_break_loop:
        mov     dword ptr [r10+AR_STATUS], ST_LOOP_DONE

L_break_loop_with_status:
        ; put in, out, bits, and hold back into ar
        mov     [r10+AR_IN], rsi                ; in
        mov     [r10+AR_OUT], rdi               ; out
        mov     dword ptr [r10+AR_BITS], ebx    ; bits
        mov     [r10+AR_HOLD], rdx              ; hold

        ; Epilogue: pops in reverse push order, then ret.
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rdi
        pop     rsi
        pop     rbx
        pop     rbp
        ret

; Clobber list of the gcc inline-asm original, kept for reference:
;   : "m" (ar)
;   : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
;     "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"

inffas8664fnc ENDP
;_TEXT ENDS
END