1;uInt longest_match_x64( 2; deflate_state *s, 3; IPos cur_match); /* current match */ 4 5; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86 6; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant. 7; 8; File written by Gilles Vollant, by converting to assembly the longest_match 9; from Jean-loup Gailly in deflate.c of zLib and infoZip zip. 10; 11; and by taking inspiration on asm686 with masm, optimised assembly code 12; from Brian Raiter, written 1998 13; 14; http://www.zlib.net 15; http://www.winimage.com/zLibDll 16; http://www.muppetlabs.com/~breadbox/software/assembly.html 17; 18; to compile this file for infozip Zip, I use option: 19; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm 20; 21; to compile this file for zLib, I use option: 22; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm 23; Be carrefull to adapt zlib1222add below to your version of zLib 24; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change 25; value of zlib1222add later) 26; 27; This file compile with Microsoft Macro Assembler (x64) for AMD64 28; 29; ml64.exe is given with Visual Studio 2005 and Windows 2003 server DDK 30; 31; (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from 32; http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price) 33; 34 35 36;uInt longest_match(s, cur_match) 37; deflate_state *s; 38; IPos cur_match; /* current match */ 39.code 40longest_match PROC 41 42 43;LocalVarsSize equ 88 44 LocalVarsSize equ 72 45 46; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12 47; free register : r14,r15 48; register can be saved : rsp 49 50 chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len 51 ; low word: s->wmask 52;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10 53;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11 54;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w 55;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx 56;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13 57;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d 58;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9 59IFDEF INFOZIP 60ELSE 61 nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size 62ENDIF 63 64save_rdi equ rsp + 24 - LocalVarsSize 65save_rsi equ rsp + 32 - LocalVarsSize 66save_rbx equ rsp + 40 - LocalVarsSize 67save_rbp equ rsp + 48 - LocalVarsSize 68save_r12 equ rsp + 56 - LocalVarsSize 69save_r13 equ rsp + 64 - LocalVarsSize 70;save_r14 equ rsp + 72 - LocalVarsSize 71;save_r15 equ rsp + 80 - LocalVarsSize 72 73 74 75; all the +4 offsets are due to the addition of pending_buf_size (in zlib 76; in the deflate_state structure since the asm code was first written 77; (if you compile with zlib 1.0.4 or older, remove the +4). 78; Note : these value are good with a 8 bytes boundary pack structure 79 80 81 MAX_MATCH equ 258 82 MIN_MATCH equ 3 83 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) 84 85 86;;; Offsets for fields in the deflate_state structure. These numbers 87;;; are calculated from the definition of deflate_state, with the 88;;; assumption that the compiler will dword-align the fields. (Thus, 89;;; changing the definition of deflate_state could easily cause this 90;;; program to crash horribly, without so much as a warning at 91;;; compile time. Sigh.) 92 93; all the +zlib1222add offsets are due to the addition of fields 94; in zlib in the deflate_state structure since the asm code was first written 95; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). 96; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). 97; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). 98 99 100IFDEF INFOZIP 101 102_DATA SEGMENT 103COMM window_size:DWORD 104; WMask ; 7fff 105COMM window:BYTE:010040H 106COMM prev:WORD:08000H 107; MatchLen : unused 108; PrevMatch : unused 109COMM strstart:DWORD 110COMM match_start:DWORD 111; Lookahead : ignore 112COMM prev_length:DWORD ; PrevLen 113COMM max_chain_length:DWORD 114COMM good_match:DWORD 115COMM nice_match:DWORD 116prev_ad equ OFFSET prev 117window_ad equ OFFSET window 118nicematch equ nice_match 119_DATA ENDS 120WMask equ 07fffh 121 122ELSE 123 124 IFNDEF zlib1222add 125 zlib1222add equ 0 126 ENDIF 127dsWSize equ 56+zlib1222add+(zlib1222add/2) 128dsWMask equ 64+zlib1222add+(zlib1222add/2) 129dsWindow equ 72+zlib1222add 130dsPrev equ 88+zlib1222add 131dsMatchLen equ 128+zlib1222add 132dsPrevMatch equ 132+zlib1222add 133dsStrStart equ 140+zlib1222add 134dsMatchStart equ 144+zlib1222add 135dsLookahead equ 148+zlib1222add 136dsPrevLen equ 152+zlib1222add 137dsMaxChainLen equ 156+zlib1222add 138dsGoodMatch equ 172+zlib1222add 139dsNiceMatch equ 176+zlib1222add 140 141window_size equ [ rcx + dsWSize] 142WMask equ [ rcx + dsWMask] 143window_ad equ [ rcx + dsWindow] 144prev_ad equ [ rcx + dsPrev] 145strstart equ [ rcx + dsStrStart] 146match_start equ [ rcx + dsMatchStart] 147Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip 148prev_length equ [ rcx + dsPrevLen] 149max_chain_length equ [ rcx + dsMaxChainLen] 150good_match equ [ rcx + dsGoodMatch] 151nice_match equ [ rcx + dsNiceMatch] 152ENDIF 153 154; parameter 1 in r8(deflate state s), param 2 in rdx (cur match) 155 156; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and 157; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp 158; 159; All registers must be preserved across the call, except for 160; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch. 161 162 163 164;;; Save registers that the compiler may be using, and adjust esp to 165;;; make room for our stack frame. 166 167 168;;; Retrieve the function arguments. r8d will hold cur_match 169;;; throughout the entire function. edx will hold the pointer to the 170;;; deflate_state structure during the function's setup (before 171;;; entering the main loop. 172 173; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match) 174 175; this clear high 32 bits of r8, which can be garbage in both r8 and rdx 176 177 mov [save_rdi],rdi 178 mov [save_rsi],rsi 179 mov [save_rbx],rbx 180 mov [save_rbp],rbp 181IFDEF INFOZIP 182 mov r8d,ecx 183ELSE 184 mov r8d,edx 185ENDIF 186 mov [save_r12],r12 187 mov [save_r13],r13 188; mov [save_r14],r14 189; mov [save_r15],r15 190 191 192;;; uInt wmask = s->w_mask; 193;;; unsigned chain_length = s->max_chain_length; 194;;; if (s->prev_length >= s->good_match) { 195;;; chain_length >>= 2; 196;;; } 197 198 mov edi, prev_length 199 mov esi, good_match 200 mov eax, WMask 201 mov ebx, max_chain_length 202 cmp edi, esi 203 jl LastMatchGood 204 shr ebx, 2 205LastMatchGood: 206 207;;; chainlen is decremented once beforehand so that the function can 208;;; use the sign flag instead of the zero flag for the exit test. 209;;; It is then shifted into the high word, to make room for the wmask 210;;; value, which it will always accompany. 211 212 dec ebx 213 shl ebx, 16 214 or ebx, eax 215 216;;; on zlib only 217;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 218 219IFDEF INFOZIP 220 mov [chainlenwmask], ebx 221; on infozip nice_match = [nice_match] 222ELSE 223 mov eax, nice_match 224 mov [chainlenwmask], ebx 225 mov r10d, Lookahead 226 cmp r10d, eax 227 cmovnl r10d, eax 228 mov [nicematch],r10d 229ENDIF 230 231;;; register Bytef *scan = s->window + s->strstart; 232 mov r10, window_ad 233 mov ebp, strstart 234 lea r13, [r10 + rbp] 235 236;;; Determine how many bytes the scan ptr is off from being 237;;; dword-aligned. 238 239 mov r9,r13 240 neg r13 241 and r13,3 242 243;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 244;;; s->strstart - (IPos)MAX_DIST(s) : NIL; 245IFDEF INFOZIP 246 mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1)) 247ELSE 248 mov eax, window_size 249 sub eax, MIN_LOOKAHEAD 250ENDIF 251 xor edi,edi 252 sub ebp, eax 253 254 mov r11d, prev_length 255 256 cmovng ebp,edi 257 258;;; int best_len = s->prev_length; 259 260 261;;; Store the sum of s->window + best_len in esi locally, and in esi. 262 263 lea rsi,[r10+r11] 264 265;;; register ush scan_start = *(ushf*)scan; 266;;; register ush scan_end = *(ushf*)(scan+best_len-1); 267;;; Posf *prev = s->prev; 268 269 movzx r12d,word ptr [r9] 270 movzx ebx, word ptr [r9 + r11 - 1] 271 272 mov rdi, prev_ad 273 274;;; Jump into the main loop. 275 276 mov edx, [chainlenwmask] 277 278 cmp bx,word ptr [rsi + r8 - 1] 279 jz LookupLoopIsZero 280 281LookupLoop1: 282 and r8d, edx 283 284 movzx r8d, word ptr [rdi + r8*2] 285 cmp r8d, ebp 286 jbe LeaveNow 287 sub edx, 00010000h 288 js LeaveNow 289 290LoopEntry1: 291 cmp bx,word ptr [rsi + r8 - 1] 292 jz LookupLoopIsZero 293 294LookupLoop2: 295 and r8d, edx 296 297 movzx r8d, word ptr [rdi + r8*2] 298 cmp r8d, ebp 299 jbe LeaveNow 300 sub edx, 00010000h 301 js LeaveNow 302 303LoopEntry2: 304 cmp bx,word ptr [rsi + r8 - 1] 305 jz LookupLoopIsZero 306 307LookupLoop4: 308 and r8d, edx 309 310 movzx r8d, word ptr [rdi + r8*2] 311 cmp r8d, ebp 312 jbe LeaveNow 313 sub edx, 00010000h 314 js LeaveNow 315 316LoopEntry4: 317 318 cmp bx,word ptr [rsi + r8 - 1] 319 jnz LookupLoop1 320 jmp LookupLoopIsZero 321 322 323;;; do { 324;;; match = s->window + cur_match; 325;;; if (*(ushf*)(match+best_len-1) != scan_end || 326;;; *(ushf*)match != scan_start) continue; 327;;; [...] 328;;; } while ((cur_match = prev[cur_match & wmask]) > limit 329;;; && --chain_length != 0); 330;;; 331;;; Here is the inner loop of the function. The function will spend the 332;;; majority of its time in this loop, and majority of that time will 333;;; be spent in the first ten instructions. 334;;; 335;;; Within this loop: 336;;; ebx = scanend 337;;; r8d = curmatch 338;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 339;;; esi = windowbestlen - i.e., (window + bestlen) 340;;; edi = prev 341;;; ebp = limit 342 343LookupLoop: 344 and r8d, edx 345 346 movzx r8d, word ptr [rdi + r8*2] 347 cmp r8d, ebp 348 jbe LeaveNow 349 sub edx, 00010000h 350 js LeaveNow 351 352LoopEntry: 353 354 cmp bx,word ptr [rsi + r8 - 1] 355 jnz LookupLoop1 356LookupLoopIsZero: 357 cmp r12w, word ptr [r10 + r8] 358 jnz LookupLoop1 359 360 361;;; Store the current value of chainlen. 362 mov [chainlenwmask], edx 363 364;;; Point edi to the string under scrutiny, and esi to the string we 365;;; are hoping to match it up with. In actuality, esi and edi are 366;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is 367;;; initialized to -(MAX_MATCH_8 - scanalign). 368 369 lea rsi,[r8+r10] 370 mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8) 371 lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8] 372 lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8] 373 374 prefetcht1 [rsi+rdx] 375 prefetcht1 [rdi+rdx] 376 377 378;;; Test the strings for equality, 8 bytes at a time. At the end, 379;;; adjust rdx so that it is offset to the exact byte that mismatched. 380;;; 381;;; We already know at this point that the first three bytes of the 382;;; strings match each other, and they can be safely passed over before 383;;; starting the compare loop. So what this code does is skip over 0-3 384;;; bytes, as much as necessary in order to dword-align the edi 385;;; pointer. (rsi will still be misaligned three times out of four.) 386;;; 387;;; It should be confessed that this loop usually does not represent 388;;; much of the total running time. Replacing it with a more 389;;; straightforward "rep cmpsb" would not drastically degrade 390;;; performance. 391 392 393LoopCmps: 394 mov rax, [rsi + rdx] 395 xor rax, [rdi + rdx] 396 jnz LeaveLoopCmps 397 398 mov rax, [rsi + rdx + 8] 399 xor rax, [rdi + rdx + 8] 400 jnz LeaveLoopCmps8 401 402 403 mov rax, [rsi + rdx + 8+8] 404 xor rax, [rdi + rdx + 8+8] 405 jnz LeaveLoopCmps16 406 407 add rdx,8+8+8 408 409 jmp short LoopCmps 410LeaveLoopCmps16: add rdx,8 411LeaveLoopCmps8: add rdx,8 412LeaveLoopCmps: 413 414 test eax, 0000FFFFh 415 jnz LenLower 416 417 test eax,0ffffffffh 418 419 jnz LenLower32 420 421 add rdx,4 422 shr rax,32 423 or ax,ax 424 jnz LenLower 425 426LenLower32: 427 shr eax,16 428 add rdx,2 429LenLower: sub al, 1 430 adc rdx, 0 431;;; Calculate the length of the match. If it is longer than MAX_MATCH, 432;;; then automatically accept it as the best possible match and leave. 433 434 lea rax, [rdi + rdx] 435 sub rax, r9 436 cmp eax, MAX_MATCH 437 jge LenMaximum 438 439;;; If the length of the match is not longer than the best match we 440;;; have so far, then forget it and return to the lookup loop. 441;/////////////////////////////////// 442 443 cmp eax, r11d 444 jg LongerMatch 445 446 lea rsi,[r10+r11] 447 448 mov rdi, prev_ad 449 mov edx, [chainlenwmask] 450 jmp LookupLoop 451 452;;; s->match_start = cur_match; 453;;; best_len = len; 454;;; if (len >= nice_match) break; 455;;; scan_end = *(ushf*)(scan+best_len-1); 456 457LongerMatch: 458 mov r11d, eax 459 mov match_start, r8d 460 cmp eax, [nicematch] 461 jge LeaveNow 462 463 lea rsi,[r10+rax] 464 465 movzx ebx, word ptr [r9 + rax - 1] 466 mov rdi, prev_ad 467 mov edx, [chainlenwmask] 468 jmp LookupLoop 469 470;;; Accept the current string, with the maximum possible length. 471 472LenMaximum: 473 mov r11d,MAX_MATCH 474 mov match_start, r8d 475 476;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; 477;;; return s->lookahead; 478 479LeaveNow: 480IFDEF INFOZIP 481 mov eax,r11d 482ELSE 483 mov eax, Lookahead 484 cmp r11d, eax 485 cmovng eax, r11d 486ENDIF 487 488;;; Restore the stack and return from whence we came. 489 490 491 mov rsi,[save_rsi] 492 mov rdi,[save_rdi] 493 mov rbx,[save_rbx] 494 mov rbp,[save_rbp] 495 mov r12,[save_r12] 496 mov r13,[save_r13] 497; mov r14,[save_r14] 498; mov r15,[save_r15] 499 500 501 ret 0 502; please don't remove this string ! 503; Your can freely use gvmat64 in any free or commercial app 504; but it is far better don't remove the string in the binary! 505 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0 506longest_match ENDP 507 508match_init PROC 509 ret 0 510match_init ENDP 511 512 513END 514