;uInt longest_match_x64(
;    deflate_state *s,
;    IPos cur_match);                             /* current match */

; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
;  (AMD64 on Athlon 64, Opteron, Phenom
;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
;
; File written by Gilles Vollant, by converting to assembly the longest_match
;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
;
;  and by taking inspiration on asm686 with masm, optimised assembly code
;        from Brian Raiter, written 1998
;
;  This software is provided 'as-is', without any express or implied
;  warranty.  In no event will the authors be held liable for any damages
;  arising from the use of this software.
;
;  Permission is granted to anyone to use this software for any purpose,
;  including commercial applications, and to alter it and redistribute it
;  freely, subject to the following restrictions:
;
;  1. The origin of this software must not be misrepresented; you must not
;     claim that you wrote the original software. If you use this software
;     in a product, an acknowledgment in the product documentation would be
;     appreciated but is not required.
;  2. Altered source versions must be plainly marked as such, and must not be
;     misrepresented as being the original software
;  3. This notice may not be removed or altered from any source distribution.
;
;
;
;         http://www.zlib.net
;         http://www.winimage.com/zLibDll
;         http://www.muppetlabs.com/~breadbox/software/assembly.html
;
; ALTERED SOURCE VERSION (see restriction 2 above).  Changes relative to the
; original gvmat64.asm:
;   - the stack frame is now really allocated (sub rsp / add rsp) instead of
;     being addressed below rsp: the Microsoft x64 ABI has no red zone, so
;     memory below rsp may be overwritten asynchronously;
;   - longest_match is declared PROC FRAME with unwind directives, because it
;     saves and modifies non-volatile registers;
;   - comments describing register usage and arguments were corrected.
;
; to compile this file for infozip Zip, I use option:
;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
;
; to compile this file for zLib, I use option:
;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
; Be careful to adapt zlib1222add below to your version of zLib
;   (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
;    value of zlib1222add later)
;
; This file compiles with Microsoft Macro Assembler (x64) for AMD64
;
;   ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
;
;   (you can get Windows WDK with ml64 for AMD64 from
;      http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;


;-----------------------------------------------------------------------
; uInt longest_match(s, cur_match)
;     deflate_state *s;
;     IPos cur_match;                             /* current match */
;
; Syntax : MASM (ml64), Intel operand order.
; ABI    : Microsoft x64.
; In     : zlib build    : rcx = s, edx = cur_match
;          INFOZIP build : ecx = cur_match (state lives in COMM globals)
; Out    : eax = length of the best match found
;          (zlib build: clamped to s->lookahead)
; Saved  : rdi, rsi, rbx, rbp, r12, r13 are stored in the frame and restored.
; Stack  : LocalVarsSize bytes, allocated in the prologue.
;-----------------------------------------------------------------------

;LocalVarsSize equ 88           ; size needed if r14/r15 had to be saved too
 LocalVarsSize equ 72

; registers used : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
; free registers : r14,r15

;;; Frame layout, as offsets from rsp AFTER the prologue's "sub rsp".
;;; Slot [rsp + 0] is unused.

 ofs_chainlenwmask equ 8
 ofs_nicematch     equ 16
 ofs_save_rdi      equ 24
 ofs_save_rsi      equ 32
 ofs_save_rbx      equ 40
 ofs_save_rbp      equ 48
 ofs_save_r12      equ 56
 ofs_save_r13      equ 64

 chainlenwmask   equ  rsp + ofs_chainlenwmask   ; high word: current chain len
                                                ; low word: s->wmask
;window          : local copy of s->window       -> r10
;windowbestlen   : s->window + bestlen           -> rsi
;scanstart       : first two bytes of string     -> r12w
;scanend         : last two bytes of string      -> bx
;scanalign       : dword-misalignment of string  -> r13
;bestlen         : size of best match so far     -> r11d
;scan            : ptr to string wanting match   -> r9
IFNDEF INFOZIP
 nicematch       equ  (rsp + ofs_nicematch)     ; a good enough match size
ENDIF

save_rdi        equ  rsp + ofs_save_rdi
save_rsi        equ  rsp + ofs_save_rsi
save_rbx        equ  rsp + ofs_save_rbx
save_rbp        equ  rsp + ofs_save_rbp
save_r12        equ  rsp + ofs_save_r12
save_r13        equ  rsp + ofs_save_r13


; summary of register usage (as actually used by the code below)
; scanend         ebx
; scanendw        bx
; chainlenwmask   edx   (rdx is reused as a negative index in the compare loop)
; curmatch        r8
; curmatchd       r8d
; windowbestlen   rsi   (rsi is reused as the match pointer in the compare loop)
; prev            rdi   (rdi is reused as the scan pointer in the compare loop)
; limit           ebp
; scan            r9
; window          r10
; bestlen         r11
; bestlend        r11d
; scanstart       r12d
; scanstartw      r12w
; scanalign       r13
; deflate_state*  rcx   (zlib build only; never modified)

    MAX_MATCH           equ     258
    MIN_MATCH           equ     3
    MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)


;;; Offsets for fields in the deflate_state structure. These numbers
;;; are calculated from the definition of deflate_state, with the
;;; assumption that the compiler will dword-align the fields. (Thus,
;;; changing the definition of deflate_state could easily cause this
;;; program to crash horribly, without so much as a warning at
;;; compile time. Sigh.)
;;; Note : these values are good with an 8 bytes boundary pack structure

;  all the +zlib1222add offsets are due to the addition of fields
;  in zlib in the deflate_state structure since the asm code was first written
;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
;  (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").


IFDEF INFOZIP

_DATA   SEGMENT
COMM    window_size:DWORD
; WMask ; 7fff
COMM    window:BYTE:010040H
COMM    prev:WORD:08000H
; MatchLen : unused
; PrevMatch : unused
COMM    strstart:DWORD
COMM    match_start:DWORD
; Lookahead : ignore
COMM    prev_length:DWORD ; PrevLen
COMM    max_chain_length:DWORD
COMM    good_match:DWORD
COMM    nice_match:DWORD
prev_ad equ OFFSET prev
window_ad equ OFFSET window
nicematch equ nice_match
_DATA ENDS
WMask equ 07fffh

ELSE

  IFNDEF zlib1222add
    zlib1222add equ 8
  ENDIF
dsWSize         equ 56+zlib1222add+(zlib1222add/2)
dsWMask         equ 64+zlib1222add+(zlib1222add/2)
dsWindow        equ 72+zlib1222add
dsPrev          equ 88+zlib1222add
dsMatchLen      equ 128+zlib1222add
dsPrevMatch     equ 132+zlib1222add
dsStrStart      equ 140+zlib1222add
dsMatchStart    equ 144+zlib1222add
dsLookahead     equ 148+zlib1222add
dsPrevLen       equ 152+zlib1222add
dsMaxChainLen   equ 156+zlib1222add
dsGoodMatch     equ 172+zlib1222add
dsNiceMatch     equ 176+zlib1222add

window_size      equ [ rcx + dsWSize]
WMask            equ [ rcx + dsWMask]
window_ad        equ [ rcx + dsWindow]
prev_ad          equ [ rcx + dsPrev]
strstart         equ [ rcx + dsStrStart]
match_start      equ [ rcx + dsMatchStart]
Lookahead        equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
prev_length      equ [ rcx + dsPrevLen]
max_chain_length equ [ rcx + dsMaxChainLen]
good_match       equ [ rcx + dsGoodMatch]
nice_match       equ [ rcx + dsNiceMatch]
ENDIF

; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx    and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.

.code
longest_match PROC FRAME

;;; Prologue: allocate the frame, then save the non-volatile registers this
;;; routine modifies.  Everything we touch is at or above rsp.  The unwind
;;; directives describe each step so the function can be unwound.

        sub     rsp, LocalVarsSize
        .allocstack LocalVarsSize
        mov     [save_rdi], rdi
        .savereg rdi, ofs_save_rdi
        mov     [save_rsi], rsi
        .savereg rsi, ofs_save_rsi
        mov     [save_rbx], rbx
        .savereg rbx, ofs_save_rbx
        mov     [save_rbp], rbp
        .savereg rbp, ofs_save_rbp
        mov     [save_r12], r12
        .savereg r12, ofs_save_r12
        mov     [save_r13], r13
        .savereg r13, ofs_save_r13
        .endprolog

;;; Retrieve the function arguments. r8d will hold cur_match in the
;;; lookup loop. In the zlib build rcx keeps pointing to the deflate_state
;;; structure (all the field equates above are rcx-relative).

; zlib    : parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
; infozip : parameter 1 in ecx -> r8 (cur match)

; the 32-bit move clears the high 32 bits of r8, which can be garbage in the
; incoming argument register

IFDEF INFOZIP
        mov     r8d, ecx
ELSE
        mov     r8d, edx
ENDIF


;;; uInt wmask = s->w_mask;
;;; unsigned chain_length = s->max_chain_length;
;;; if (s->prev_length >= s->good_match) {
;;;     chain_length >>= 2;
;;; }

        mov     edi, prev_length
        mov     esi, good_match
        mov     eax, WMask
        mov     ebx, max_chain_length
        cmp     edi, esi
        jl      LastMatchGood
        shr     ebx, 2
LastMatchGood:

;;; chainlen is decremented once beforehand so that the function can
;;; use the sign flag instead of the zero flag for the exit test.
;;; It is then shifted into the high word, to make room for the wmask
;;; value, which it will always accompany.

        dec     ebx
        shl     ebx, 16
        or      ebx, eax

;;; on zlib only
;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;

IFDEF INFOZIP
        mov     [chainlenwmask], ebx
; on infozip nice_match = [nice_match]
ELSE
        mov     eax, nice_match
        mov     [chainlenwmask], ebx
        mov     r10d, Lookahead
        cmp     r10d, eax
        cmovnl  r10d, eax               ; r10d = min(lookahead, nice_match)
        mov     [nicematch], r10d
ENDIF

;;; register Bytef *scan = s->window + s->strstart;
        mov     r10, window_ad
        mov     ebp, strstart
        lea     r13, [r10 + rbp]

;;; Determine how many bytes the scan ptr is off from being
;;; dword-aligned: r9 = scan, r13 = (-scan) & 3.

        mov     r9, r13
        neg     r13
        and     r13, 3

;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
IFDEF INFOZIP
        mov     eax, 07efah             ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(258+3+1))
ELSE
        mov     eax, window_size
        sub     eax, MIN_LOOKAHEAD
ENDIF
        xor     edi, edi
        sub     ebp, eax

;;; int best_len = s->prev_length;
        mov     r11d, prev_length       ; mov leaves the flags of the sub intact

        cmovng  ebp, edi                ; limit = 0 when strstart - MAX_DIST <= 0


;;; Store the sum of s->window + best_len in rsi.

        lea     rsi, [r10 + r11]

;;; register ush scan_start = *(ushf*)scan;
;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
;;; Posf *prev = s->prev;

        movzx   r12d, word ptr [r9]
        movzx   ebx, word ptr [r9 + r11 - 1]

        mov     rdi, prev_ad

;;; Jump into the main loop.

        mov     edx, [chainlenwmask]

        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

;;; The lookup step below is unrolled (LookupLoop1/2/4); every copy is
;;; the same code as LookupLoop further down.

LookupLoop1:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry1:
        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

LookupLoop2:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry2:
        cmp     bx, word ptr [rsi + r8 - 1]
        jz      LookupLoopIsZero

LookupLoop4:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]
        cmp     r8d, ebp
        jbe     LeaveNow
        sub     edx, 00010000h
        js      LeaveNow

LoopEntry4:

        cmp     bx, word ptr [rsi + r8 - 1]
        jnz     LookupLoop1
        jmp     LookupLoopIsZero


;;; do {
;;;     match = s->window + cur_match;
;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
;;;         *(ushf*)match != scan_start) continue;
;;;     [...]
;;; } while ((cur_match = prev[cur_match & wmask]) > limit
;;;          && --chain_length != 0);
;;;
;;; Here is the inner loop of the function. The function will spend the
;;; majority of its time in this loop, and majority of that time will
;;; be spent in the first ten instructions.
;;;
;;; Within this loop:
;;; ebx = scanend
;;; r8d = curmatch
;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
;;; rsi = windowbestlen - i.e., (window + bestlen)
;;; rdi = prev
;;; ebp = limit

LookupLoop:
        and     r8d, edx

        movzx   r8d, word ptr [rdi + r8*2]      ; cur_match = prev[cur_match & wmask]
        cmp     r8d, ebp
        jbe     LeaveNow                        ; cur_match <= limit
        sub     edx, 00010000h                  ; --chain_length (kept in the high word)
        js      LeaveNow

LoopEntry:

        cmp     bx, word ptr [rsi + r8 - 1]
        jnz     LookupLoop1
LookupLoopIsZero:
        cmp     r12w, word ptr [r10 + r8]
        jnz     LookupLoop1


;;; Store the current value of chainlen.
        mov     [chainlenwmask], edx

;;; Point rdi to the string under scrutiny, and rsi to the string we
;;; are hoping to match it up with. In actuality, rsi and rdi are
;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
;;; initialized to -(MAX_MATCH_8).  MAX_MATCH_8 is 0108h (264), i.e.
;;; MAX_MATCH rounded up to a multiple of 8.

        lea     rsi, [r8 + r10]
        mov     rdx, 0fffffffffffffef8h         ; -(MAX_MATCH_8)
        lea     rsi, [rsi + r13 + 0108h]        ; MAX_MATCH_8
        lea     rdi, [r9 + r13 + 0108h]         ; MAX_MATCH_8

        prefetcht1 [rsi + rdx]
        prefetcht1 [rdi + rdx]


;;; Test the strings for equality, 8 bytes at a time. At the end,
;;; adjust rdx so that it is offset to the exact byte that mismatched.
;;;
;;; We already know at this point that the first three bytes of the
;;; strings match each other, and they can be safely passed over before
;;; starting the compare loop. So what this code does is skip over 0-3
;;; bytes, as much as necessary in order to dword-align the rdi
;;; pointer. (rsi will still be misaligned three times out of four.)
;;;
;;; It should be confessed that this loop usually does not represent
;;; much of the total running time. Replacing it with a more
;;; straightforward "rep cmpsb" would not drastically degrade
;;; performance.


LoopCmps:
        mov     rax, [rsi + rdx]
        xor     rax, [rdi + rdx]
        jnz     LeaveLoopCmps

        mov     rax, [rsi + rdx + 8]
        xor     rax, [rdi + rdx + 8]
        jnz     LeaveLoopCmps8


        mov     rax, [rsi + rdx + 8+8]
        xor     rax, [rdi + rdx + 8+8]
        jnz     LeaveLoopCmps16

        add     rdx, 8+8+8                      ; 24 divides MAX_MATCH_8: rdx hits 0 exactly

        jnz     short LoopCmps
        jmp     short LenMaximum

;;; rax holds the XOR of the two differing qwords: its lowest non-zero
;;; byte marks the first mismatching byte.  Narrow it down 4, 2, then 1
;;; byte(s) at a time, advancing rdx accordingly.

LeaveLoopCmps16:
        add     rdx, 8
LeaveLoopCmps8:
        add     rdx, 8
LeaveLoopCmps:

        test    eax, 0000FFFFh
        jnz     LenLower

        test    eax, 0ffffffffh

        jnz     LenLower32

        add     rdx, 4
        shr     rax, 32
        or      ax, ax
        jnz     LenLower

LenLower32:
        shr     eax, 16
        add     rdx, 2
LenLower:
        sub     al, 1                           ; carry set iff the low byte was 0 (bytes equal)
        adc     rdx, 0
;;; Calculate the length of the match. If it is longer than MAX_MATCH,
;;; then automatically accept it as the best possible match and leave.

        lea     rax, [rdi + rdx]
        sub     rax, r9
        cmp     eax, MAX_MATCH
        jge     LenMaximum

;;; If the length of the match is not longer than the best match we
;;; have so far, then forget it and return to the lookup loop.
;///////////////////////////////////

        cmp     eax, r11d
        jg      LongerMatch

        lea     rsi, [r10 + r11]

        mov     rdi, prev_ad
        mov     edx, [chainlenwmask]
        jmp     LookupLoop

;;;         s->match_start = cur_match;
;;;         best_len = len;
;;;         if (len >= nice_match) break;
;;;         scan_end = *(ushf*)(scan+best_len-1);

LongerMatch:
        mov     r11d, eax
        mov     match_start, r8d
        cmp     eax, [nicematch]
        jge     LeaveNow

        lea     rsi, [r10 + rax]

        movzx   ebx, word ptr [r9 + rax - 1]
        mov     rdi, prev_ad
        mov     edx, [chainlenwmask]
        jmp     LookupLoop

;;; Accept the current string, with the maximum possible length.

LenMaximum:
        mov     r11d, MAX_MATCH
        mov     match_start, r8d

;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
;;; return s->lookahead;

LeaveNow:
IFDEF INFOZIP
        mov     eax, r11d
ELSE
        mov     eax, Lookahead
        cmp     r11d, eax
        cmovng  eax, r11d
ENDIF

;;; Restore the saved registers, release the frame and return from
;;; whence we came.  The "add rsp" immediately followed by the return
;;; is the epilogue form the x64 unwinder expects.


        mov     rsi, [save_rsi]
        mov     rdi, [save_rdi]
        mov     rbx, [save_rbx]
        mov     rbp, [save_rbp]
        mov     r12, [save_r12]
        mov     r13, [save_r13]

        add     rsp, LocalVarsSize
        ret     0
; please don't remove this string !
; You can freely use gvmat64 in any free or commercial app
; but it is far better not to remove the string from the binary!
    db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
longest_match   ENDP

;;; match_init: nothing to initialise for this implementation; returns
;;; immediately.
match_init PROC
        ret     0
match_init ENDP


END