1*440a403fSchristos;uInt longest_match_x64(
2*440a403fSchristos;    deflate_state *s,
3*440a403fSchristos;    IPos cur_match);                             /* current match */
4*440a403fSchristos
5*440a403fSchristos; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64
6*440a403fSchristos;  (AMD64 on Athlon 64, Opteron, Phenom
7*440a403fSchristos;     and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7)
8*440a403fSchristos; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
9*440a403fSchristos;
10*440a403fSchristos; File written by Gilles Vollant, by converting to assembly the longest_match
11*440a403fSchristos;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
12*440a403fSchristos;
13*440a403fSchristos;  and by taking inspiration on asm686 with masm, optimised assembly code
14*440a403fSchristos;        from Brian Raiter, written 1998
15*440a403fSchristos;
16*440a403fSchristos;  This software is provided 'as-is', without any express or implied
17*440a403fSchristos;  warranty.  In no event will the authors be held liable for any damages
18*440a403fSchristos;  arising from the use of this software.
19*440a403fSchristos;
20*440a403fSchristos;  Permission is granted to anyone to use this software for any purpose,
21*440a403fSchristos;  including commercial applications, and to alter it and redistribute it
22*440a403fSchristos;  freely, subject to the following restrictions:
23*440a403fSchristos;
24*440a403fSchristos;  1. The origin of this software must not be misrepresented; you must not
25*440a403fSchristos;     claim that you wrote the original software. If you use this software
26*440a403fSchristos;     in a product, an acknowledgment in the product documentation would be
27*440a403fSchristos;     appreciated but is not required.
28*440a403fSchristos;  2. Altered source versions must be plainly marked as such, and must not be
29*440a403fSchristos;     misrepresented as being the original software
30*440a403fSchristos;  3. This notice may not be removed or altered from any source distribution.
31*440a403fSchristos;
32*440a403fSchristos;
33*440a403fSchristos;
34*440a403fSchristos;         http://www.zlib.net
35*440a403fSchristos;         http://www.winimage.com/zLibDll
36*440a403fSchristos;         http://www.muppetlabs.com/~breadbox/software/assembly.html
37*440a403fSchristos;
38*440a403fSchristos; to compile this file for infozip Zip, I use option:
39*440a403fSchristos;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
40*440a403fSchristos;
41*440a403fSchristos; to compile this file for zLib, I use option:
42*440a403fSchristos;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
43*440a403fSchristos; Be careful to adapt zlib1222add below to your version of zLib
44*440a403fSchristos;   (the default value suits zLib 1.2.2.2 or later; for an older version,
45*440a403fSchristos;    change the value of zlib1222add as described further down)
46*440a403fSchristos;
47*440a403fSchristos; This file compiles with Microsoft Macro Assembler (x64) for AMD64
48*440a403fSchristos;
49*440a403fSchristos;   ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
50*440a403fSchristos;
51*440a403fSchristos;   (you can get Windows WDK with ml64 for AMD64 from
52*440a403fSchristos;      http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
53*440a403fSchristos;
54*440a403fSchristos
55*440a403fSchristos
56*440a403fSchristos;uInt longest_match(s, cur_match)
57*440a403fSchristos;    deflate_state *s;
58*440a403fSchristos;    IPos cur_match;                             /* current match */
59*440a403fSchristos.code
60*440a403fSchristoslongest_match PROC
61*440a403fSchristos; zlib build: rcx = deflate_state *s, edx = cur_match (copied into r8d below).
62*440a403fSchristos; INFOZIP build: ecx = cur_match, state comes from the COMM globals. eax returns the length.
63*440a403fSchristos;LocalVarsSize   equ 88
64*440a403fSchristos LocalVarsSize   equ 72
65*440a403fSchristos; The stack slots defined below resolve to rsp-64 .. rsp-8 (offset - LocalVarsSize).
66*440a403fSchristos; registers used  : rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12,r13
67*440a403fSchristos; free registers  : r14,r15
68*440a403fSchristos; saved/restored here : rdi,rsi,rbx,rbp,r12,r13 (see the save_* slots below)
69*440a403fSchristos
70*440a403fSchristos chainlenwmask   equ  rsp + 8 - LocalVarsSize    ; high word: current chain len
71*440a403fSchristos                                                 ; low word: s->wmask
72*440a403fSchristos;window          equ  rsp + xx - LocalVarsSize   ; local copy of s->window ; stored in r10
73*440a403fSchristos;windowbestlen   equ  rsp + xx - LocalVarsSize   ; s->window + bestlen ; kept in rsi (= r10+r11)
74*440a403fSchristos;scanstart       equ  rsp + xx - LocalVarsSize   ; first two bytes of string ; stored in r12w
75*440a403fSchristos;scanend         equ  rsp + xx - LocalVarsSize   ; last two bytes of string ; stored in ebx
76*440a403fSchristos;scanalign       equ  rsp + xx - LocalVarsSize   ; dword-misalignment of string ; stored in r13
77*440a403fSchristos;bestlen         equ  rsp + xx - LocalVarsSize   ; size of best match so far -> r11d
78*440a403fSchristos;scan            equ  rsp + xx - LocalVarsSize   ; ptr to string wanting match -> r9
79*440a403fSchristosIFDEF INFOZIP
80*440a403fSchristosELSE
81*440a403fSchristos nicematch       equ  (rsp + 16 - LocalVarsSize) ; a good enough match size
82*440a403fSchristosENDIF
83*440a403fSchristos
84*440a403fSchristossave_rdi        equ  rsp + 24 - LocalVarsSize
85*440a403fSchristossave_rsi        equ  rsp + 32 - LocalVarsSize
86*440a403fSchristossave_rbx        equ  rsp + 40 - LocalVarsSize
87*440a403fSchristossave_rbp        equ  rsp + 48 - LocalVarsSize
88*440a403fSchristossave_r12        equ  rsp + 56 - LocalVarsSize
89*440a403fSchristossave_r13        equ  rsp + 64 - LocalVarsSize
90*440a403fSchristos;save_r14        equ  rsp + 72 - LocalVarsSize
91*440a403fSchristos;save_r15        equ  rsp + 80 - LocalVarsSize
92*440a403fSchristos; NOTE(review): rsp itself is never adjusted in this procedure, so these slots sit below the
93*440a403fSchristos; stack pointer; the Microsoft x64 convention defines no red zone -- confirm this is safe.
94*440a403fSchristos; summary of register usage (as used by the code below)
95*440a403fSchristos; scanend     ebx
96*440a403fSchristos; scanendw    bx
97*440a403fSchristos; chainlenwmask   edx
98*440a403fSchristos; curmatch    r8
99*440a403fSchristos; curmatchd   r8d
100*440a403fSchristos; windowbestlen   rsi
101*440a403fSchristos; scanalign   r13
102*440a403fSchristos; scan        r9
103*440a403fSchristos; window      r10
104*440a403fSchristos; bestlen     r11
105*440a403fSchristos; bestlend    r11d
106*440a403fSchristos; scanstart   r12d
107*440a403fSchristos; scanstartw  r12w
108*440a403fSchristos; limit       ebp
109*440a403fSchristos; nicematch   [nicematch] (stack slot for zlib, nice_match global for INFOZIP)
110*440a403fSchristos; prev        rdi
111*440a403fSchristos; s           rcx (zlib build only)
112*440a403fSchristos; compare loop: rsi/rdi = biased match/scan pointers, rdx = negative byte offset, rax = xor of two qwords
113*440a403fSchristos
114*440a403fSchristos;  all the +4 offsets are due to the addition of pending_buf_size in the zlib
115*440a403fSchristos;  deflate_state structure since the asm code was first written
116*440a403fSchristos;  (if you compile with zlib 1.0.4 or older, remove the +4).
117*440a403fSchristos;  Note : these values assume the structure is packed on an 8-byte boundary
118*440a403fSchristos
119*440a403fSchristos
120*440a403fSchristos    MAX_MATCH           equ     258
121*440a403fSchristos    MIN_MATCH           equ     3
122*440a403fSchristos    MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
123*440a403fSchristos
124*440a403fSchristos
125*440a403fSchristos;;; Offsets for fields in the deflate_state structure. These numbers
126*440a403fSchristos;;; are calculated from the definition of deflate_state, with the
127*440a403fSchristos;;; assumption that the compiler will dword-align the fields. (Thus,
128*440a403fSchristos;;; changing the definition of deflate_state could easily cause this
129*440a403fSchristos;;; program to crash horribly, without so much as a warning at
130*440a403fSchristos;;; compile time. Sigh.)
131*440a403fSchristos
132*440a403fSchristos;  all the +zlib1222add offsets are due to the addition of fields
133*440a403fSchristos;  in zlib in the deflate_state structure since the asm code was first written
134*440a403fSchristos;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
135*440a403fSchristos;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
136*440a403fSchristos;  (if you compile with zlib 1.2.2.2 or later, use "zlib1222add equ 8").
137*440a403fSchristos
138*440a403fSchristos
139*440a403fSchristosIFDEF INFOZIP
140*440a403fSchristos
141*440a403fSchristos_DATA   SEGMENT
142*440a403fSchristosCOMM    window_size:DWORD
143*440a403fSchristos; WMask ; 7fff
144*440a403fSchristosCOMM    window:BYTE:010040H
145*440a403fSchristosCOMM    prev:WORD:08000H
146*440a403fSchristos; MatchLen : unused
147*440a403fSchristos; PrevMatch : unused
148*440a403fSchristosCOMM    strstart:DWORD
149*440a403fSchristosCOMM    match_start:DWORD
150*440a403fSchristos; Lookahead : ignore
151*440a403fSchristosCOMM    prev_length:DWORD ; PrevLen
152*440a403fSchristosCOMM    max_chain_length:DWORD
153*440a403fSchristosCOMM    good_match:DWORD
154*440a403fSchristosCOMM    nice_match:DWORD
155*440a403fSchristosprev_ad equ OFFSET prev
156*440a403fSchristoswindow_ad equ OFFSET window
157*440a403fSchristosnicematch equ nice_match
158*440a403fSchristos_DATA ENDS
159*440a403fSchristosWMask equ 07fffh
160*440a403fSchristos
161*440a403fSchristosELSE
162*440a403fSchristos
163*440a403fSchristos  IFNDEF zlib1222add
164*440a403fSchristos    zlib1222add equ 8
165*440a403fSchristos  ENDIF
166*440a403fSchristosdsWSize         equ 56+zlib1222add+(zlib1222add/2)
167*440a403fSchristosdsWMask         equ 64+zlib1222add+(zlib1222add/2)
168*440a403fSchristosdsWindow        equ 72+zlib1222add
169*440a403fSchristosdsPrev          equ 88+zlib1222add
170*440a403fSchristosdsMatchLen      equ 128+zlib1222add
171*440a403fSchristosdsPrevMatch     equ 132+zlib1222add
172*440a403fSchristosdsStrStart      equ 140+zlib1222add
173*440a403fSchristosdsMatchStart    equ 144+zlib1222add
174*440a403fSchristosdsLookahead     equ 148+zlib1222add
175*440a403fSchristosdsPrevLen       equ 152+zlib1222add
176*440a403fSchristosdsMaxChainLen   equ 156+zlib1222add
177*440a403fSchristosdsGoodMatch     equ 172+zlib1222add
178*440a403fSchristosdsNiceMatch     equ 176+zlib1222add
179*440a403fSchristos
180*440a403fSchristoswindow_size     equ [ rcx + dsWSize]
181*440a403fSchristosWMask           equ [ rcx + dsWMask]
182*440a403fSchristoswindow_ad       equ [ rcx + dsWindow]
183*440a403fSchristosprev_ad         equ [ rcx + dsPrev]
184*440a403fSchristosstrstart        equ [ rcx + dsStrStart]
185*440a403fSchristosmatch_start     equ [ rcx + dsMatchStart]
186*440a403fSchristosLookahead       equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
187*440a403fSchristosprev_length     equ [ rcx + dsPrevLen]
188*440a403fSchristosmax_chain_length equ [ rcx + dsMaxChainLen]
189*440a403fSchristosgood_match      equ [ rcx + dsGoodMatch]
190*440a403fSchristosnice_match      equ [ rcx + dsNiceMatch]
191*440a403fSchristosENDIF
192*440a403fSchristos
193*440a403fSchristos; zlib: parameter 1 in rcx (deflate_state *s), param 2 in edx (cur_match); INFOZIP: cur_match in ecx
194*440a403fSchristos
195*440a403fSchristos; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
196*440a403fSchristos; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
197*440a403fSchristos;
198*440a403fSchristos; All registers must be preserved across the call, except for
199*440a403fSchristos;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
200*440a403fSchristos
201*440a403fSchristos
202*440a403fSchristos
203*440a403fSchristos;;; Save the callee-saved registers this procedure uses into the save_*
204*440a403fSchristos;;; slots defined above (rsp is left unchanged).
205*440a403fSchristos
206*440a403fSchristos
207*440a403fSchristos;;; Retrieve the function arguments. r8d will hold cur_match
208*440a403fSchristos;;; throughout the entire function. rcx keeps the pointer to the
209*440a403fSchristos;;; deflate_state structure (zlib build); edx is reused later for
210*440a403fSchristos;;; chainlenwmask in the main loop.
211*440a403fSchristos
212*440a403fSchristos; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
213*440a403fSchristos
214*440a403fSchristos; the 32-bit mov into r8d below clears the high 32 bits of r8, which could otherwise hold garbage
215*440a403fSchristos
216*440a403fSchristos        mov [save_rdi],rdi
217*440a403fSchristos        mov [save_rsi],rsi
218*440a403fSchristos        mov [save_rbx],rbx
219*440a403fSchristos        mov [save_rbp],rbp
220*440a403fSchristosIFDEF INFOZIP
221*440a403fSchristos        mov r8d,ecx                     ; INFOZIP: cur_match arrives in ecx
222*440a403fSchristosELSE
223*440a403fSchristos        mov r8d,edx                     ; zlib: cur_match arrives in edx
224*440a403fSchristosENDIF
225*440a403fSchristos        mov [save_r12],r12
226*440a403fSchristos        mov [save_r13],r13
227*440a403fSchristos;        mov [save_r14],r14
228*440a403fSchristos;        mov [save_r15],r15
229*440a403fSchristos
230*440a403fSchristos
231*440a403fSchristos;;; uInt wmask = s->w_mask;
232*440a403fSchristos;;; unsigned chain_length = s->max_chain_length;
233*440a403fSchristos;;; if (s->prev_length >= s->good_match) {
234*440a403fSchristos;;;     chain_length >>= 2;
235*440a403fSchristos;;; }
236*440a403fSchristos
237*440a403fSchristos        mov edi, prev_length
238*440a403fSchristos        mov esi, good_match
239*440a403fSchristos        mov eax, WMask                  ; eax = wmask (constant 07fffh for INFOZIP)
240*440a403fSchristos        mov ebx, max_chain_length       ; ebx = chain_length
241*440a403fSchristos        cmp edi, esi
242*440a403fSchristos        jl  LastMatchGood               ; signed test: skip the >>2 when prev_length < good_match
243*440a403fSchristos        shr ebx, 2
244*440a403fSchristosLastMatchGood:
245*440a403fSchristos
246*440a403fSchristos;;; chainlen is decremented once beforehand so that the function can
247*440a403fSchristos;;; use the sign flag instead of the zero flag for the exit test.
248*440a403fSchristos;;; It is then shifted into the high word, to make room for the wmask
249*440a403fSchristos;;; value, which it will always accompany.
250*440a403fSchristos
251*440a403fSchristos        dec ebx
252*440a403fSchristos        shl ebx, 16
253*440a403fSchristos        or  ebx, eax                    ; ebx = ((chain_length-1) << 16) | wmask
254*440a403fSchristos
255*440a403fSchristos;;; on zlib only
256*440a403fSchristos;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
257*440a403fSchristos
258*440a403fSchristosIFDEF INFOZIP
259*440a403fSchristos        mov [chainlenwmask], ebx
260*440a403fSchristos; on infozip nice_match = [nice_match]
261*440a403fSchristosELSE
262*440a403fSchristos        mov eax, nice_match
263*440a403fSchristos        mov [chainlenwmask], ebx
264*440a403fSchristos        mov r10d, Lookahead
265*440a403fSchristos        cmp r10d, eax
266*440a403fSchristos        cmovnl r10d, eax                ; r10d = min(lookahead, nice_match), signed compare
267*440a403fSchristos        mov [nicematch],r10d
268*440a403fSchristosENDIF
269*440a403fSchristos
270*440a403fSchristos;;; register Bytef *scan = s->window + s->strstart;
271*440a403fSchristos        mov r10, window_ad              ; r10 = window base
272*440a403fSchristos        mov ebp, strstart               ; ebp = strstart (becomes limit further down)
273*440a403fSchristos        lea r13, [r10 + rbp]            ; r13 = scan = window + strstart
274*440a403fSchristos
275*440a403fSchristos;;; Determine how many bytes the scan ptr is off from being
276*440a403fSchristos;;; dword-aligned.
277*440a403fSchristos
278*440a403fSchristos         mov r9,r13                     ; r9 = scan, kept for the rest of the function
279*440a403fSchristos         neg r13
280*440a403fSchristos         and r13,3                      ; r13 = scanalign = (-scan) & 3
281*440a403fSchristos
282*440a403fSchristos;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
283*440a403fSchristos;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
284*440a403fSchristosIFDEF INFOZIP
285*440a403fSchristos        mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) = 0x8000-(258+3+1)
286*440a403fSchristosELSE
287*440a403fSchristos        mov eax, window_size
288*440a403fSchristos        sub eax, MIN_LOOKAHEAD          ; eax = window_size - MIN_LOOKAHEAD
289*440a403fSchristosENDIF
290*440a403fSchristos        xor edi,edi
291*440a403fSchristos        sub ebp, eax                    ; ebp = strstart - eax; flags feed the cmovng below
292*440a403fSchristos
293*440a403fSchristos        mov r11d, prev_length           ; r11d = best_len (mov leaves the flags untouched)
294*440a403fSchristos
295*440a403fSchristos        cmovng ebp,edi                  ; limit = 0 when the difference is <= 0 (signed)
296*440a403fSchristos
297*440a403fSchristos;;; int best_len = s->prev_length;
298*440a403fSchristos
299*440a403fSchristos
300*440a403fSchristos;;; Keep the sum of s->window + best_len in rsi.
301*440a403fSchristos
302*440a403fSchristos       lea  rsi,[r10+r11]
303*440a403fSchristos
304*440a403fSchristos;;; register ush scan_start = *(ushf*)scan;
305*440a403fSchristos;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
306*440a403fSchristos;;; Posf *prev = s->prev;
307*440a403fSchristos
308*440a403fSchristos        movzx r12d,word ptr [r9]        ; r12w = scan_start
309*440a403fSchristos        movzx ebx, word ptr [r9 + r11 - 1] ; bx = scan_end
310*440a403fSchristos
311*440a403fSchristos        mov rdi, prev_ad                ; rdi = prev
312*440a403fSchristos
313*440a403fSchristos;;; Jump into the main loop.
314*440a403fSchristos
315*440a403fSchristos        mov edx, [chainlenwmask]
316*440a403fSchristos
317*440a403fSchristos        cmp bx,word ptr [rsi + r8 - 1]  ; scan_end vs. the word at window + cur_match + best_len - 1
318*440a403fSchristos        jz  LookupLoopIsZero
319*440a403fSchristos
320*440a403fSchristosLookupLoop1:
321*440a403fSchristos        and r8d, edx                    ; index = cur_match & edx (wmask lives in the low word)
322*440a403fSchristos
323*440a403fSchristos        movzx   r8d, word ptr [rdi + r8*2] ; cur_match = prev[index]
324*440a403fSchristos        cmp r8d, ebp
325*440a403fSchristos        jbe LeaveNow                    ; stop once cur_match <= limit (unsigned)
326*440a403fSchristos        sub edx, 00010000h              ; --chainlen (high word of edx)
327*440a403fSchristos        js  LeaveNow                    ; chain budget exhausted
328*440a403fSchristos
329*440a403fSchristosLoopEntry1:
330*440a403fSchristos        cmp bx,word ptr [rsi + r8 - 1]
331*440a403fSchristos        jz  LookupLoopIsZero
332*440a403fSchristos
333*440a403fSchristosLookupLoop2:                            ; second unrolled copy of LookupLoop1
334*440a403fSchristos        and r8d, edx
335*440a403fSchristos
336*440a403fSchristos        movzx   r8d, word ptr [rdi + r8*2]
337*440a403fSchristos        cmp r8d, ebp
338*440a403fSchristos        jbe LeaveNow
339*440a403fSchristos        sub edx, 00010000h
340*440a403fSchristos        js  LeaveNow
341*440a403fSchristos
342*440a403fSchristosLoopEntry2:
343*440a403fSchristos        cmp bx,word ptr [rsi + r8 - 1]
344*440a403fSchristos        jz  LookupLoopIsZero
345*440a403fSchristos
346*440a403fSchristosLookupLoop4:                            ; third unrolled copy; its tail branches back to LookupLoop1
347*440a403fSchristos        and r8d, edx
348*440a403fSchristos
349*440a403fSchristos        movzx   r8d, word ptr [rdi + r8*2]
350*440a403fSchristos        cmp r8d, ebp
351*440a403fSchristos        jbe LeaveNow
352*440a403fSchristos        sub edx, 00010000h
353*440a403fSchristos        js  LeaveNow
354*440a403fSchristos
355*440a403fSchristosLoopEntry4:
356*440a403fSchristos
357*440a403fSchristos        cmp bx,word ptr [rsi + r8 - 1]
358*440a403fSchristos        jnz LookupLoop1
359*440a403fSchristos        jmp LookupLoopIsZero
360*440a403fSchristos
361*440a403fSchristos
362*440a403fSchristos;;; do {
363*440a403fSchristos;;;     match = s->window + cur_match;
364*440a403fSchristos;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
365*440a403fSchristos;;;         *(ushf*)match != scan_start) continue;
366*440a403fSchristos;;;     [...]
367*440a403fSchristos;;; } while ((cur_match = prev[cur_match & wmask]) > limit
368*440a403fSchristos;;;          && --chain_length != 0);
369*440a403fSchristos;;;
370*440a403fSchristos;;; Here is the inner loop of the function. The function will spend the
371*440a403fSchristos;;; majority of its time in this loop, and majority of that time will
372*440a403fSchristos;;; be spent in the first ten instructions.
373*440a403fSchristos;;;
374*440a403fSchristos;;; Within this loop:
375*440a403fSchristos;;; ebx = scanend
376*440a403fSchristos;;; r8d = curmatch
377*440a403fSchristos;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
378*440a403fSchristos;;; rsi = windowbestlen - i.e., (window + bestlen)
379*440a403fSchristos;;; rdi = prev
380*440a403fSchristos;;; ebp = limit
381*440a403fSchristos
382*440a403fSchristosLookupLoop:
383*440a403fSchristos        and r8d, edx
384*440a403fSchristos
385*440a403fSchristos        movzx   r8d, word ptr [rdi + r8*2]
386*440a403fSchristos        cmp r8d, ebp
387*440a403fSchristos        jbe LeaveNow
388*440a403fSchristos        sub edx, 00010000h
389*440a403fSchristos        js  LeaveNow
390*440a403fSchristos
391*440a403fSchristosLoopEntry:
392*440a403fSchristos
393*440a403fSchristos        cmp bx,word ptr [rsi + r8 - 1]
394*440a403fSchristos        jnz LookupLoop1
395*440a403fSchristosLookupLoopIsZero:
396*440a403fSchristos        cmp     r12w, word ptr [r10 + r8] ; scan_start vs. the first two bytes at window + cur_match
397*440a403fSchristos        jnz LookupLoop1
398*440a403fSchristos
399*440a403fSchristos
400*440a403fSchristos;;; Store the current value of chainlen (rdx is reused by the compare loop).
401*440a403fSchristos        mov [chainlenwmask], edx
402*440a403fSchristos
403*440a403fSchristos;;; Point rdi to the string under scrutiny, and rsi to the string we
404*440a403fSchristos;;; are hoping to match it up with. In actuality, rsi and rdi are
405*440a403fSchristos;;; both pointed (MAX_MATCH_8 + scanalign) bytes ahead, and rdx is
406*440a403fSchristos;;; initialized to -MAX_MATCH_8, so [reg + rdx] starts scanalign bytes in.
407*440a403fSchristos
408*440a403fSchristos        lea rsi,[r8+r10]                ; rsi = match = window + cur_match
409*440a403fSchristos        mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
410*440a403fSchristos        lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
411*440a403fSchristos        lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
412*440a403fSchristos
413*440a403fSchristos        prefetcht1 [rsi+rdx]
414*440a403fSchristos        prefetcht1 [rdi+rdx]
415*440a403fSchristos
416*440a403fSchristos
417*440a403fSchristos;;; Test the strings for equality, 8 bytes at a time. At the end,
418*440a403fSchristos;;; adjust rdx so that it is offset to the exact byte that mismatched.
419*440a403fSchristos;;;
420*440a403fSchristos;;; We already know at this point that the first three bytes of the
421*440a403fSchristos;;; strings match each other, and they can be safely passed over before
422*440a403fSchristos;;; starting the compare loop. So what this code does is skip over 0-3
423*440a403fSchristos;;; bytes, as much as necessary in order to dword-align the rdi
424*440a403fSchristos;;; pointer. (rsi will still be misaligned three times out of four.)
425*440a403fSchristos;;;
426*440a403fSchristos;;; It should be confessed that this loop usually does not represent
427*440a403fSchristos;;; much of the total running time. Replacing it with a more
428*440a403fSchristos;;; straightforward "rep cmpsb" would not drastically degrade
429*440a403fSchristos;;; performance.
430*440a403fSchristos
431*440a403fSchristos
432*440a403fSchristosLoopCmps:
433*440a403fSchristos        mov rax, [rsi + rdx]
434*440a403fSchristos        xor rax, [rdi + rdx]            ; rax != 0 marks the differing bytes of this qword
435*440a403fSchristos        jnz LeaveLoopCmps
436*440a403fSchristos
437*440a403fSchristos        mov rax, [rsi + rdx + 8]
438*440a403fSchristos        xor rax, [rdi + rdx + 8]
439*440a403fSchristos        jnz LeaveLoopCmps8
440*440a403fSchristos
441*440a403fSchristos
442*440a403fSchristos        mov rax, [rsi + rdx + 8+8]
443*440a403fSchristos        xor rax, [rdi + rdx + 8+8]
444*440a403fSchristos        jnz LeaveLoopCmps16
445*440a403fSchristos
446*440a403fSchristos        add rdx,8+8+8                   ; 24 bytes matched; rdx reaches 0 after MAX_MATCH_8 bytes
447*440a403fSchristos
448*440a403fSchristos        jnz short LoopCmps
449*440a403fSchristos        jmp short LenMaximum
450*440a403fSchristosLeaveLoopCmps16: add rdx,8
451*440a403fSchristosLeaveLoopCmps8: add rdx,8
452*440a403fSchristosLeaveLoopCmps:
453*440a403fSchristos
454*440a403fSchristos        test    eax, 0000FFFFh          ; difference within bytes 0-1 of the qword?
455*440a403fSchristos        jnz LenLower
456*440a403fSchristos
457*440a403fSchristos        test eax,0ffffffffh             ; difference within bytes 2-3?
458*440a403fSchristos
459*440a403fSchristos        jnz LenLower32
460*440a403fSchristos
461*440a403fSchristos        add rdx,4                       ; low dword matched: examine the high dword
462*440a403fSchristos        shr rax,32
463*440a403fSchristos        or ax,ax                        ; difference within bytes 4-5?
464*440a403fSchristos        jnz LenLower
465*440a403fSchristos
466*440a403fSchristosLenLower32:
467*440a403fSchristos        shr eax,16                      ; low word matched: move on to the next word
468*440a403fSchristos        add rdx,2
469*440a403fSchristosLenLower:   sub al, 1                   ; carry is set only when the low byte is 0 (that byte matched)
470*440a403fSchristos        adc rdx, 0                      ; ... in which case the mismatch is one byte further
471*440a403fSchristos;;; Calculate the length of the match. If it is MAX_MATCH or longer,
472*440a403fSchristos;;; then automatically accept it as the best possible match and leave.
473*440a403fSchristos
474*440a403fSchristos        lea rax, [rdi + rdx]            ; address of the first mismatching byte in scan
475*440a403fSchristos        sub rax, r9                     ; eax = len = that address - scan
476*440a403fSchristos        cmp eax, MAX_MATCH
477*440a403fSchristos        jge LenMaximum
478*440a403fSchristos
479*440a403fSchristos;;; If the length of the match is not longer than the best match we
480*440a403fSchristos;;; have so far, then forget it and return to the lookup loop.
481*440a403fSchristos;///////////////////////////////////
482*440a403fSchristos
483*440a403fSchristos        cmp eax, r11d
484*440a403fSchristos        jg  LongerMatch
485*440a403fSchristos
486*440a403fSchristos        lea rsi,[r10+r11]               ; reload the lookup-loop registers clobbered by the compare
487*440a403fSchristos
488*440a403fSchristos        mov rdi, prev_ad
489*440a403fSchristos        mov edx, [chainlenwmask]
490*440a403fSchristos        jmp LookupLoop
491*440a403fSchristos
492*440a403fSchristos;;;         s->match_start = cur_match;
493*440a403fSchristos;;;         best_len = len;
494*440a403fSchristos;;;         if (len >= nice_match) break;
495*440a403fSchristos;;;         scan_end = *(ushf*)(scan+best_len-1);
496*440a403fSchristos
497*440a403fSchristosLongerMatch:
498*440a403fSchristos        mov r11d, eax                   ; best_len = len
499*440a403fSchristos        mov match_start, r8d
500*440a403fSchristos        cmp eax, [nicematch]
501*440a403fSchristos        jge LeaveNow
502*440a403fSchristos
503*440a403fSchristos        lea rsi,[r10+rax]               ; rsi = window + new best_len
504*440a403fSchristos
505*440a403fSchristos        movzx   ebx, word ptr [r9 + rax - 1] ; refresh scan_end for the new best_len
506*440a403fSchristos        mov rdi, prev_ad
507*440a403fSchristos        mov edx, [chainlenwmask]
508*440a403fSchristos        jmp LookupLoop
509*440a403fSchristos
510*440a403fSchristos;;; Accept the current string, with the maximum possible length.
511*440a403fSchristos
512*440a403fSchristosLenMaximum:
513*440a403fSchristos        mov r11d,MAX_MATCH
514*440a403fSchristos        mov match_start, r8d
515*440a403fSchristos
516*440a403fSchristos;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
517*440a403fSchristos;;; return s->lookahead;
518*440a403fSchristos
519*440a403fSchristosLeaveNow:
520*440a403fSchristosIFDEF INFOZIP
521*440a403fSchristos        mov eax,r11d                    ; INFOZIP: return best_len unchanged
522*440a403fSchristosELSE
523*440a403fSchristos        mov eax, Lookahead
524*440a403fSchristos        cmp r11d, eax
525*440a403fSchristos        cmovng eax, r11d                ; eax = min(best_len, lookahead), signed compare
526*440a403fSchristosENDIF
527*440a403fSchristos
528*440a403fSchristos;;; Restore the saved registers and return from whence we came.
529*440a403fSchristos
530*440a403fSchristos
531*440a403fSchristos        mov rsi,[save_rsi]
532*440a403fSchristos        mov rdi,[save_rdi]
533*440a403fSchristos        mov rbx,[save_rbx]
534*440a403fSchristos        mov rbp,[save_rbp]
535*440a403fSchristos        mov r12,[save_r12]
536*440a403fSchristos        mov r13,[save_r13]
537*440a403fSchristos;        mov r14,[save_r14]
538*440a403fSchristos;        mov r15,[save_r15]
539*440a403fSchristos
540*440a403fSchristos
541*440a403fSchristos        ret 0
542*440a403fSchristos; please don't remove this string !
543*440a403fSchristos; You can freely use gvmat64 in any free or commercial app
544*440a403fSchristos; but it is far better not to remove the string from the binary!
545*440a403fSchristos    db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
546*440a403fSchristoslongest_match   ENDP
547*440a403fSchristos
548*440a403fSchristosmatch_init PROC                         ; no-op stub: the body is a bare return
549*440a403fSchristos  ret 0
550*440a403fSchristosmatch_init ENDP
551*440a403fSchristos
552*440a403fSchristos
553*440a403fSchristosEND
554