1; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86
2; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant.
3; File written by Gilles Vollant, by modifiying the longest_match
4;  from Jean-loup Gailly in deflate.c
5;
6;         http://www.zlib.net
7;         http://www.winimage.com/zLibDll
8;         http://www.muppetlabs.com/~breadbox/software/assembly.html
9;
10; For Visual C++ 4.x and higher and ML 6.x and higher
11;   ml.exe is in directory \MASM611C of Win95 DDK
12;   ml.exe is also distributed in http://www.masm32.com/masmdl.htm
13;    and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
14;
15; this file contain two implementation of longest_match
16;
17;  longest_match_7fff : written 1996 by Gilles Vollant optimized for
18;            first Pentium. Assume s->w_mask == 0x7fff
19;  longest_match_686 : written by Brian raiter (1998), optimized for Pentium Pro
20;
21;  for using an seembly version of longest_match, you need define ASMV in project
22;  There is two way in using gvmat32.asm
23;
24;  A) Suggested method
25;    if you want include both longest_match_7fff and longest_match_686
26;    compile the asm file running
27;           ml /coff /Zi /Flgvmat32.lst /c gvmat32.asm
28;    and include gvmat32c.c in your project
29;    if you have an old cpu (386,486 or first Pentium) and s->w_mask==0x7fff,
30;        longest_match_7fff will be used
31;    if you have a more modern CPU (Pentium Pro, II and higher)
32;        longest_match_686 will be used
33;    on old cpu with s->w_mask!=0x7fff, longest_match_686 will be used,
34;        but this is not a sitation you'll find often
35;
36;  B) Alternative
37;    if you are not interresed in old cpu performance and want the smaller
38;       binaries possible
39;
40;    compile the asm file running
41;           ml /coff /Zi /c /Flgvmat32.lst /DNOOLDPENTIUMCODE gvmat32.asm
42;    and do not include gvmat32c.c in your project (ou define also
43;              NOOLDPENTIUMCODE)
44;
45; note : as I known, longest_match_686 is very faster than longest_match_7fff
46;        on pentium Pro/II/III, faster (but less) in P4, but it seem
47;        longest_match_7fff can be faster (very very litte) on AMD Athlon64/K8
48;
49; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2
50
51;uInt longest_match_7fff(s, cur_match)
52;    deflate_state *s;
53;    IPos cur_match;                             /* current match */
54
55    NbStack         equ     76
56    cur_match       equ     dword ptr[esp+NbStack-0]
57    str_s           equ     dword ptr[esp+NbStack-4]
58; 5 dword on top (ret,ebp,esi,edi,ebx)
59    adrret          equ     dword ptr[esp+NbStack-8]
60    pushebp         equ     dword ptr[esp+NbStack-12]
61    pushedi         equ     dword ptr[esp+NbStack-16]
62    pushesi         equ     dword ptr[esp+NbStack-20]
63    pushebx         equ     dword ptr[esp+NbStack-24]
64
65    chain_length    equ     dword ptr [esp+NbStack-28]
66    limit           equ     dword ptr [esp+NbStack-32]
67    best_len        equ     dword ptr [esp+NbStack-36]
68    window          equ     dword ptr [esp+NbStack-40]
69    prev            equ     dword ptr [esp+NbStack-44]
70    scan_start      equ      word ptr [esp+NbStack-48]
71    wmask           equ     dword ptr [esp+NbStack-52]
72    match_start_ptr equ     dword ptr [esp+NbStack-56]
73    nice_match      equ     dword ptr [esp+NbStack-60]
74    scan            equ     dword ptr [esp+NbStack-64]
75
76    windowlen       equ     dword ptr [esp+NbStack-68]
77    match_start     equ     dword ptr [esp+NbStack-72]
78    strend          equ     dword ptr [esp+NbStack-76]
79    NbStackAdd      equ     (NbStack-24)
80
81    .386p
82
83    name    gvmatch
84    .MODEL  FLAT
85
86
87
88;  all the +zlib1222add offsets are due to the addition of fields
89;  in zlib in the deflate_state structure since the asm code was first written
90;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
91;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
92;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
93
94    zlib1222add         equ     8
95
96;  Note : these value are good with a 8 bytes boundary pack structure
97    dep_chain_length    equ     74h+zlib1222add
98    dep_window          equ     30h+zlib1222add
99    dep_strstart        equ     64h+zlib1222add
100    dep_prev_length     equ     70h+zlib1222add
101    dep_nice_match      equ     88h+zlib1222add
102    dep_w_size          equ     24h+zlib1222add
103    dep_prev            equ     38h+zlib1222add
104    dep_w_mask          equ     2ch+zlib1222add
105    dep_good_match      equ     84h+zlib1222add
106    dep_match_start     equ     68h+zlib1222add
107    dep_lookahead       equ     6ch+zlib1222add
108
109
110_TEXT                   segment
111
112IFDEF NOUNDERLINE
113   IFDEF NOOLDPENTIUMCODE
114            public  longest_match
115            public  match_init
116   ELSE
117            public  longest_match_7fff
118            public  cpudetect32
119            public  longest_match_686
120   ENDIF
121ELSE
122   IFDEF NOOLDPENTIUMCODE
123            public  _longest_match
124            public  _match_init
125   ELSE
126            public  _longest_match_7fff
127            public  _cpudetect32
128            public  _longest_match_686
129   ENDIF
130ENDIF
131
132    MAX_MATCH           equ     258
133    MIN_MATCH           equ     3
134    MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
135
136
137
138IFNDEF NOOLDPENTIUMCODE
139IFDEF NOUNDERLINE
140longest_match_7fff   proc near
141ELSE
142_longest_match_7fff  proc near
143ENDIF
144
145    mov     edx,[esp+4]
146
147
148
149    push    ebp
150    push    edi
151    push    esi
152    push    ebx
153
154    sub     esp,NbStackAdd
155
156; initialize or check the variables used in match.asm.
157    mov     ebp,edx
158
159; chain_length = s->max_chain_length
160; if (prev_length>=good_match) chain_length >>= 2
161    mov     edx,[ebp+dep_chain_length]
162    mov     ebx,[ebp+dep_prev_length]
163    cmp     [ebp+dep_good_match],ebx
164    ja      noshr
165    shr     edx,2
166noshr:
167; we increment chain_length because in the asm, the --chain_lenght is in the beginning of the loop
168    inc     edx
169    mov     edi,[ebp+dep_nice_match]
170    mov     chain_length,edx
171    mov     eax,[ebp+dep_lookahead]
172    cmp     eax,edi
173; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
174    jae     nolookaheadnicematch
175    mov     edi,eax
176nolookaheadnicematch:
177; best_len = s->prev_length
178    mov     best_len,ebx
179
180; window = s->window
181    mov     esi,[ebp+dep_window]
182    mov     ecx,[ebp+dep_strstart]
183    mov     window,esi
184
185    mov     nice_match,edi
186; scan = window + strstart
187    add     esi,ecx
188    mov     scan,esi
189; dx = *window
190    mov     dx,word ptr [esi]
191; bx = *(window+best_len-1)
192    mov     bx,word ptr [esi+ebx-1]
193    add     esi,MAX_MATCH-1
194; scan_start = *scan
195    mov     scan_start,dx
196; strend = scan + MAX_MATCH-1
197    mov     strend,esi
198; bx = scan_end = *(window+best_len-1)
199
200;    IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
201;        s->strstart - (IPos)MAX_DIST(s) : NIL;
202
203    mov     esi,[ebp+dep_w_size]
204    sub     esi,MIN_LOOKAHEAD
205; here esi = MAX_DIST(s)
206    sub     ecx,esi
207    ja      nodist
208    xor     ecx,ecx
209nodist:
210    mov     limit,ecx
211
212; prev = s->prev
213    mov     edx,[ebp+dep_prev]
214    mov     prev,edx
215
216;
217    mov     edx,dword ptr [ebp+dep_match_start]
218    mov     bp,scan_start
219    mov     eax,cur_match
220    mov     match_start,edx
221
222    mov     edx,window
223    mov     edi,edx
224    add     edi,best_len
225    mov     esi,prev
226    dec     edi
227; windowlen = window + best_len -1
228    mov     windowlen,edi
229
230    jmp     beginloop2
231    align   4
232
233; here, in the loop
234;       eax = ax = cur_match
235;       ecx = limit
236;        bx = scan_end
237;        bp = scan_start
238;       edi = windowlen (window + best_len -1)
239;       esi = prev
240
241
242;// here; chain_length <=16
243normalbeg0add16:
244    add     chain_length,16
245    jz      exitloop
246normalbeg0:
247    cmp     word ptr[edi+eax],bx
248    je      normalbeg2noroll
249rcontlabnoroll:
250; cur_match = prev[cur_match & wmask]
251    and     eax,7fffh
252    mov     ax,word ptr[esi+eax*2]
253; if cur_match > limit, go to exitloop
254    cmp     ecx,eax
255    jnb     exitloop
256; if --chain_length != 0, go to exitloop
257    dec     chain_length
258    jnz     normalbeg0
259    jmp     exitloop
260
261normalbeg2noroll:
262; if (scan_start==*(cur_match+window)) goto normalbeg2
263    cmp     bp,word ptr[edx+eax]
264    jne     rcontlabnoroll
265    jmp     normalbeg2
266
267contloop3:
268    mov     edi,windowlen
269
270; cur_match = prev[cur_match & wmask]
271    and     eax,7fffh
272    mov     ax,word ptr[esi+eax*2]
273; if cur_match > limit, go to exitloop
274    cmp     ecx,eax
275jnbexitloopshort1:
276    jnb     exitloop
277; if --chain_length != 0, go to exitloop
278
279
280; begin the main loop
281beginloop2:
282    sub     chain_length,16+1
283; if chain_length <=16, don't use the unrolled loop
284    jna     normalbeg0add16
285
286do16:
287    cmp     word ptr[edi+eax],bx
288    je      normalbeg2dc0
289
290maccn   MACRO   lab
291    and     eax,7fffh
292    mov     ax,word ptr[esi+eax*2]
293    cmp     ecx,eax
294    jnb     exitloop
295    cmp     word ptr[edi+eax],bx
296    je      lab
297    ENDM
298
299rcontloop0:
300    maccn   normalbeg2dc1
301
302rcontloop1:
303    maccn   normalbeg2dc2
304
305rcontloop2:
306    maccn   normalbeg2dc3
307
308rcontloop3:
309    maccn   normalbeg2dc4
310
311rcontloop4:
312    maccn   normalbeg2dc5
313
314rcontloop5:
315    maccn   normalbeg2dc6
316
317rcontloop6:
318    maccn   normalbeg2dc7
319
320rcontloop7:
321    maccn   normalbeg2dc8
322
323rcontloop8:
324    maccn   normalbeg2dc9
325
326rcontloop9:
327    maccn   normalbeg2dc10
328
329rcontloop10:
330    maccn   short normalbeg2dc11
331
332rcontloop11:
333    maccn   short normalbeg2dc12
334
335rcontloop12:
336    maccn   short normalbeg2dc13
337
338rcontloop13:
339    maccn   short normalbeg2dc14
340
341rcontloop14:
342    maccn   short normalbeg2dc15
343
344rcontloop15:
345    and     eax,7fffh
346    mov     ax,word ptr[esi+eax*2]
347    cmp     ecx,eax
348    jnb     exitloop
349
350    sub     chain_length,16
351    ja      do16
352    jmp     normalbeg0add16
353
354;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
355
356normbeg MACRO   rcontlab,valsub
357; if we are here, we know that *(match+best_len-1) == scan_end
358    cmp     bp,word ptr[edx+eax]
359; if (match != scan_start) goto rcontlab
360    jne     rcontlab
361; calculate the good chain_length, and we'll compare scan and match string
362    add     chain_length,16-valsub
363    jmp     iseq
364    ENDM
365
366
367normalbeg2dc11:
368    normbeg rcontloop11,11
369
370normalbeg2dc12:
371    normbeg short rcontloop12,12
372
373normalbeg2dc13:
374    normbeg short rcontloop13,13
375
376normalbeg2dc14:
377    normbeg short rcontloop14,14
378
379normalbeg2dc15:
380    normbeg short rcontloop15,15
381
382normalbeg2dc10:
383    normbeg rcontloop10,10
384
385normalbeg2dc9:
386    normbeg rcontloop9,9
387
388normalbeg2dc8:
389    normbeg rcontloop8,8
390
391normalbeg2dc7:
392    normbeg rcontloop7,7
393
394normalbeg2dc6:
395    normbeg rcontloop6,6
396
397normalbeg2dc5:
398    normbeg rcontloop5,5
399
400normalbeg2dc4:
401    normbeg rcontloop4,4
402
403normalbeg2dc3:
404    normbeg rcontloop3,3
405
406normalbeg2dc2:
407    normbeg rcontloop2,2
408
409normalbeg2dc1:
410    normbeg rcontloop1,1
411
412normalbeg2dc0:
413    normbeg rcontloop0,0
414
415
416; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end
417
418normalbeg2:
419    mov     edi,window
420
421    cmp     bp,word ptr[edi+eax]
422    jne     contloop3                   ; if *(ushf*)match != scan_start, continue
423
424iseq:
425; if we are here, we know that *(match+best_len-1) == scan_end
426; and (match == scan_start)
427
428    mov     edi,edx
429    mov     esi,scan                    ; esi = scan
430    add     edi,eax                     ; edi = window + cur_match = match
431
432    mov     edx,[esi+3]                 ; compare manually dword at match+3
433    xor     edx,[edi+3]                 ; and scan +3
434
435    jz      begincompare                ; if equal, go to long compare
436
437; we will determine the unmatch byte and calculate len (in esi)
438    or      dl,dl
439    je      eq1rr
440    mov     esi,3
441    jmp     trfinval
442eq1rr:
443    or      dx,dx
444    je      eq1
445
446    mov     esi,4
447    jmp     trfinval
448eq1:
449    and     edx,0ffffffh
450    jz      eq11
451    mov     esi,5
452    jmp     trfinval
453eq11:
454    mov     esi,6
455    jmp     trfinval
456
457begincompare:
458    ; here we now scan and match begin same
459    add     edi,6
460    add     esi,6
461    mov     ecx,(MAX_MATCH-(2+4))/4     ; scan for at most MAX_MATCH bytes
462    repe    cmpsd                       ; loop until mismatch
463
464    je      trfin                       ; go to trfin if not unmatch
465; we determine the unmatch byte
466    sub     esi,4
467    mov     edx,[edi-4]
468    xor     edx,[esi]
469
470    or      dl,dl
471    jnz     trfin
472    inc     esi
473
474    or      dx,dx
475    jnz     trfin
476    inc     esi
477
478    and     edx,0ffffffh
479    jnz     trfin
480    inc     esi
481
482trfin:
483    sub     esi,scan          ; esi = len
484trfinval:
485; here we have finised compare, and esi contain len of equal string
486    cmp     esi,best_len        ; if len > best_len, go newbestlen
487    ja      short newbestlen
488; now we restore edx, ecx and esi, for the big loop
489    mov     esi,prev
490    mov     ecx,limit
491    mov     edx,window
492    jmp     contloop3
493
494newbestlen:
495    mov     best_len,esi        ; len become best_len
496
497    mov     match_start,eax     ; save new position as match_start
498    cmp     esi,nice_match      ; if best_len >= nice_match, exit
499    jae     exitloop
500    mov     ecx,scan
501    mov     edx,window          ; restore edx=window
502    add     ecx,esi
503    add     esi,edx
504
505    dec     esi
506    mov     windowlen,esi       ; windowlen = window + best_len-1
507    mov     bx,[ecx-1]          ; bx = *(scan+best_len-1) = scan_end
508
509; now we restore ecx and esi, for the big loop :
510    mov     esi,prev
511    mov     ecx,limit
512    jmp     contloop3
513
514exitloop:
515; exit : s->match_start=match_start
516    mov     ebx,match_start
517    mov     ebp,str_s
518    mov     ecx,best_len
519    mov     dword ptr [ebp+dep_match_start],ebx
520    mov     eax,dword ptr [ebp+dep_lookahead]
521    cmp     ecx,eax
522    ja      minexlo
523    mov     eax,ecx
524minexlo:
525; return min(best_len,s->lookahead)
526
527; restore stack and register ebx,esi,edi,ebp
528    add     esp,NbStackAdd
529
530    pop     ebx
531    pop     esi
532    pop     edi
533    pop     ebp
534    ret
535InfoAuthor:
536; please don't remove this string !
537; Your are free use gvmat32 in any fre or commercial apps if you don't remove the string in the binary!
538    db     0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah
539
540
541
542IFDEF NOUNDERLINE
543longest_match_7fff   endp
544ELSE
545_longest_match_7fff  endp
546ENDIF
547
548
549IFDEF NOUNDERLINE
550cpudetect32     proc near
551ELSE
552_cpudetect32    proc near
553ENDIF
554
555    push    ebx
556
557    pushfd                  ; push original EFLAGS
558    pop     eax             ; get original EFLAGS
559    mov     ecx, eax        ; save original EFLAGS
560    xor     eax, 40000h     ; flip AC bit in EFLAGS
561    push    eax             ; save new EFLAGS value on stack
562    popfd                   ; replace current EFLAGS value
563    pushfd                  ; get new EFLAGS
564    pop     eax             ; store new EFLAGS in EAX
565    xor     eax, ecx        ; can�t toggle AC bit, processor=80386
566    jz      end_cpu_is_386  ; jump if 80386 processor
567    push    ecx
568    popfd                   ; restore AC bit in EFLAGS first
569
570    pushfd
571    pushfd
572    pop     ecx
573
574    mov     eax, ecx        ; get original EFLAGS
575    xor     eax, 200000h    ; flip ID bit in EFLAGS
576    push    eax             ; save new EFLAGS value on stack
577    popfd                   ; replace current EFLAGS value
578    pushfd                  ; get new EFLAGS
579    pop     eax             ; store new EFLAGS in EAX
580    popfd                   ; restore original EFLAGS
581    xor     eax, ecx        ; can�t toggle ID bit,
582    je      is_old_486      ; processor=old
583
584    mov     eax,1
585    db      0fh,0a2h        ;CPUID
586
587exitcpudetect:
588    pop ebx
589    ret
590
591end_cpu_is_386:
592    mov     eax,0300h
593    jmp     exitcpudetect
594
595is_old_486:
596    mov     eax,0400h
597    jmp     exitcpudetect
598
599IFDEF NOUNDERLINE
600cpudetect32     endp
601ELSE
602_cpudetect32    endp
603ENDIF
604ENDIF
605
606MAX_MATCH       equ     258
607MIN_MATCH       equ     3
608MIN_LOOKAHEAD   equ     (MAX_MATCH + MIN_MATCH + 1)
609MAX_MATCH_8_     equ     ((MAX_MATCH + 7) AND 0FFF0h)
610
611
612;;; stack frame offsets
613
614chainlenwmask   equ  esp + 0    ; high word: current chain len
615                    ; low word: s->wmask
616window      equ  esp + 4    ; local copy of s->window
617windowbestlen   equ  esp + 8    ; s->window + bestlen
618scanstart   equ  esp + 16   ; first two bytes of string
619scanend     equ  esp + 12   ; last two bytes of string
620scanalign   equ  esp + 20   ; dword-misalignment of string
621nicematch   equ  esp + 24   ; a good enough match size
622bestlen     equ  esp + 28   ; size of best match so far
623scan        equ  esp + 32   ; ptr to string wanting match
624
625LocalVarsSize   equ 36
626;   saved ebx   byte esp + 36
627;   saved edi   byte esp + 40
628;   saved esi   byte esp + 44
629;   saved ebp   byte esp + 48
630;   return address  byte esp + 52
631deflatestate    equ  esp + 56   ; the function arguments
632curmatch    equ  esp + 60
633
634;;; Offsets for fields in the deflate_state structure. These numbers
635;;; are calculated from the definition of deflate_state, with the
636;;; assumption that the compiler will dword-align the fields. (Thus,
637;;; changing the definition of deflate_state could easily cause this
638;;; program to crash horribly, without so much as a warning at
639;;; compile time. Sigh.)
640
641dsWSize     equ 36+zlib1222add
642dsWMask     equ 44+zlib1222add
643dsWindow    equ 48+zlib1222add
644dsPrev      equ 56+zlib1222add
645dsMatchLen  equ 88+zlib1222add
646dsPrevMatch equ 92+zlib1222add
647dsStrStart  equ 100+zlib1222add
648dsMatchStart    equ 104+zlib1222add
649dsLookahead equ 108+zlib1222add
650dsPrevLen   equ 112+zlib1222add
651dsMaxChainLen   equ 116+zlib1222add
652dsGoodMatch equ 132+zlib1222add
653dsNiceMatch equ 136+zlib1222add
654
655
656;;; match.asm -- Pentium-Pro-optimized version of longest_match()
657;;; Written for zlib 1.1.2
658;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
659;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html
660;;;
661;;; This is free software; you can redistribute it and/or modify it
662;;; under the terms of the GNU General Public License.
663
664;GLOBAL _longest_match, _match_init
665
666
667;SECTION    .text
668
669;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
670
671;_longest_match:
672IFDEF NOOLDPENTIUMCODE
673    IFDEF NOUNDERLINE
674    longest_match       proc near
675    ELSE
676    _longest_match      proc near
677    ENDIF
678ELSE
679    IFDEF NOUNDERLINE
680    longest_match_686   proc near
681    ELSE
682    _longest_match_686  proc near
683    ENDIF
684ENDIF
685
686;;; Save registers that the compiler may be using, and adjust esp to
687;;; make room for our stack frame.
688
689        push    ebp
690        push    edi
691        push    esi
692        push    ebx
693        sub esp, LocalVarsSize
694
695;;; Retrieve the function arguments. ecx will hold cur_match
696;;; throughout the entire function. edx will hold the pointer to the
697;;; deflate_state structure during the function's setup (before
698;;; entering the main loop.
699
700        mov edx, [deflatestate]
701        mov ecx, [curmatch]
702
703;;; uInt wmask = s->w_mask;
704;;; unsigned chain_length = s->max_chain_length;
705;;; if (s->prev_length >= s->good_match) {
706;;;     chain_length >>= 2;
707;;; }
708
709        mov eax, [edx + dsPrevLen]
710        mov ebx, [edx + dsGoodMatch]
711        cmp eax, ebx
712        mov eax, [edx + dsWMask]
713        mov ebx, [edx + dsMaxChainLen]
714        jl  LastMatchGood
715        shr ebx, 2
716LastMatchGood:
717
718;;; chainlen is decremented once beforehand so that the function can
719;;; use the sign flag instead of the zero flag for the exit test.
720;;; It is then shifted into the high word, to make room for the wmask
721;;; value, which it will always accompany.
722
723        dec ebx
724        shl ebx, 16
725        or  ebx, eax
726        mov [chainlenwmask], ebx
727
728;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
729
730        mov eax, [edx + dsNiceMatch]
731        mov ebx, [edx + dsLookahead]
732        cmp ebx, eax
733        jl  LookaheadLess
734        mov ebx, eax
735LookaheadLess:  mov [nicematch], ebx
736
737;;; register Bytef *scan = s->window + s->strstart;
738
739        mov esi, [edx + dsWindow]
740        mov [window], esi
741        mov ebp, [edx + dsStrStart]
742        lea edi, [esi + ebp]
743        mov [scan], edi
744
745;;; Determine how many bytes the scan ptr is off from being
746;;; dword-aligned.
747
748        mov eax, edi
749        neg eax
750        and eax, 3
751        mov [scanalign], eax
752
753;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
754;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
755
756        mov eax, [edx + dsWSize]
757        sub eax, MIN_LOOKAHEAD
758        sub ebp, eax
759        jg  LimitPositive
760        xor ebp, ebp
761LimitPositive:
762
763;;; int best_len = s->prev_length;
764
765        mov eax, [edx + dsPrevLen]
766        mov [bestlen], eax
767
768;;; Store the sum of s->window + best_len in esi locally, and in esi.
769
770        add esi, eax
771        mov [windowbestlen], esi
772
773;;; register ush scan_start = *(ushf*)scan;
774;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
775;;; Posf *prev = s->prev;
776
777        movzx   ebx, word ptr [edi]
778        mov [scanstart], ebx
779        movzx   ebx, word ptr [edi + eax - 1]
780        mov [scanend], ebx
781        mov edi, [edx + dsPrev]
782
783;;; Jump into the main loop.
784
785        mov edx, [chainlenwmask]
786        jmp short LoopEntry
787
788align 4
789
790;;; do {
791;;;     match = s->window + cur_match;
792;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
793;;;         *(ushf*)match != scan_start) continue;
794;;;     [...]
795;;; } while ((cur_match = prev[cur_match & wmask]) > limit
796;;;          && --chain_length != 0);
797;;;
798;;; Here is the inner loop of the function. The function will spend the
799;;; majority of its time in this loop, and majority of that time will
800;;; be spent in the first ten instructions.
801;;;
802;;; Within this loop:
803;;; ebx = scanend
804;;; ecx = curmatch
805;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
806;;; esi = windowbestlen - i.e., (window + bestlen)
807;;; edi = prev
808;;; ebp = limit
809
810LookupLoop:
811        and ecx, edx
812        movzx   ecx, word ptr [edi + ecx*2]
813        cmp ecx, ebp
814        jbe LeaveNow
815        sub edx, 00010000h
816        js  LeaveNow
817LoopEntry:  movzx   eax, word ptr [esi + ecx - 1]
818        cmp eax, ebx
819        jnz LookupLoop
820        mov eax, [window]
821        movzx   eax, word ptr [eax + ecx]
822        cmp eax, [scanstart]
823        jnz LookupLoop
824
825;;; Store the current value of chainlen.
826
827        mov [chainlenwmask], edx
828
829;;; Point edi to the string under scrutiny, and esi to the string we
830;;; are hoping to match it up with. In actuality, esi and edi are
831;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
832;;; initialized to -(MAX_MATCH_8 - scanalign).
833
834        mov esi, [window]
835        mov edi, [scan]
836        add esi, ecx
837        mov eax, [scanalign]
838        mov edx, 0fffffef8h; -(MAX_MATCH_8)
839        lea edi, [edi + eax + 0108h] ;MAX_MATCH_8]
840        lea esi, [esi + eax + 0108h] ;MAX_MATCH_8]
841
842;;; Test the strings for equality, 8 bytes at a time. At the end,
843;;; adjust edx so that it is offset to the exact byte that mismatched.
844;;;
845;;; We already know at this point that the first three bytes of the
846;;; strings match each other, and they can be safely passed over before
847;;; starting the compare loop. So what this code does is skip over 0-3
848;;; bytes, as much as necessary in order to dword-align the edi
849;;; pointer. (esi will still be misaligned three times out of four.)
850;;;
851;;; It should be confessed that this loop usually does not represent
852;;; much of the total running time. Replacing it with a more
853;;; straightforward "rep cmpsb" would not drastically degrade
854;;; performance.
855
856LoopCmps:
857        mov eax, [esi + edx]
858        xor eax, [edi + edx]
859        jnz LeaveLoopCmps
860        mov eax, [esi + edx + 4]
861        xor eax, [edi + edx + 4]
862        jnz LeaveLoopCmps4
863        add edx, 8
864        jnz LoopCmps
865        jmp short LenMaximum
866LeaveLoopCmps4: add edx, 4
867LeaveLoopCmps:  test    eax, 0000FFFFh
868        jnz LenLower
869        add edx,  2
870        shr eax, 16
871LenLower:   sub al, 1
872        adc edx, 0
873
874;;; Calculate the length of the match. If it is longer than MAX_MATCH,
875;;; then automatically accept it as the best possible match and leave.
876
877        lea eax, [edi + edx]
878        mov edi, [scan]
879        sub eax, edi
880        cmp eax, MAX_MATCH
881        jge LenMaximum
882
883;;; If the length of the match is not longer than the best match we
884;;; have so far, then forget it and return to the lookup loop.
885
886        mov edx, [deflatestate]
887        mov ebx, [bestlen]
888        cmp eax, ebx
889        jg  LongerMatch
890        mov esi, [windowbestlen]
891        mov edi, [edx + dsPrev]
892        mov ebx, [scanend]
893        mov edx, [chainlenwmask]
894        jmp LookupLoop
895
896;;;         s->match_start = cur_match;
897;;;         best_len = len;
898;;;         if (len >= nice_match) break;
899;;;         scan_end = *(ushf*)(scan+best_len-1);
900
901LongerMatch:    mov ebx, [nicematch]
902        mov [bestlen], eax
903        mov [edx + dsMatchStart], ecx
904        cmp eax, ebx
905        jge LeaveNow
906        mov esi, [window]
907        add esi, eax
908        mov [windowbestlen], esi
909        movzx   ebx, word ptr [edi + eax - 1]
910        mov edi, [edx + dsPrev]
911        mov [scanend], ebx
912        mov edx, [chainlenwmask]
913        jmp LookupLoop
914
915;;; Accept the current string, with the maximum possible length.
916
917LenMaximum: mov edx, [deflatestate]
918        mov dword ptr [bestlen], MAX_MATCH
919        mov [edx + dsMatchStart], ecx
920
921;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
922;;; return s->lookahead;
923
924LeaveNow:
925        mov edx, [deflatestate]
926        mov ebx, [bestlen]
927        mov eax, [edx + dsLookahead]
928        cmp ebx, eax
929        jg  LookaheadRet
930        mov eax, ebx
931LookaheadRet:
932
933;;; Restore the stack and return from whence we came.
934
935        add esp, LocalVarsSize
936        pop ebx
937        pop esi
938        pop edi
939        pop ebp
940
941        ret
942; please don't remove this string !
943; Your can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary!
944    db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah
945
946
947IFDEF NOOLDPENTIUMCODE
948    IFDEF NOUNDERLINE
949    longest_match       endp
950    ELSE
951    _longest_match      endp
952    ENDIF
953
954    IFDEF NOUNDERLINE
955    match_init      proc near
956                    ret
957    match_init      endp
958    ELSE
959    _match_init     proc near
960                    ret
961    _match_init     endp
962    ENDIF
963ELSE
964    IFDEF NOUNDERLINE
965    longest_match_686   endp
966    ELSE
967    _longest_match_686  endp
968    ENDIF
969ENDIF
970
971_TEXT   ends
972end
973