1/* match.s -- Pentium-Pro-optimized version of longest_match() 2 * Written for zlib 1.1.2 3 * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> 4 * 5 * This is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License. 7 */ 8 9#ifndef NO_UNDERLINE 10#define match_init _match_init 11#define longest_match _longest_match 12#endif 13 14#define MAX_MATCH (258) 15#define MIN_MATCH (3) 16#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) 17#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) 18 19/* stack frame offsets */ 20 21#define chainlenwmask 0 /* high word: current chain len */ 22 /* low word: s->wmask */ 23#define window 4 /* local copy of s->window */ 24#define windowbestlen 8 /* s->window + bestlen */ 25#define scanstart 16 /* first two bytes of string */ 26#define scanend 12 /* last two bytes of string */ 27#define scanalign 20 /* dword-misalignment of string */ 28#define nicematch 24 /* a good enough match size */ 29#define bestlen 28 /* size of best match so far */ 30#define scan 32 /* ptr to string wanting match */ 31 32#define LocalVarsSize (36) 33/* saved ebx 36 */ 34/* saved edi 40 */ 35/* saved esi 44 */ 36/* saved ebp 48 */ 37/* return address 52 */ 38#define deflatestate 56 /* the function arguments */ 39#define curmatch 60 40 41/* All the +zlib1222add offsets are due to the addition of fields 42 * in zlib in the deflate_state structure since the asm code was first written 43 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). 44 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). 45 * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). 46 */ 47 48#define zlib1222add (8) 49 50#define dsWSize (36+zlib1222add) 51#define dsWMask (44+zlib1222add) 52#define dsWindow (48+zlib1222add) 53#define dsPrev (56+zlib1222add) 54#define dsMatchLen (88+zlib1222add) 55#define dsPrevMatch (92+zlib1222add) 56#define dsStrStart (100+zlib1222add) 57#define dsMatchStart (104+zlib1222add) 58#define dsLookahead (108+zlib1222add) 59#define dsPrevLen (112+zlib1222add) 60#define dsMaxChainLen (116+zlib1222add) 61#define dsGoodMatch (132+zlib1222add) 62#define dsNiceMatch (136+zlib1222add) 63 64 65.file "match.S" 66 67.globl match_init, longest_match 68 69.text 70 71/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ 72 73longest_match: 74 75/* Save registers that the compiler may be using, and adjust %esp to */ 76/* make room for our stack frame. */ 77 78 pushl %ebp 79 pushl %edi 80 pushl %esi 81 pushl %ebx 82 subl $LocalVarsSize, %esp 83 84/* Retrieve the function arguments. %ecx will hold cur_match */ 85/* throughout the entire function. %edx will hold the pointer to the */ 86/* deflate_state structure during the function's setup (before */ 87/* entering the main loop). */ 88 89 movl deflatestate(%esp), %edx 90 movl curmatch(%esp), %ecx 91 92/* uInt wmask = s->w_mask; */ 93/* unsigned chain_length = s->max_chain_length; */ 94/* if (s->prev_length >= s->good_match) { */ 95/* chain_length >>= 2; */ 96/* } */ 97 98 movl dsPrevLen(%edx), %eax 99 movl dsGoodMatch(%edx), %ebx 100 cmpl %ebx, %eax 101 movl dsWMask(%edx), %eax 102 movl dsMaxChainLen(%edx), %ebx 103 jl LastMatchGood 104 shrl $2, %ebx 105LastMatchGood: 106 107/* chainlen is decremented once beforehand so that the function can */ 108/* use the sign flag instead of the zero flag for the exit test. */ 109/* It is then shifted into the high word, to make room for the wmask */ 110/* value, which it will always accompany. */ 111 112 decl %ebx 113 shll $16, %ebx 114 orl %eax, %ebx 115 movl %ebx, chainlenwmask(%esp) 116 117/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ 118 119 movl dsNiceMatch(%edx), %eax 120 movl dsLookahead(%edx), %ebx 121 cmpl %eax, %ebx 122 jl LookaheadLess 123 movl %eax, %ebx 124LookaheadLess: movl %ebx, nicematch(%esp) 125 126/* register Bytef *scan = s->window + s->strstart; */ 127 128 movl dsWindow(%edx), %esi 129 movl %esi, window(%esp) 130 movl dsStrStart(%edx), %ebp 131 lea (%esi,%ebp), %edi 132 movl %edi, scan(%esp) 133 134/* Determine how many bytes the scan ptr is off from being */ 135/* dword-aligned. */ 136 137 movl %edi, %eax 138 negl %eax 139 andl $3, %eax 140 movl %eax, scanalign(%esp) 141 142/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ 143/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ 144 145 movl dsWSize(%edx), %eax 146 subl $MIN_LOOKAHEAD, %eax 147 subl %eax, %ebp 148 jg LimitPositive 149 xorl %ebp, %ebp 150LimitPositive: 151 152/* int best_len = s->prev_length; */ 153 154 movl dsPrevLen(%edx), %eax 155 movl %eax, bestlen(%esp) 156 157/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ 158 159 addl %eax, %esi 160 movl %esi, windowbestlen(%esp) 161 162/* register ush scan_start = *(ushf*)scan; */ 163/* register ush scan_end = *(ushf*)(scan+best_len-1); */ 164/* Posf *prev = s->prev; */ 165 166 movzwl (%edi), %ebx 167 movl %ebx, scanstart(%esp) 168 movzwl -1(%edi,%eax), %ebx 169 movl %ebx, scanend(%esp) 170 movl dsPrev(%edx), %edi 171 172/* Jump into the main loop. */ 173 174 movl chainlenwmask(%esp), %edx 175 jmp LoopEntry 176 177.balign 16 178 179/* do { 180 * match = s->window + cur_match; 181 * if (*(ushf*)(match+best_len-1) != scan_end || 182 * *(ushf*)match != scan_start) continue; 183 * [...] 184 * } while ((cur_match = prev[cur_match & wmask]) > limit 185 * && --chain_length != 0); 186 * 187 * Here is the inner loop of the function. The function will spend the 188 * majority of its time in this loop, and majority of that time will 189 * be spent in the first ten instructions. 190 * 191 * Within this loop: 192 * %ebx = scanend 193 * %ecx = curmatch 194 * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 195 * %esi = windowbestlen - i.e., (window + bestlen) 196 * %edi = prev 197 * %ebp = limit 198 */ 199LookupLoop: 200 andl %edx, %ecx 201 movzwl (%edi,%ecx,2), %ecx 202 cmpl %ebp, %ecx 203 jbe LeaveNow 204 subl $0x00010000, %edx 205 js LeaveNow 206LoopEntry: movzwl -1(%esi,%ecx), %eax 207 cmpl %ebx, %eax 208 jnz LookupLoop 209 movl window(%esp), %eax 210 movzwl (%eax,%ecx), %eax 211 cmpl scanstart(%esp), %eax 212 jnz LookupLoop 213 214/* Store the current value of chainlen. */ 215 216 movl %edx, chainlenwmask(%esp) 217 218/* Point %edi to the string under scrutiny, and %esi to the string we */ 219/* are hoping to match it up with. In actuality, %esi and %edi are */ 220/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ 221/* initialized to -(MAX_MATCH_8 - scanalign). */ 222 223 movl window(%esp), %esi 224 movl scan(%esp), %edi 225 addl %ecx, %esi 226 movl scanalign(%esp), %eax 227 movl $(-MAX_MATCH_8), %edx 228 lea MAX_MATCH_8(%edi,%eax), %edi 229 lea MAX_MATCH_8(%esi,%eax), %esi 230 231/* Test the strings for equality, 8 bytes at a time. At the end, 232 * adjust %edx so that it is offset to the exact byte that mismatched. 233 * 234 * We already know at this point that the first three bytes of the 235 * strings match each other, and they can be safely passed over before 236 * starting the compare loop. So what this code does is skip over 0-3 237 * bytes, as much as necessary in order to dword-align the %edi 238 * pointer. (%esi will still be misaligned three times out of four.) 239 * 240 * It should be confessed that this loop usually does not represent 241 * much of the total running time. Replacing it with a more 242 * straightforward "rep cmpsb" would not drastically degrade 243 * performance. 244 */ 245LoopCmps: 246 movl (%esi,%edx), %eax 247 xorl (%edi,%edx), %eax 248 jnz LeaveLoopCmps 249 movl 4(%esi,%edx), %eax 250 xorl 4(%edi,%edx), %eax 251 jnz LeaveLoopCmps4 252 addl $8, %edx 253 jnz LoopCmps 254 jmp LenMaximum 255LeaveLoopCmps4: addl $4, %edx 256LeaveLoopCmps: testl $0x0000FFFF, %eax 257 jnz LenLower 258 addl $2, %edx 259 shrl $16, %eax 260LenLower: subb $1, %al 261 adcl $0, %edx 262 263/* Calculate the length of the match. If it is longer than MAX_MATCH, */ 264/* then automatically accept it as the best possible match and leave. */ 265 266 lea (%edi,%edx), %eax 267 movl scan(%esp), %edi 268 subl %edi, %eax 269 cmpl $MAX_MATCH, %eax 270 jge LenMaximum 271 272/* If the length of the match is not longer than the best match we */ 273/* have so far, then forget it and return to the lookup loop. */ 274 275 movl deflatestate(%esp), %edx 276 movl bestlen(%esp), %ebx 277 cmpl %ebx, %eax 278 jg LongerMatch 279 movl windowbestlen(%esp), %esi 280 movl dsPrev(%edx), %edi 281 movl scanend(%esp), %ebx 282 movl chainlenwmask(%esp), %edx 283 jmp LookupLoop 284 285/* s->match_start = cur_match; */ 286/* best_len = len; */ 287/* if (len >= nice_match) break; */ 288/* scan_end = *(ushf*)(scan+best_len-1); */ 289 290LongerMatch: movl nicematch(%esp), %ebx 291 movl %eax, bestlen(%esp) 292 movl %ecx, dsMatchStart(%edx) 293 cmpl %ebx, %eax 294 jge LeaveNow 295 movl window(%esp), %esi 296 addl %eax, %esi 297 movl %esi, windowbestlen(%esp) 298 movzwl -1(%edi,%eax), %ebx 299 movl dsPrev(%edx), %edi 300 movl %ebx, scanend(%esp) 301 movl chainlenwmask(%esp), %edx 302 jmp LookupLoop 303 304/* Accept the current string, with the maximum possible length. */ 305 306LenMaximum: movl deflatestate(%esp), %edx 307 movl $MAX_MATCH, bestlen(%esp) 308 movl %ecx, dsMatchStart(%edx) 309 310/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ 311/* return s->lookahead; */ 312 313LeaveNow: 314 movl deflatestate(%esp), %edx 315 movl bestlen(%esp), %ebx 316 movl dsLookahead(%edx), %eax 317 cmpl %eax, %ebx 318 jg LookaheadRet 319 movl %ebx, %eax 320LookaheadRet: 321 322/* Restore the stack and return from whence we came. */ 323 324 addl $LocalVarsSize, %esp 325 popl %ebx 326 popl %esi 327 popl %edi 328 popl %ebp 329match_init: ret 330