1 
/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * Last changed in libpng 1.2.6 - August 15, 2004
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 *
 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
 * in bad pixels at the beginning of some rows of some images, and also
 * (due to out-of-range memory reads and writes) caused heap corruption
 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
 *
 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
 *
 * [runtime MMX configuration, GRR 20010102]
 *
 */
26 
27 #define PNG_INTERNAL
28 #include "png.h"
29 
30 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
31 
/* Cached result of png_mmx_support():
 *   0 = MMX not available, 1 = MMX available,
 *   2 = not probed yet (png_combine_row() calls png_mmx_support() when it
 *       still finds this value).
 */
static int mmx_supported = 2;
33 
34 
35 int PNGAPI
png_mmx_support(void)36 png_mmx_support(void)
37 {
38   int mmx_supported_local = 0;
39   _asm {
40     push ebx          //CPUID will trash these
41     push ecx
42     push edx
43 
44     pushfd            //Save Eflag to stack
45     pop eax           //Get Eflag from stack into eax
46     mov ecx, eax      //Make another copy of Eflag in ecx
47     xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
48     push eax          //Save modified Eflag back to stack
49 
50     popfd             //Restored modified value back to Eflag reg
51     pushfd            //Save Eflag to stack
52     pop eax           //Get Eflag from stack
53     push ecx          // save original Eflag to stack
54     popfd             // restore original Eflag
55     xor eax, ecx      //Compare the new Eflag with the original Eflag
56     jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
57                       //skip following instructions and jump to
58                       //NOT_SUPPORTED label
59 
60     xor eax, eax      //Set eax to zero
61 
62     _asm _emit 0x0f   //CPUID instruction  (two bytes opcode)
63     _asm _emit 0xa2
64 
65     cmp eax, 1        //make sure eax return non-zero value
66     jl NOT_SUPPORTED  //If eax is zero, mmx not supported
67 
68     xor eax, eax      //set eax to zero
69     inc eax           //Now increment eax to 1.  This instruction is
70                       //faster than the instruction "mov eax, 1"
71 
72     _asm _emit 0x0f   //CPUID instruction
73     _asm _emit 0xa2
74 
75     and edx, 0x00800000  //mask out all bits but mmx bit(24)
76     cmp edx, 0        // 0 = mmx not supported
77     jz  NOT_SUPPORTED // non-zero = Yes, mmx IS supported
78 
79     mov  mmx_supported_local, 1  //set return value to 1
80 
81 NOT_SUPPORTED:
82     mov  eax, mmx_supported_local  //move return value to eax
83     pop edx          //CPUID trashed these
84     pop ecx
85     pop ebx
86   }
87 
88   //mmx_supported_local=0; // test code for force don't support MMX
89   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
90 
91   mmx_supported = mmx_supported_local;
92   return mmx_supported_local;
93 }
94 
/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask.  */

/* Use this routine for x86 platform - uses faster MMX routine if machine
   supports MMX */
108 
109 void /* PRIVATE */
png_combine_row(png_structp png_ptr,png_bytep row,int mask)110 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
111 {
112 #ifdef PNG_USE_LOCAL_ARRAYS
113    const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
114 #endif
115 
116    png_debug(1,"in png_combine_row_asm\n");
117 
118    if (mmx_supported == 2) {
119 #if !defined(PNG_1_0_X)
120        /* this should have happened in png_init_mmx_flags() already */
121        png_warning(png_ptr, "asm_flags may not have been initialized");
122 #endif
123        png_mmx_support();
124    }
125 
126    if (mask == 0xff)
127    {
128       png_memcpy(row, png_ptr->row_buf + 1,
129        (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
130        png_ptr->width));
131    }
132    /* GRR:  add "else if (mask == 0)" case?
133     *       or does png_combine_row() not even get called in that case? */
134    else
135    {
136       switch (png_ptr->row_info.pixel_depth)
137       {
138          case 1:
139          {
140             png_bytep sp;
141             png_bytep dp;
142             int s_inc, s_start, s_end;
143             int m;
144             int shift;
145             png_uint_32 i;
146 
147             sp = png_ptr->row_buf + 1;
148             dp = row;
149             m = 0x80;
150 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
151             if (png_ptr->transformations & PNG_PACKSWAP)
152             {
153                 s_start = 0;
154                 s_end = 7;
155                 s_inc = 1;
156             }
157             else
158 #endif
159             {
160                 s_start = 7;
161                 s_end = 0;
162                 s_inc = -1;
163             }
164 
165             shift = s_start;
166 
167             for (i = 0; i < png_ptr->width; i++)
168             {
169                if (m & mask)
170                {
171                   int value;
172 
173                   value = (*sp >> shift) & 0x1;
174                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
175                   *dp |= (png_byte)(value << shift);
176                }
177 
178                if (shift == s_end)
179                {
180                   shift = s_start;
181                   sp++;
182                   dp++;
183                }
184                else
185                   shift += s_inc;
186 
187                if (m == 1)
188                   m = 0x80;
189                else
190                   m >>= 1;
191             }
192             break;
193          }
194 
195          case 2:
196          {
197             png_bytep sp;
198             png_bytep dp;
199             int s_start, s_end, s_inc;
200             int m;
201             int shift;
202             png_uint_32 i;
203             int value;
204 
205             sp = png_ptr->row_buf + 1;
206             dp = row;
207             m = 0x80;
208 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
209             if (png_ptr->transformations & PNG_PACKSWAP)
210             {
211                s_start = 0;
212                s_end = 6;
213                s_inc = 2;
214             }
215             else
216 #endif
217             {
218                s_start = 6;
219                s_end = 0;
220                s_inc = -2;
221             }
222 
223             shift = s_start;
224 
225             for (i = 0; i < png_ptr->width; i++)
226             {
227                if (m & mask)
228                {
229                   value = (*sp >> shift) & 0x3;
230                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
231                   *dp |= (png_byte)(value << shift);
232                }
233 
234                if (shift == s_end)
235                {
236                   shift = s_start;
237                   sp++;
238                   dp++;
239                }
240                else
241                   shift += s_inc;
242                if (m == 1)
243                   m = 0x80;
244                else
245                   m >>= 1;
246             }
247             break;
248          }
249 
250          case 4:
251          {
252             png_bytep sp;
253             png_bytep dp;
254             int s_start, s_end, s_inc;
255             int m;
256             int shift;
257             png_uint_32 i;
258             int value;
259 
260             sp = png_ptr->row_buf + 1;
261             dp = row;
262             m = 0x80;
263 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
264             if (png_ptr->transformations & PNG_PACKSWAP)
265             {
266                s_start = 0;
267                s_end = 4;
268                s_inc = 4;
269             }
270             else
271 #endif
272             {
273                s_start = 4;
274                s_end = 0;
275                s_inc = -4;
276             }
277             shift = s_start;
278 
279             for (i = 0; i < png_ptr->width; i++)
280             {
281                if (m & mask)
282                {
283                   value = (*sp >> shift) & 0xf;
284                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
285                   *dp |= (png_byte)(value << shift);
286                }
287 
288                if (shift == s_end)
289                {
290                   shift = s_start;
291                   sp++;
292                   dp++;
293                }
294                else
295                   shift += s_inc;
296                if (m == 1)
297                   m = 0x80;
298                else
299                   m >>= 1;
300             }
301             break;
302          }
303 
304          case 8:
305          {
306             png_bytep srcptr;
307             png_bytep dstptr;
308             png_uint_32 len;
309             int m;
310             int diff, unmask;
311 
312             __int64 mask0=0x0102040810204080;
313 
314 #if !defined(PNG_1_0_X)
315             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
316                 /* && mmx_supported */ )
317 #else
318             if (mmx_supported)
319 #endif
320             {
321                srcptr = png_ptr->row_buf + 1;
322                dstptr = row;
323                m = 0x80;
324                unmask = ~mask;
325                len  = png_ptr->width &~7;  //reduce to multiple of 8
326                diff = png_ptr->width & 7;  //amount lost
327 
328                _asm
329                {
330                   movd       mm7, unmask   //load bit pattern
331                   psubb      mm6,mm6       //zero mm6
332                   punpcklbw  mm7,mm7
333                   punpcklwd  mm7,mm7
334                   punpckldq  mm7,mm7       //fill register with 8 masks
335 
336                   movq       mm0,mask0
337 
338                   pand       mm0,mm7       //nonzero if keep byte
339                   pcmpeqb    mm0,mm6       //zeros->1s, v versa
340 
341                   mov        ecx,len       //load length of line (pixels)
342                   mov        esi,srcptr    //load source
343                   mov        ebx,dstptr    //load dest
344                   cmp        ecx,0         //lcr
345                   je         mainloop8end
346 
347 mainloop8:
348                   movq       mm4,[esi]
349                   pand       mm4,mm0
350                   movq       mm6,mm0
351                   pandn      mm6,[ebx]
352                   por        mm4,mm6
353                   movq       [ebx],mm4
354 
355                   add        esi,8         //inc by 8 bytes processed
356                   add        ebx,8
357                   sub        ecx,8         //dec by 8 pixels processed
358 
359                   ja         mainloop8
360 mainloop8end:
361 
362                   mov        ecx,diff
363                   cmp        ecx,0
364                   jz         end8
365 
366                   mov        edx,mask
367                   sal        edx,24        //make low byte the high byte
368 
369 secondloop8:
370                   sal        edx,1         //move high bit to CF
371                   jnc        skip8         //if CF = 0
372                   mov        al,[esi]
373                   mov        [ebx],al
374 skip8:
375                   inc        esi
376                   inc        ebx
377 
378                   dec        ecx
379                   jnz        secondloop8
380 end8:
381                   emms
382                }
383             }
384             else /* mmx not supported - use modified C routine */
385             {
386                register unsigned int incr1, initial_val, final_val;
387                png_size_t pixel_bytes;
388                png_uint_32 i;
389                register int disp = png_pass_inc[png_ptr->pass];
390                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
391 
392                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
393                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
394                   pixel_bytes;
395                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
396                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
397                final_val = png_ptr->width*pixel_bytes;
398                incr1 = (disp)*pixel_bytes;
399                for (i = initial_val; i < final_val; i += incr1)
400                {
401                   png_memcpy(dstptr, srcptr, pixel_bytes);
402                   srcptr += incr1;
403                   dstptr += incr1;
404                }
405             } /* end of else */
406 
407             break;
408          }       // end 8 bpp
409 
410          case 16:
411          {
412             png_bytep srcptr;
413             png_bytep dstptr;
414             png_uint_32 len;
415             int unmask, diff;
416             __int64 mask1=0x0101020204040808,
417                     mask0=0x1010202040408080;
418 
419 #if !defined(PNG_1_0_X)
420             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
421                 /* && mmx_supported */ )
422 #else
423             if (mmx_supported)
424 #endif
425             {
426                srcptr = png_ptr->row_buf + 1;
427                dstptr = row;
428 
429                unmask = ~mask;
430                len     = (png_ptr->width)&~7;
431                diff = (png_ptr->width)&7;
432                _asm
433                {
434                   movd       mm7, unmask       //load bit pattern
435                   psubb      mm6,mm6           //zero mm6
436                   punpcklbw  mm7,mm7
437                   punpcklwd  mm7,mm7
438                   punpckldq  mm7,mm7           //fill register with 8 masks
439 
440                   movq       mm0,mask0
441                   movq       mm1,mask1
442 
443                   pand       mm0,mm7
444                   pand       mm1,mm7
445 
446                   pcmpeqb    mm0,mm6
447                   pcmpeqb    mm1,mm6
448 
449                   mov        ecx,len           //load length of line
450                   mov        esi,srcptr        //load source
451                   mov        ebx,dstptr        //load dest
452                   cmp        ecx,0             //lcr
453                   jz         mainloop16end
454 
455 mainloop16:
456                   movq       mm4,[esi]
457                   pand       mm4,mm0
458                   movq       mm6,mm0
459                   movq       mm7,[ebx]
460                   pandn      mm6,mm7
461                   por        mm4,mm6
462                   movq       [ebx],mm4
463 
464                   movq       mm5,[esi+8]
465                   pand       mm5,mm1
466                   movq       mm7,mm1
467                   movq       mm6,[ebx+8]
468                   pandn      mm7,mm6
469                   por        mm5,mm7
470                   movq       [ebx+8],mm5
471 
472                   add        esi,16            //inc by 16 bytes processed
473                   add        ebx,16
474                   sub        ecx,8             //dec by 8 pixels processed
475 
476                   ja         mainloop16
477 
478 mainloop16end:
479                   mov        ecx,diff
480                   cmp        ecx,0
481                   jz         end16
482 
483                   mov        edx,mask
484                   sal        edx,24            //make low byte the high byte
485 secondloop16:
486                   sal        edx,1             //move high bit to CF
487                   jnc        skip16            //if CF = 0
488                   mov        ax,[esi]
489                   mov        [ebx],ax
490 skip16:
491                   add        esi,2
492                   add        ebx,2
493 
494                   dec        ecx
495                   jnz        secondloop16
496 end16:
497                   emms
498                }
499             }
500             else /* mmx not supported - use modified C routine */
501             {
502                register unsigned int incr1, initial_val, final_val;
503                png_size_t pixel_bytes;
504                png_uint_32 i;
505                register int disp = png_pass_inc[png_ptr->pass];
506                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
507 
508                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
509                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
510                   pixel_bytes;
511                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
512                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
513                final_val = png_ptr->width*pixel_bytes;
514                incr1 = (disp)*pixel_bytes;
515                for (i = initial_val; i < final_val; i += incr1)
516                {
517                   png_memcpy(dstptr, srcptr, pixel_bytes);
518                   srcptr += incr1;
519                   dstptr += incr1;
520                }
521             } /* end of else */
522 
523             break;
524          }       // end 16 bpp
525 
526          case 24:
527          {
528             png_bytep srcptr;
529             png_bytep dstptr;
530             png_uint_32 len;
531             int unmask, diff;
532 
533             __int64 mask2=0x0101010202020404,  //24bpp
534                     mask1=0x0408080810101020,
535                     mask0=0x2020404040808080;
536 
537             srcptr = png_ptr->row_buf + 1;
538             dstptr = row;
539 
540             unmask = ~mask;
541             len     = (png_ptr->width)&~7;
542             diff = (png_ptr->width)&7;
543 
544 #if !defined(PNG_1_0_X)
545             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
546                 /* && mmx_supported */ )
547 #else
548             if (mmx_supported)
549 #endif
550             {
551                _asm
552                {
553                   movd       mm7, unmask       //load bit pattern
554                   psubb      mm6,mm6           //zero mm6
555                   punpcklbw  mm7,mm7
556                   punpcklwd  mm7,mm7
557                   punpckldq  mm7,mm7           //fill register with 8 masks
558 
559                   movq       mm0,mask0
560                   movq       mm1,mask1
561                   movq       mm2,mask2
562 
563                   pand       mm0,mm7
564                   pand       mm1,mm7
565                   pand       mm2,mm7
566 
567                   pcmpeqb    mm0,mm6
568                   pcmpeqb    mm1,mm6
569                   pcmpeqb    mm2,mm6
570 
571                   mov        ecx,len           //load length of line
572                   mov        esi,srcptr        //load source
573                   mov        ebx,dstptr        //load dest
574                   cmp        ecx,0
575                   jz         mainloop24end
576 
577 mainloop24:
578                   movq       mm4,[esi]
579                   pand       mm4,mm0
580                   movq       mm6,mm0
581                   movq       mm7,[ebx]
582                   pandn      mm6,mm7
583                   por        mm4,mm6
584                   movq       [ebx],mm4
585 
586 
587                   movq       mm5,[esi+8]
588                   pand       mm5,mm1
589                   movq       mm7,mm1
590                   movq       mm6,[ebx+8]
591                   pandn      mm7,mm6
592                   por        mm5,mm7
593                   movq       [ebx+8],mm5
594 
595                   movq       mm6,[esi+16]
596                   pand       mm6,mm2
597                   movq       mm4,mm2
598                   movq       mm7,[ebx+16]
599                   pandn      mm4,mm7
600                   por        mm6,mm4
601                   movq       [ebx+16],mm6
602 
603                   add        esi,24            //inc by 24 bytes processed
604                   add        ebx,24
605                   sub        ecx,8             //dec by 8 pixels processed
606 
607                   ja         mainloop24
608 
609 mainloop24end:
610                   mov        ecx,diff
611                   cmp        ecx,0
612                   jz         end24
613 
614                   mov        edx,mask
615                   sal        edx,24            //make low byte the high byte
616 secondloop24:
617                   sal        edx,1             //move high bit to CF
618                   jnc        skip24            //if CF = 0
619                   mov        ax,[esi]
620                   mov        [ebx],ax
621                   xor        eax,eax
622                   mov        al,[esi+2]
623                   mov        [ebx+2],al
624 skip24:
625                   add        esi,3
626                   add        ebx,3
627 
628                   dec        ecx
629                   jnz        secondloop24
630 
631 end24:
632                   emms
633                }
634             }
635             else /* mmx not supported - use modified C routine */
636             {
637                register unsigned int incr1, initial_val, final_val;
638                png_size_t pixel_bytes;
639                png_uint_32 i;
640                register int disp = png_pass_inc[png_ptr->pass];
641                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
642 
643                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
644                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
645                   pixel_bytes;
646                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
647                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
648                final_val = png_ptr->width*pixel_bytes;
649                incr1 = (disp)*pixel_bytes;
650                for (i = initial_val; i < final_val; i += incr1)
651                {
652                   png_memcpy(dstptr, srcptr, pixel_bytes);
653                   srcptr += incr1;
654                   dstptr += incr1;
655                }
656             } /* end of else */
657 
658             break;
659          }       // end 24 bpp
660 
661          case 32:
662          {
663             png_bytep srcptr;
664             png_bytep dstptr;
665             png_uint_32 len;
666             int unmask, diff;
667 
668             __int64 mask3=0x0101010102020202,  //32bpp
669                     mask2=0x0404040408080808,
670                     mask1=0x1010101020202020,
671                     mask0=0x4040404080808080;
672 
673             srcptr = png_ptr->row_buf + 1;
674             dstptr = row;
675 
676             unmask = ~mask;
677             len     = (png_ptr->width)&~7;
678             diff = (png_ptr->width)&7;
679 
680 #if !defined(PNG_1_0_X)
681             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
682                 /* && mmx_supported */ )
683 #else
684             if (mmx_supported)
685 #endif
686             {
687                _asm
688                {
689                   movd       mm7, unmask       //load bit pattern
690                   psubb      mm6,mm6           //zero mm6
691                   punpcklbw  mm7,mm7
692                   punpcklwd  mm7,mm7
693                   punpckldq  mm7,mm7           //fill register with 8 masks
694 
695                   movq       mm0,mask0
696                   movq       mm1,mask1
697                   movq       mm2,mask2
698                   movq       mm3,mask3
699 
700                   pand       mm0,mm7
701                   pand       mm1,mm7
702                   pand       mm2,mm7
703                   pand       mm3,mm7
704 
705                   pcmpeqb    mm0,mm6
706                   pcmpeqb    mm1,mm6
707                   pcmpeqb    mm2,mm6
708                   pcmpeqb    mm3,mm6
709 
710                   mov        ecx,len           //load length of line
711                   mov        esi,srcptr        //load source
712                   mov        ebx,dstptr        //load dest
713 
714                   cmp        ecx,0             //lcr
715                   jz         mainloop32end
716 
717 mainloop32:
718                   movq       mm4,[esi]
719                   pand       mm4,mm0
720                   movq       mm6,mm0
721                   movq       mm7,[ebx]
722                   pandn      mm6,mm7
723                   por        mm4,mm6
724                   movq       [ebx],mm4
725 
726                   movq       mm5,[esi+8]
727                   pand       mm5,mm1
728                   movq       mm7,mm1
729                   movq       mm6,[ebx+8]
730                   pandn      mm7,mm6
731                   por        mm5,mm7
732                   movq       [ebx+8],mm5
733 
734                   movq       mm6,[esi+16]
735                   pand       mm6,mm2
736                   movq       mm4,mm2
737                   movq       mm7,[ebx+16]
738                   pandn      mm4,mm7
739                   por        mm6,mm4
740                   movq       [ebx+16],mm6
741 
742                   movq       mm7,[esi+24]
743                   pand       mm7,mm3
744                   movq       mm5,mm3
745                   movq       mm4,[ebx+24]
746                   pandn      mm5,mm4
747                   por        mm7,mm5
748                   movq       [ebx+24],mm7
749 
750                   add        esi,32            //inc by 32 bytes processed
751                   add        ebx,32
752                   sub        ecx,8             //dec by 8 pixels processed
753 
754                   ja         mainloop32
755 
756 mainloop32end:
757                   mov        ecx,diff
758                   cmp        ecx,0
759                   jz         end32
760 
761                   mov        edx,mask
762                   sal        edx,24            //make low byte the high byte
763 secondloop32:
764                   sal        edx,1             //move high bit to CF
765                   jnc        skip32            //if CF = 0
766                   mov        eax,[esi]
767                   mov        [ebx],eax
768 skip32:
769                   add        esi,4
770                   add        ebx,4
771 
772                   dec        ecx
773                   jnz        secondloop32
774 
775 end32:
776                   emms
777                }
778             }
779             else /* mmx _not supported - Use modified C routine */
780             {
781                register unsigned int incr1, initial_val, final_val;
782                png_size_t pixel_bytes;
783                png_uint_32 i;
784                register int disp = png_pass_inc[png_ptr->pass];
785                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
786 
787                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
788                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
789                   pixel_bytes;
790                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
791                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
792                final_val = png_ptr->width*pixel_bytes;
793                incr1 = (disp)*pixel_bytes;
794                for (i = initial_val; i < final_val; i += incr1)
795                {
796                   png_memcpy(dstptr, srcptr, pixel_bytes);
797                   srcptr += incr1;
798                   dstptr += incr1;
799                }
800             } /* end of else */
801 
802             break;
803          }       // end 32 bpp
804 
805          case 48:
806          {
807             png_bytep srcptr;
808             png_bytep dstptr;
809             png_uint_32 len;
810             int unmask, diff;
811 
812             __int64 mask5=0x0101010101010202,
813                     mask4=0x0202020204040404,
814                     mask3=0x0404080808080808,
815                     mask2=0x1010101010102020,
816                     mask1=0x2020202040404040,
817                     mask0=0x4040808080808080;
818 
819 #if !defined(PNG_1_0_X)
820             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
821                 /* && mmx_supported */ )
822 #else
823             if (mmx_supported)
824 #endif
825             {
826                srcptr = png_ptr->row_buf + 1;
827                dstptr = row;
828 
829                unmask = ~mask;
830                len     = (png_ptr->width)&~7;
831                diff = (png_ptr->width)&7;
832                _asm
833                {
834                   movd       mm7, unmask       //load bit pattern
835                   psubb      mm6,mm6           //zero mm6
836                   punpcklbw  mm7,mm7
837                   punpcklwd  mm7,mm7
838                   punpckldq  mm7,mm7           //fill register with 8 masks
839 
840                   movq       mm0,mask0
841                   movq       mm1,mask1
842                   movq       mm2,mask2
843                   movq       mm3,mask3
844                   movq       mm4,mask4
845                   movq       mm5,mask5
846 
847                   pand       mm0,mm7
848                   pand       mm1,mm7
849                   pand       mm2,mm7
850                   pand       mm3,mm7
851                   pand       mm4,mm7
852                   pand       mm5,mm7
853 
854                   pcmpeqb    mm0,mm6
855                   pcmpeqb    mm1,mm6
856                   pcmpeqb    mm2,mm6
857                   pcmpeqb    mm3,mm6
858                   pcmpeqb    mm4,mm6
859                   pcmpeqb    mm5,mm6
860 
861                   mov        ecx,len           //load length of line
862                   mov        esi,srcptr        //load source
863                   mov        ebx,dstptr        //load dest
864 
865                   cmp        ecx,0
866                   jz         mainloop48end
867 
868 mainloop48:
869                   movq       mm7,[esi]
870                   pand       mm7,mm0
871                   movq       mm6,mm0
872                   pandn      mm6,[ebx]
873                   por        mm7,mm6
874                   movq       [ebx],mm7
875 
876                   movq       mm6,[esi+8]
877                   pand       mm6,mm1
878                   movq       mm7,mm1
879                   pandn      mm7,[ebx+8]
880                   por        mm6,mm7
881                   movq       [ebx+8],mm6
882 
883                   movq       mm6,[esi+16]
884                   pand       mm6,mm2
885                   movq       mm7,mm2
886                   pandn      mm7,[ebx+16]
887                   por        mm6,mm7
888                   movq       [ebx+16],mm6
889 
890                   movq       mm7,[esi+24]
891                   pand       mm7,mm3
892                   movq       mm6,mm3
893                   pandn      mm6,[ebx+24]
894                   por        mm7,mm6
895                   movq       [ebx+24],mm7
896 
897                   movq       mm6,[esi+32]
898                   pand       mm6,mm4
899                   movq       mm7,mm4
900                   pandn      mm7,[ebx+32]
901                   por        mm6,mm7
902                   movq       [ebx+32],mm6
903 
904                   movq       mm7,[esi+40]
905                   pand       mm7,mm5
906                   movq       mm6,mm5
907                   pandn      mm6,[ebx+40]
908                   por        mm7,mm6
909                   movq       [ebx+40],mm7
910 
911                   add        esi,48            //inc by 48 bytes processed
912                   add        ebx,48
913                   sub        ecx,8             //dec by 8 pixels processed
914 
915                   ja         mainloop48
916 mainloop48end:
917 
918                   mov        ecx,diff
919                   cmp        ecx,0
920                   jz         end48
921 
922                   mov        edx,mask
923                   sal        edx,24            //make low byte the high byte
924 
925 secondloop48:
926                   sal        edx,1             //move high bit to CF
927                   jnc        skip48            //if CF = 0
928                   mov        eax,[esi]
929                   mov        [ebx],eax
930 skip48:
931                   add        esi,4
932                   add        ebx,4
933 
934                   dec        ecx
935                   jnz        secondloop48
936 
937 end48:
938                   emms
939                }
940             }
941             else /* mmx _not supported - Use modified C routine */
942             {
943                register unsigned int incr1, initial_val, final_val;
944                png_size_t pixel_bytes;
945                png_uint_32 i;
946                register int disp = png_pass_inc[png_ptr->pass];
947                int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
948 
949                pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
950                srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
951                   pixel_bytes;
952                dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
953                initial_val = offset_table[png_ptr->pass]*pixel_bytes;
954                final_val = png_ptr->width*pixel_bytes;
955                incr1 = (disp)*pixel_bytes;
956                for (i = initial_val; i < final_val; i += incr1)
957                {
958                   png_memcpy(dstptr, srcptr, pixel_bytes);
959                   srcptr += incr1;
960                   dstptr += incr1;
961                }
962             } /* end of else */
963 
964             break;
965          }       // end 48 bpp
966 
967          default:
968          {
969             png_bytep sptr;
970             png_bytep dp;
971             png_size_t pixel_bytes;
972             int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
973             unsigned int i;
974             register int disp = png_pass_inc[png_ptr->pass];  // pixel increment (stride) for this pass
975             register unsigned int incr1, initial_val, final_val;
976 
977             pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
978             sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
979                pixel_bytes;
980             dp = row + offset_table[png_ptr->pass]*pixel_bytes;
981             initial_val = offset_table[png_ptr->pass]*pixel_bytes;
982             final_val = png_ptr->width*pixel_bytes;
983             incr1 = (disp)*pixel_bytes;
984             for (i = initial_val; i < final_val; i += incr1)
985             {
986                png_memcpy(dp, sptr, pixel_bytes);
987                sptr += incr1;
988                dp += incr1;
989             }
990             break;
991          }
992       } /* end switch (png_ptr->row_info.pixel_depth) */
993    } /* end if (non-trivial mask) */
994 
995 } /* end png_combine_row() */
996 
997 
998 #if defined(PNG_READ_INTERLACING_SUPPORTED)
999 
1000 void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)1001 png_do_read_interlace(png_structp png_ptr)
1002 {
1003    png_row_infop row_info = &(png_ptr->row_info);
1004    png_bytep row = png_ptr->row_buf + 1;
1005    int pass = png_ptr->pass;
1006    png_uint_32 transformations = png_ptr->transformations;
1007 #ifdef PNG_USE_LOCAL_ARRAYS
1008    const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
1009 #endif
1010 
1011    png_debug(1,"in png_do_read_interlace\n");
1012 
1013    if (mmx_supported == 2) {
1014 #if !defined(PNG_1_0_X)
1015        /* this should have happened in png_init_mmx_flags() already */
1016        png_warning(png_ptr, "asm_flags may not have been initialized");
1017 #endif
1018        png_mmx_support();
1019    }
1020 
1021    if (row != NULL && row_info != NULL)
1022    {
1023       png_uint_32 final_width;
1024 
1025       final_width = row_info->width * png_pass_inc[pass];
1026 
1027       switch (row_info->pixel_depth)
1028       {
1029          case 1:
1030          {
1031             png_bytep sp, dp;
1032             int sshift, dshift;
1033             int s_start, s_end, s_inc;
1034             png_byte v;
1035             png_uint_32 i;
1036             int j;
1037 
1038             sp = row + (png_size_t)((row_info->width - 1) >> 3);
1039             dp = row + (png_size_t)((final_width - 1) >> 3);
1040 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1041             if (transformations & PNG_PACKSWAP)
1042             {
1043                sshift = (int)((row_info->width + 7) & 7);
1044                dshift = (int)((final_width + 7) & 7);
1045                s_start = 7;
1046                s_end = 0;
1047                s_inc = -1;
1048             }
1049             else
1050 #endif
1051             {
1052                sshift = 7 - (int)((row_info->width + 7) & 7);
1053                dshift = 7 - (int)((final_width + 7) & 7);
1054                s_start = 0;
1055                s_end = 7;
1056                s_inc = 1;
1057             }
1058 
1059             for (i = row_info->width; i; i--)
1060             {
1061                v = (png_byte)((*sp >> sshift) & 0x1);
1062                for (j = 0; j < png_pass_inc[pass]; j++)
1063                {
1064                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1065                   *dp |= (png_byte)(v << dshift);
1066                   if (dshift == s_end)
1067                   {
1068                      dshift = s_start;
1069                      dp--;
1070                   }
1071                   else
1072                      dshift += s_inc;
1073                }
1074                if (sshift == s_end)
1075                {
1076                   sshift = s_start;
1077                   sp--;
1078                }
1079                else
1080                   sshift += s_inc;
1081             }
1082             break;
1083          }
1084 
1085          case 2:
1086          {
1087             png_bytep sp, dp;
1088             int sshift, dshift;
1089             int s_start, s_end, s_inc;
1090             png_uint_32 i;
1091 
1092             sp = row + (png_size_t)((row_info->width - 1) >> 2);
1093             dp = row + (png_size_t)((final_width - 1) >> 2);
1094 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1095             if (transformations & PNG_PACKSWAP)
1096             {
1097                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1098                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1099                s_start = 6;
1100                s_end = 0;
1101                s_inc = -2;
1102             }
1103             else
1104 #endif
1105             {
1106                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1107                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1108                s_start = 0;
1109                s_end = 6;
1110                s_inc = 2;
1111             }
1112 
1113             for (i = row_info->width; i; i--)
1114             {
1115                png_byte v;
1116                int j;
1117 
1118                v = (png_byte)((*sp >> sshift) & 0x3);
1119                for (j = 0; j < png_pass_inc[pass]; j++)
1120                {
1121                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1122                   *dp |= (png_byte)(v << dshift);
1123                   if (dshift == s_end)
1124                   {
1125                      dshift = s_start;
1126                      dp--;
1127                   }
1128                   else
1129                      dshift += s_inc;
1130                }
1131                if (sshift == s_end)
1132                {
1133                   sshift = s_start;
1134                   sp--;
1135                }
1136                else
1137                   sshift += s_inc;
1138             }
1139             break;
1140          }
1141 
1142          case 4:
1143          {
1144             png_bytep sp, dp;
1145             int sshift, dshift;
1146             int s_start, s_end, s_inc;
1147             png_uint_32 i;
1148 
1149             sp = row + (png_size_t)((row_info->width - 1) >> 1);
1150             dp = row + (png_size_t)((final_width - 1) >> 1);
1151 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1152             if (transformations & PNG_PACKSWAP)
1153             {
1154                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1155                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1156                s_start = 4;
1157                s_end = 0;
1158                s_inc = -4;
1159             }
1160             else
1161 #endif
1162             {
1163                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1164                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1165                s_start = 0;
1166                s_end = 4;
1167                s_inc = 4;
1168             }
1169 
1170             for (i = row_info->width; i; i--)
1171             {
1172                png_byte v;
1173                int j;
1174 
1175                v = (png_byte)((*sp >> sshift) & 0xf);
1176                for (j = 0; j < png_pass_inc[pass]; j++)
1177                {
1178                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1179                   *dp |= (png_byte)(v << dshift);
1180                   if (dshift == s_end)
1181                   {
1182                      dshift = s_start;
1183                      dp--;
1184                   }
1185                   else
1186                      dshift += s_inc;
1187                }
1188                if (sshift == s_end)
1189                {
1190                   sshift = s_start;
1191                   sp--;
1192                }
1193                else
1194                   sshift += s_inc;
1195             }
1196             break;
1197          }
1198 
1199          default:         // This is the place where the routine is modified
1200          {
1201             __int64 const4 = 0x0000000000FFFFFF;
1202             // __int64 const5 = 0x000000FFFFFF0000;  // unused...
1203             __int64 const6 = 0x00000000000000FF;
1204             png_bytep sptr, dp;
1205             png_uint_32 i;
1206             png_size_t pixel_bytes;
1207             int width = row_info->width;
1208 
1209             pixel_bytes = (row_info->pixel_depth >> 3);
1210 
1211             sptr = row + (width - 1) * pixel_bytes;
1212             dp = row + (final_width - 1) * pixel_bytes;
1213             // New code by Nirav Chhatrapati - Intel Corporation
1214             // sign fix by GRR
1215             // NOTE:  there is NO MMX code for 48-bit and 64-bit images
1216 
1217             // use MMX routine if machine supports it
1218 #if !defined(PNG_1_0_X)
1219             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1220                 /* && mmx_supported */ )
1221 #else
1222             if (mmx_supported)
1223 #endif
1224             {
1225                if (pixel_bytes == 3)
1226                {
1227                   if (((pass == 0) || (pass == 1)) && width)
1228                   {
1229                      _asm
1230                      {
1231                         mov esi, sptr
1232                         mov edi, dp
1233                         mov ecx, width
1234                         sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
1235 loop_pass0:
1236                         movd mm0, [esi]     ; X X X X X v2 v1 v0
1237                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1238                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1239                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1240                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1241                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1242                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1243                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1244                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1245                         movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
1246                         psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
1247                         movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
1248                         punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
1249                         movq [edi+16] , mm4
1250                         psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
1251                         movq [edi+8] , mm3
1252                         punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
1253                         sub esi, 3
1254                         movq [edi], mm0
1255                         sub edi, 24
1256                         //sub esi, 3
1257                         dec ecx
1258                         jnz loop_pass0
1259                         EMMS
1260                      }
1261                   }
1262                   else if (((pass == 2) || (pass == 3)) && width)
1263                   {
1264                      _asm
1265                      {
1266                         mov esi, sptr
1267                         mov edi, dp
1268                         mov ecx, width
1269                         sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
1270 loop_pass2:
1271                         movd mm0, [esi]     ; X X X X X v2 v1 v0
1272                         pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1273                         movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1274                         psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1275                         movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1276                         psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1277                         psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1278                         por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1279                         por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1280                         movq [edi+4], mm0   ; move to memory
1281                         psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
1282                         movd [edi], mm0     ; move to memory
1283                         sub esi, 3
1284                         sub edi, 12
1285                         dec ecx
1286                         jnz loop_pass2
1287                         EMMS
1288                      }
1289                   }
1290                   else if (width) /* && ((pass == 4) || (pass == 5)) */
1291                   {
1292                      int width_mmx = ((width >> 1) << 1) - 8;
1293                      if (width_mmx < 0)
1294                          width_mmx = 0;
1295                      width -= width_mmx;        // leaves 8 or 9 pix (24 or 27 bytes) for the C loop if the row had >= 8 pix, else the whole row
1296                      if (width_mmx)
1297                      {
1298                         _asm
1299                         {
1300                            mov esi, sptr
1301                            mov edi, dp
1302                            mov ecx, width_mmx
1303                            sub esi, 3
1304                            sub edi, 9
1305 loop_pass4:
1306                            movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
1307                            movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
1308                            movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
1309                            psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
1310                            pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
1311                            psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
1312                            por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
1313                            movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
1314                            psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
1315                            movq [edi], mm0     ; move quad to memory
1316                            psrlq mm5, 16       ; 0 0 0 0 0 X X v2
1317                            pand mm5, const6    ; 0 0 0 0 0 0 0 v2
1318                            por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
1319                            movd [edi+8], mm6   ; move double to memory
1320                            sub esi, 6
1321                            sub edi, 12
1322                            sub ecx, 2
1323                            jnz loop_pass4
1324                            EMMS
1325                         }
1326                      }
1327 
1328                      sptr -= width_mmx*3;
1329                      dp -= width_mmx*6;
1330                      for (i = width; i; i--)
1331                      {
1332                         png_byte v[8];
1333                         int j;
1334 
1335                         png_memcpy(v, sptr, 3);
1336                         for (j = 0; j < png_pass_inc[pass]; j++)
1337                         {
1338                            png_memcpy(dp, v, 3);
1339                            dp -= 3;
1340                         }
1341                         sptr -= 3;
1342                      }
1343                   }
1344                } /* end of pixel_bytes == 3 */
1345 
1346                else if (pixel_bytes == 1)
1347                {
1348                   if (((pass == 0) || (pass == 1)) && width)
1349                   {
1350                      int width_mmx = ((width >> 2) << 2);
1351                      width -= width_mmx;
1352                      if (width_mmx)
1353                      {
1354                         _asm
1355                         {
1356                            mov esi, sptr
1357                            mov edi, dp
1358                            mov ecx, width_mmx
1359                            sub edi, 31
1360                            sub esi, 3
1361 loop1_pass0:
1362                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1363                            movq mm1, mm0       ; X X X X v0 v1 v2 v3
1364                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1365                            movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1366                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1367                            movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
1368                            punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
1369                            punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
1370                            movq [edi], mm0     ; move to memory v3
1371                            punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
1372                            movq [edi+8], mm3   ; move to memory v2
1373                            movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
1374                            punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
1375                            punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
1376                            movq [edi+16], mm2  ; move to memory v1
1377                            movq [edi+24], mm4  ; move to memory v0
1378                            sub esi, 4
1379                            sub edi, 32
1380                            sub ecx, 4
1381                            jnz loop1_pass0
1382                            EMMS
1383                         }
1384                      }
1385 
1386                      sptr -= width_mmx;
1387                      dp -= width_mmx*8;
1388                      for (i = width; i; i--)
1389                      {
1390                         int j;
1391 
1392                        /* I simplified this part in version 1.0.4e
1393                         * here and in several other instances where
1394                         * pixel_bytes == 1  -- GR-P
1395                         *
1396                         * Original code:
1397                         *
1398                         * png_byte v[8];
1399                         * png_memcpy(v, sptr, pixel_bytes);
1400                         * for (j = 0; j < png_pass_inc[pass]; j++)
1401                         * {
1402                         *    png_memcpy(dp, v, pixel_bytes);
1403                         *    dp -= pixel_bytes;
1404                         * }
1405                         * sptr -= pixel_bytes;
1406                         *
1407                         * Replacement code is in the next three lines:
1408                         */
1409 
1410                         for (j = 0; j < png_pass_inc[pass]; j++)
1411                            *dp-- = *sptr;
1412                         sptr--;
1413                      }
1414                   }
1415                   else if (((pass == 2) || (pass == 3)) && width)
1416                   {
1417                      int width_mmx = ((width >> 2) << 2);
1418                      width -= width_mmx;
1419                      if (width_mmx)
1420                      {
1421                         _asm
1422                         {
1423                            mov esi, sptr
1424                            mov edi, dp
1425                            mov ecx, width_mmx
1426                            sub edi, 15
1427                            sub esi, 3
1428 loop1_pass2:
1429                            movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1430                            punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1431                            movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1432                            punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1433                            punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
1434                            movq [edi], mm0     ; move to memory v2 and v3
1435                            sub esi, 4
1436                            movq [edi+8], mm1   ; move to memory v1     and v0
1437                            sub edi, 16
1438                            sub ecx, 4
1439                            jnz loop1_pass2
1440                            EMMS
1441                         }
1442                      }
1443 
1444                      sptr -= width_mmx;
1445                      dp -= width_mmx*4;
1446                      for (i = width; i; i--)
1447                      {
1448                         int j;
1449 
1450                         for (j = 0; j < png_pass_inc[pass]; j++)
1451                         {
1452                            *dp-- = *sptr;
1453                         }
1454                         sptr --;
1455                      }
1456                   }
1457                   else if (width) /* && ((pass == 4) || (pass == 5))) */
1458                   {
1459                      int width_mmx = ((width >> 3) << 3);
1460                      width -= width_mmx;
1461                      if (width_mmx)
1462                      {
1463                         _asm
1464                         {
1465                            mov esi, sptr
1466                            mov edi, dp
1467                            mov ecx, width_mmx
1468                            sub edi, 15
1469                            sub esi, 7
1470 loop1_pass4:
1471                            movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
1472                            movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
1473                            punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
1474                            //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
1475                            punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
1476                            movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
1477                            sub esi, 8
1478                            movq [edi], mm0     ; move to memory v4 v5 v6 and v7
1479                            //sub esi, 4
1480                            sub edi, 16
1481                            sub ecx, 8
1482                            jnz loop1_pass4
1483                            EMMS
1484                         }
1485                      }
1486 
1487                      sptr -= width_mmx;
1488                      dp -= width_mmx*2;
1489                      for (i = width; i; i--)
1490                      {
1491                         int j;
1492 
1493                         for (j = 0; j < png_pass_inc[pass]; j++)
1494                         {
1495                            *dp-- = *sptr;
1496                         }
1497                         sptr --;
1498                      }
1499                   }
1500                } /* end of pixel_bytes == 1 */
1501 
1502                else if (pixel_bytes == 2)
1503                {
1504                   if (((pass == 0) || (pass == 1)) && width)
1505                   {
1506                      int width_mmx = ((width >> 1) << 1);
1507                      width -= width_mmx;
1508                      if (width_mmx)
1509                      {
1510                         _asm
1511                         {
1512                            mov esi, sptr
1513                            mov edi, dp
1514                            mov ecx, width_mmx
1515                            sub esi, 2
1516                            sub edi, 30
1517 loop2_pass0:
1518                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1519                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1520                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1521                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1522                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1523                            movq [edi], mm0
1524                            movq [edi + 8], mm0
1525                            movq [edi + 16], mm1
1526                            movq [edi + 24], mm1
1527                            sub esi, 4
1528                            sub edi, 32
1529                            sub ecx, 2
1530                            jnz loop2_pass0
1531                            EMMS
1532                         }
1533                      }
1534 
1535                      sptr -= (width_mmx*2 - 2);            // sign fixed
1536                      dp -= (width_mmx*16 - 2);            // sign fixed
1537                      for (i = width; i; i--)
1538                      {
1539                         png_byte v[8];
1540                         int j;
1541                         sptr -= 2;
1542                         png_memcpy(v, sptr, 2);
1543                         for (j = 0; j < png_pass_inc[pass]; j++)
1544                         {
1545                            dp -= 2;
1546                            png_memcpy(dp, v, 2);
1547                         }
1548                      }
1549                   }
1550                   else if (((pass == 2) || (pass == 3)) && width)
1551                   {
1552                      int width_mmx = ((width >> 1) << 1) ;
1553                      width -= width_mmx;
1554                      if (width_mmx)
1555                      {
1556                         _asm
1557                         {
1558                            mov esi, sptr
1559                            mov edi, dp
1560                            mov ecx, width_mmx
1561                            sub esi, 2
1562                            sub edi, 14
1563 loop2_pass2:
1564                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1565                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1566                            movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1567                            punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1568                            punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1569                            movq [edi], mm0
1570                            sub esi, 4
1571                            movq [edi + 8], mm1
1572                            //sub esi, 4
1573                            sub edi, 16
1574                            sub ecx, 2
1575                            jnz loop2_pass2
1576                            EMMS
1577                         }
1578                      }
1579 
1580                      sptr -= (width_mmx*2 - 2);            // sign fixed
1581                      dp -= (width_mmx*8 - 2);            // sign fixed
1582                      for (i = width; i; i--)
1583                      {
1584                         png_byte v[8];
1585                         int j;
1586                         sptr -= 2;
1587                         png_memcpy(v, sptr, 2);
1588                         for (j = 0; j < png_pass_inc[pass]; j++)
1589                         {
1590                            dp -= 2;
1591                            png_memcpy(dp, v, 2);
1592                         }
1593                      }
1594                   }
1595                   else if (width)  // pass == 4 or 5
1596                   {
1597                      int width_mmx = ((width >> 1) << 1) ;
1598                      width -= width_mmx;
1599                      if (width_mmx)
1600                      {
1601                         _asm
1602                         {
1603                            mov esi, sptr
1604                            mov edi, dp
1605                            mov ecx, width_mmx
1606                            sub esi, 2
1607                            sub edi, 6
1608 loop2_pass4:
1609                            movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1610                            punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1611                            sub esi, 4
1612                            movq [edi], mm0
1613                            sub edi, 8
1614                            sub ecx, 2
1615                            jnz loop2_pass4
1616                            EMMS
1617                         }
1618                      }
1619 
1620                      sptr -= (width_mmx*2 - 2);            // sign fixed
1621                      dp -= (width_mmx*4 - 2);            // sign fixed
1622                      for (i = width; i; i--)
1623                      {
1624                         png_byte v[8];
1625                         int j;
1626                         sptr -= 2;
1627                         png_memcpy(v, sptr, 2);
1628                         for (j = 0; j < png_pass_inc[pass]; j++)
1629                         {
1630                            dp -= 2;
1631                            png_memcpy(dp, v, 2);
1632                         }
1633                      }
1634                   }
1635                } /* end of pixel_bytes == 2 */
1636 
1637                else if (pixel_bytes == 4)
1638                {
1639                   if (((pass == 0) || (pass == 1)) && width)
1640                   {
1641                      int width_mmx = ((width >> 1) << 1) ;
1642                      width -= width_mmx;
1643                      if (width_mmx)
1644                      {
1645                         _asm
1646                         {
1647                            mov esi, sptr
1648                            mov edi, dp
1649                            mov ecx, width_mmx
1650                            sub esi, 4
1651                            sub edi, 60
1652 loop4_pass0:
1653                            movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
1654                            movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
1655                            punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
1656                            punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
1657                            movq [edi], mm0
1658                            movq [edi + 8], mm0
1659                            movq [edi + 16], mm0
1660                            movq [edi + 24], mm0
1661                            movq [edi+32], mm1
1662                            movq [edi + 40], mm1
1663                            movq [edi+ 48], mm1
1664                            sub esi, 8
1665                            movq [edi + 56], mm1
1666                            sub edi, 64
1667                            sub ecx, 2
1668                            jnz loop4_pass0
1669                            EMMS
1670                         }
1671                      }
1672 
1673                      sptr -= (width_mmx*4 - 4);            // sign fixed
1674                      dp -= (width_mmx*32 - 4);            // sign fixed
1675                      for (i = width; i; i--)
1676                      {
1677                         png_byte v[8];
1678                         int j;
1679                         sptr -= 4;
1680                         png_memcpy(v, sptr, 4);
1681                         for (j = 0; j < png_pass_inc[pass]; j++)
1682                         {
1683                            dp -= 4;
1684                            png_memcpy(dp, v, 4);
1685                         }
1686                      }
1687                   }
1688                   else if (((pass == 2) || (pass == 3)) && width)
1689                   {
1690                      int width_mmx = ((width >> 1) << 1) ;
1691                      width -= width_mmx;
1692                      if (width_mmx)
1693                      {
1694                         _asm
1695                         {
1696                            mov esi, sptr
1697                            mov edi, dp
1698                            mov ecx, width_mmx
1699                            sub esi, 4
1700                            sub edi, 28
1701 loop4_pass2:
1702                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1703                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1704                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1705                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1706                            movq [edi], mm0
1707                            movq [edi + 8], mm0
1708                            movq [edi+16], mm1
1709                            movq [edi + 24], mm1
1710                            sub esi, 8
1711                            sub edi, 32
1712                            sub ecx, 2
1713                            jnz loop4_pass2
1714                            EMMS
1715                         }
1716                      }
1717 
1718                      sptr -= (width_mmx*4 - 4);            // sign fixed
1719                      dp -= (width_mmx*16 - 4);            // sign fixed
1720                      for (i = width; i; i--)
1721                      {
1722                         png_byte v[8];
1723                         int j;
1724                         sptr -= 4;
1725                         png_memcpy(v, sptr, 4);
1726                         for (j = 0; j < png_pass_inc[pass]; j++)
1727                         {
1728                            dp -= 4;
1729                            png_memcpy(dp, v, 4);
1730                         }
1731                      }
1732                   }
1733                   else if (width)  // pass == 4 or 5
1734                   {
1735                      int width_mmx = ((width >> 1) << 1) ;
1736                      width -= width_mmx;
1737                      if (width_mmx)
1738                      {
1739                         _asm
1740                         {
1741                            mov esi, sptr
1742                            mov edi, dp
1743                            mov ecx, width_mmx
1744                            sub esi, 4
1745                            sub edi, 12
1746 loop4_pass4:
1747                            movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1748                            movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1749                            punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1750                            punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1751                            movq [edi], mm0
1752                            sub esi, 8
1753                            movq [edi + 8], mm1
1754                            sub edi, 16
1755                            sub ecx, 2
1756                            jnz loop4_pass4
1757                            EMMS
1758                         }
1759                      }
1760 
1761                      sptr -= (width_mmx*4 - 4);          // sign fixed
1762                      dp -= (width_mmx*8 - 4);            // sign fixed
1763                      for (i = width; i; i--)
1764                      {
1765                         png_byte v[8];
1766                         int j;
1767                         sptr -= 4;
1768                         png_memcpy(v, sptr, 4);
1769                         for (j = 0; j < png_pass_inc[pass]; j++)
1770                         {
1771                            dp -= 4;
1772                            png_memcpy(dp, v, 4);
1773                         }
1774                      }
1775                   }
1776 
1777                } /* end of pixel_bytes == 4 */
1778 
1779                else if (pixel_bytes == 6)
1780                {
1781                   for (i = width; i; i--)
1782                   {
1783                      png_byte v[8];
1784                      int j;
1785                      png_memcpy(v, sptr, 6);
1786                      for (j = 0; j < png_pass_inc[pass]; j++)
1787                      {
1788                         png_memcpy(dp, v, 6);
1789                         dp -= 6;
1790                      }
1791                      sptr -= 6;
1792                   }
1793                } /* end of pixel_bytes == 6 */
1794 
1795                else
1796                {
1797                   for (i = width; i; i--)
1798                   {
1799                      png_byte v[8];
1800                      int j;
1801                      png_memcpy(v, sptr, pixel_bytes);
1802                      for (j = 0; j < png_pass_inc[pass]; j++)
1803                      {
1804                         png_memcpy(dp, v, pixel_bytes);
1805                         dp -= pixel_bytes;
1806                      }
1807                      sptr-= pixel_bytes;
1808                   }
1809                }
1810             } /* end of mmx_supported */
1811 
1812             else /* MMX not supported:  use modified C code - takes advantage
1813                   * of inlining of memcpy for a constant */
1814             {
1815                if (pixel_bytes == 1)
1816                {
1817                   for (i = width; i; i--)
1818                   {
1819                      int j;
1820                      for (j = 0; j < png_pass_inc[pass]; j++)
1821                         *dp-- = *sptr;
1822                      sptr--;
1823                   }
1824                }
1825                else if (pixel_bytes == 3)
1826                {
1827                   for (i = width; i; i--)
1828                   {
1829                      png_byte v[8];
1830                      int j;
1831                      png_memcpy(v, sptr, pixel_bytes);
1832                      for (j = 0; j < png_pass_inc[pass]; j++)
1833                      {
1834                         png_memcpy(dp, v, pixel_bytes);
1835                         dp -= pixel_bytes;
1836                      }
1837                      sptr -= pixel_bytes;
1838                   }
1839                }
1840                else if (pixel_bytes == 2)
1841                {
1842                   for (i = width; i; i--)
1843                   {
1844                      png_byte v[8];
1845                      int j;
1846                      png_memcpy(v, sptr, pixel_bytes);
1847                      for (j = 0; j < png_pass_inc[pass]; j++)
1848                      {
1849                         png_memcpy(dp, v, pixel_bytes);
1850                         dp -= pixel_bytes;
1851                      }
1852                      sptr -= pixel_bytes;
1853                   }
1854                }
1855                else if (pixel_bytes == 4)
1856                {
1857                   for (i = width; i; i--)
1858                   {
1859                      png_byte v[8];
1860                      int j;
1861                      png_memcpy(v, sptr, pixel_bytes);
1862                      for (j = 0; j < png_pass_inc[pass]; j++)
1863                      {
1864                         png_memcpy(dp, v, pixel_bytes);
1865                         dp -= pixel_bytes;
1866                      }
1867                      sptr -= pixel_bytes;
1868                   }
1869                }
1870                else if (pixel_bytes == 6)
1871                {
1872                   for (i = width; i; i--)
1873                   {
1874                      png_byte v[8];
1875                      int j;
1876                      png_memcpy(v, sptr, pixel_bytes);
1877                      for (j = 0; j < png_pass_inc[pass]; j++)
1878                      {
1879                         png_memcpy(dp, v, pixel_bytes);
1880                         dp -= pixel_bytes;
1881                      }
1882                      sptr -= pixel_bytes;
1883                   }
1884                }
1885                else
1886                {
1887                   for (i = width; i; i--)
1888                   {
1889                      png_byte v[8];
1890                      int j;
1891                      png_memcpy(v, sptr, pixel_bytes);
1892                      for (j = 0; j < png_pass_inc[pass]; j++)
1893                      {
1894                         png_memcpy(dp, v, pixel_bytes);
1895                         dp -= pixel_bytes;
1896                      }
1897                      sptr -= pixel_bytes;
1898                   }
1899                }
1900 
1901             } /* end of MMX not supported */
1902             break;
1903          }
1904       } /* end switch (row_info->pixel_depth) */
1905 
1906       row_info->width = final_width;
1907 
1908       row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
1909    }
1910 
1911 }
1912 
1913 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1914 
1915 
// 64-bit masks and shift counts shared by the MMX filter routines below.
// They are defined at file scope, rather than as locals, to ensure that
// each one is aligned on an 8-byte boundary.

union uAll {
   __int64 use;     // the 64-bit value itself; accessed as <name>.use
   double  align;   // not referenced in the code visible here; presumably
                    // present only to give the union 8-byte alignment
};

// Constant masks: bit 0 of every byte, and bits 0-6 of every byte.
union uAll LBCarryMask = {0x0101010101010101};
union uAll HBClearMask = {0x7f7f7f7f7f7f7f7f};

// Working values.  The filter routines store into ActiveMask, ShiftBpp and
// ShiftRem at run time (per bytes-per-pixel case) before the MMX code reads
// them.  ActiveMask2 and ActiveMaskEnd are not touched by the Avg filter;
// presumably other filter routines in this file use them.
// NOTE(review): these are shared, writable globals -- confirm whether
// concurrent decoding on several threads is expected to be safe.
union uAll ActiveMask;
union uAll ActiveMask2;
union uAll ActiveMaskEnd;
union uAll ShiftBpp;
union uAll ShiftRem;
1925 
1926 
1927 // Optimized code for PNG Average filter decoder
1928 void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info,png_bytep row,png_bytep prev_row)1929 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1930                             , png_bytep prev_row)
1931 {
1932    int bpp;
1933    png_uint_32 FullLength;
1934    png_uint_32 MMXLength;
1935    //png_uint_32 len;
1936    int diff;
1937 
1938    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1939    FullLength  = row_info->rowbytes; // # of bytes to filter
1940    _asm {
1941          // Init address pointers and offset
1942          mov edi, row          // edi ==> Avg(x)
1943          xor ebx, ebx          // ebx ==> x
1944          mov edx, edi
1945          mov esi, prev_row           // esi ==> Prior(x)
1946          sub edx, bpp          // edx ==> Raw(x-bpp)
1947 
1948          xor eax, eax
1949          // Compute the Raw value for the first bpp bytes
1950          //    Raw(x) = Avg(x) + (Prior(x)/2)
1951 davgrlp:
1952          mov al, [esi + ebx]   // Load al with Prior(x)
1953          inc ebx
1954          shr al, 1             // divide by 2
1955          add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
1956          cmp ebx, bpp
1957          mov [edi+ebx-1], al    // Write back Raw(x);
1958                             // mov does not affect flags; -1 to offset inc ebx
1959          jb davgrlp
1960          // get # of bytes to alignment
1961          mov diff, edi         // take start of row
1962          add diff, ebx         // add bpp
1963          add diff, 0xf         // add 7 + 8 to incr past alignment boundary
1964          and diff, 0xfffffff8  // mask to alignment boundary
1965          sub diff, edi         // subtract from start ==> value ebx at alignment
1966          jz davggo
1967          // fix alignment
1968          // Compute the Raw value for the bytes upto the alignment boundary
1969          //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1970          xor ecx, ecx
1971 davglp1:
1972          xor eax, eax
1973          mov cl, [esi + ebx]        // load cl with Prior(x)
1974          mov al, [edx + ebx]  // load al with Raw(x-bpp)
1975          add ax, cx
1976          inc ebx
1977          shr ax, 1            // divide by 2
1978          add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
1979          cmp ebx, diff              // Check if at alignment boundary
1980          mov [edi+ebx-1], al        // Write back Raw(x);
1981                             // mov does not affect flags; -1 to offset inc ebx
1982          jb davglp1               // Repeat until at alignment boundary
1983 davggo:
1984          mov eax, FullLength
1985          mov ecx, eax
1986          sub eax, ebx          // subtract alignment fix
1987          and eax, 0x00000007   // calc bytes over mult of 8
1988          sub ecx, eax          // drop over bytes from original length
1989          mov MMXLength, ecx
1990    } // end _asm block
1991    // Now do the math for the rest of the row
1992    switch ( bpp )
1993    {
1994       case 3:
1995       {
1996          ActiveMask.use  = 0x0000000000ffffff;
1997          ShiftBpp.use = 24;    // == 3 * 8
1998          ShiftRem.use = 40;    // == 64 - 24
1999          _asm {
2000             // Re-init address pointers and offset
2001             movq mm7, ActiveMask
2002             mov ebx, diff      // ebx ==> x = offset to alignment boundary
2003             movq mm5, LBCarryMask
2004             mov edi, row       // edi ==> Avg(x)
2005             movq mm4, HBClearMask
2006             mov esi, prev_row        // esi ==> Prior(x)
2007             // PRIME the pump (load the first Raw(x-bpp) data set
2008             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2009                                // (we correct position in loop below)
2010 davg3lp:
2011             movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
2012             // Add (Prev_row/2) to Average
2013             movq mm3, mm5
2014             psrlq mm2, ShiftRem      // Correct position Raw(x-bpp) data
2015             movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
2016             movq mm6, mm7
2017             pand mm3, mm1      // get lsb for each prev_row byte
2018             psrlq mm1, 1       // divide prev_row bytes by 2
2019             pand  mm1, mm4     // clear invalid bit 7 of each byte
2020             paddb mm0, mm1     // add (Prev_row/2) to Avg for each byte
2021             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2022             movq mm1, mm3      // now use mm1 for getting LBCarrys
2023             pand mm1, mm2      // get LBCarrys for each byte where both
2024                                // lsb's were == 1 (Only valid for active group)
2025             psrlq mm2, 1       // divide raw bytes by 2
2026             pand  mm2, mm4     // clear invalid bit 7 of each byte
2027             paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
2028             pand mm2, mm6      // Leave only Active Group 1 bytes to add to Avg
2029             paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
2030                                //  byte
2031             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2032             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
2033             movq mm2, mm0        // mov updated Raws to mm2
2034             psllq mm2, ShiftBpp  // shift data to position correctly
2035             movq mm1, mm3        // now use mm1 for getting LBCarrys
2036             pand mm1, mm2      // get LBCarrys for each byte where both
2037                                // lsb's were == 1 (Only valid for active group)
2038             psrlq mm2, 1       // divide raw bytes by 2
2039             pand  mm2, mm4     // clear invalid bit 7 of each byte
2040             paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
2041             pand mm2, mm6      // Leave only Active Group 2 bytes to add to Avg
2042             paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
2043                                //  byte
2044 
2045             // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2046             psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
2047                                  // bytes
2048             movq mm2, mm0        // mov updated Raws to mm2
2049             psllq mm2, ShiftBpp  // shift data to position correctly
2050                               // Data only needs to be shifted once here to
2051                               // get the correct x-bpp offset.
2052             movq mm1, mm3     // now use mm1 for getting LBCarrys
2053             pand mm1, mm2     // get LBCarrys for each byte where both
2054                               // lsb's were == 1 (Only valid for active group)
2055             psrlq mm2, 1      // divide raw bytes by 2
2056             pand  mm2, mm4    // clear invalid bit 7 of each byte
2057             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2058             pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
2059             add ebx, 8
2060             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2061                               // byte
2062 
2063             // Now ready to write back to memory
2064             movq [edi + ebx - 8], mm0
2065             // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2066             cmp ebx, MMXLength
2067             movq mm2, mm0     // mov updated Raw(x) to mm2
2068             jb davg3lp
2069          } // end _asm block
2070       }
2071       break;
2072 
2073       case 6:
2074       case 4:
2075       case 7:
2076       case 5:
2077       {
2078          ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
2079                                                 // appropriate inactive bytes
2080          ShiftBpp.use = bpp << 3;
2081          ShiftRem.use = 64 - ShiftBpp.use;
2082          _asm {
2083             movq mm4, HBClearMask
2084             // Re-init address pointers and offset
2085             mov ebx, diff       // ebx ==> x = offset to alignment boundary
2086             // Load ActiveMask and clear all bytes except for 1st active group
2087             movq mm7, ActiveMask
2088             mov edi, row         // edi ==> Avg(x)
2089             psrlq mm7, ShiftRem
2090             mov esi, prev_row    // esi ==> Prior(x)
2091             movq mm6, mm7
2092             movq mm5, LBCarryMask
2093             psllq mm6, ShiftBpp  // Create mask for 2nd active group
2094             // PRIME the pump (load the first Raw(x-bpp) data set
2095             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2096                                  // (we correct position in loop below)
2097 davg4lp:
2098             movq mm0, [edi + ebx]
2099             psrlq mm2, ShiftRem  // shift data to position correctly
2100             movq mm1, [esi + ebx]
2101             // Add (Prev_row/2) to Average
2102             movq mm3, mm5
2103             pand mm3, mm1     // get lsb for each prev_row byte
2104             psrlq mm1, 1      // divide prev_row bytes by 2
2105             pand  mm1, mm4    // clear invalid bit 7 of each byte
2106             paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
2107             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2108             movq mm1, mm3     // now use mm1 for getting LBCarrys
2109             pand mm1, mm2     // get LBCarrys for each byte where both
2110                               // lsb's were == 1 (Only valid for active group)
2111             psrlq mm2, 1      // divide raw bytes by 2
2112             pand  mm2, mm4    // clear invalid bit 7 of each byte
2113             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2114             pand mm2, mm7     // Leave only Active Group 1 bytes to add to Avg
2115             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2116                               // byte
2117             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2118             movq mm2, mm0     // mov updated Raws to mm2
2119             psllq mm2, ShiftBpp // shift data to position correctly
2120             add ebx, 8
2121             movq mm1, mm3     // now use mm1 for getting LBCarrys
2122             pand mm1, mm2     // get LBCarrys for each byte where both
2123                               // lsb's were == 1 (Only valid for active group)
2124             psrlq mm2, 1      // divide raw bytes by 2
2125             pand  mm2, mm4    // clear invalid bit 7 of each byte
2126             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2127             pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
2128             paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2129                               // byte
2130             cmp ebx, MMXLength
2131             // Now ready to write back to memory
2132             movq [edi + ebx - 8], mm0
2133             // Prep Raw(x-bpp) for next loop
2134             movq mm2, mm0     // mov updated Raws to mm2
2135             jb davg4lp
2136          } // end _asm block
2137       }
2138       break;
2139       case 2:
2140       {
2141          ActiveMask.use  = 0x000000000000ffff;
2142          ShiftBpp.use = 16;   // == 2 * 8     [BUGFIX]
2143          ShiftRem.use = 48;   // == 64 - 16   [BUGFIX]
2144          _asm {
2145             // Load ActiveMask
2146             movq mm7, ActiveMask
2147             // Re-init address pointers and offset
2148             mov ebx, diff     // ebx ==> x = offset to alignment boundary
2149             movq mm5, LBCarryMask
2150             mov edi, row      // edi ==> Avg(x)
2151             movq mm4, HBClearMask
2152             mov esi, prev_row  // esi ==> Prior(x)
2153             // PRIME the pump (load the first Raw(x-bpp) data set
2154             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2155                               // (we correct position in loop below)
2156 davg2lp:
2157             movq mm0, [edi + ebx]
2158             psrlq mm2, ShiftRem  // shift data to position correctly   [BUGFIX]
2159             movq mm1, [esi + ebx]
2160             // Add (Prev_row/2) to Average
2161             movq mm3, mm5
2162             pand mm3, mm1     // get lsb for each prev_row byte
2163             psrlq mm1, 1      // divide prev_row bytes by 2
2164             pand  mm1, mm4    // clear invalid bit 7 of each byte
2165             movq mm6, mm7
2166             paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
2167             // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2168             movq mm1, mm3     // now use mm1 for getting LBCarrys
2169             pand mm1, mm2     // get LBCarrys for each byte where both
2170                               // lsb's were == 1 (Only valid for active group)
2171             psrlq mm2, 1      // divide raw bytes by 2
2172             pand  mm2, mm4    // clear invalid bit 7 of each byte
2173             paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2174             pand mm2, mm6     // Leave only Active Group 1 bytes to add to Avg
2175             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2176             // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2177             psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2178             movq mm2, mm0       // mov updated Raws to mm2
2179             psllq mm2, ShiftBpp // shift data to position correctly
2180             movq mm1, mm3       // now use mm1 for getting LBCarrys
2181             pand mm1, mm2       // get LBCarrys for each byte where both
2182                                 // lsb's were == 1 (Only valid for active group)
2183             psrlq mm2, 1        // divide raw bytes by 2
2184             pand  mm2, mm4      // clear invalid bit 7 of each byte
2185             paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
2186             pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
2187             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2188 
2189             // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2190             psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2191             movq mm2, mm0       // mov updated Raws to mm2
2192             psllq mm2, ShiftBpp // shift data to position correctly
2193                                 // Data only needs to be shifted once here to
2194                                 // get the correct x-bpp offset.
2195             movq mm1, mm3       // now use mm1 for getting LBCarrys
2196             pand mm1, mm2       // get LBCarrys for each byte where both
2197                                 // lsb's were == 1 (Only valid for active group)
2198             psrlq mm2, 1        // divide raw bytes by 2
2199             pand  mm2, mm4      // clear invalid bit 7 of each byte
2200             paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
2201             pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
2202             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2203 
2204             // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2205             psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
2206             movq mm2, mm0        // mov updated Raws to mm2
2207             psllq mm2, ShiftBpp  // shift data to position correctly
2208                                  // Data only needs to be shifted once here to
2209                                  // get the correct x-bpp offset.
2210             add ebx, 8
2211             movq mm1, mm3    // now use mm1 for getting LBCarrys
2212             pand mm1, mm2    // get LBCarrys for each byte where both
2213                              // lsb's were == 1 (Only valid for active group)
2214             psrlq mm2, 1     // divide raw bytes by 2
2215             pand  mm2, mm4   // clear invalid bit 7 of each byte
2216             paddb mm2, mm1   // add LBCarrys to (Raw(x-bpp)/2) for each byte
2217             pand mm2, mm6    // Leave only Active Group 2 bytes to add to Avg
2218             paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2219 
2220             cmp ebx, MMXLength
2221             // Now ready to write back to memory
2222             movq [edi + ebx - 8], mm0
2223             // Prep Raw(x-bpp) for next loop
2224             movq mm2, mm0    // mov updated Raws to mm2
2225             jb davg2lp
2226         } // end _asm block
2227       }
2228       break;
2229 
2230       case 1:                 // bpp == 1
2231       {
2232          _asm {
2233             // Re-init address pointers and offset
2234             mov ebx, diff     // ebx ==> x = offset to alignment boundary
2235             mov edi, row      // edi ==> Avg(x)
2236             cmp ebx, FullLength  // Test if offset at end of array
2237             jnb davg1end
2238             // Do Avg decode for remaining bytes
2239             mov esi, prev_row    // esi ==> Prior(x)
2240             mov edx, edi
2241             xor ecx, ecx         // zero ecx before using cl & cx in loop below
2242             sub edx, bpp         // edx ==> Raw(x-bpp)
2243 davg1lp:
2244             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2245             xor eax, eax
2246             mov cl, [esi + ebx]  // load cl with Prior(x)
2247             mov al, [edx + ebx]  // load al with Raw(x-bpp)
2248             add ax, cx
2249             inc ebx
2250             shr ax, 1            // divide by 2
2251             add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
2252             cmp ebx, FullLength  // Check if at end of array
2253             mov [edi+ebx-1], al  // Write back Raw(x);
2254                          // mov does not affect flags; -1 to offset inc ebx
2255             jb davg1lp
2256 davg1end:
2257          } // end _asm block
2258       }
2259       return;
2260 
2261       case 8:             // bpp == 8
2262       {
2263          _asm {
2264             // Re-init address pointers and offset
2265             mov ebx, diff           // ebx ==> x = offset to alignment boundary
2266             movq mm5, LBCarryMask
2267             mov edi, row            // edi ==> Avg(x)
2268             movq mm4, HBClearMask
2269             mov esi, prev_row       // esi ==> Prior(x)
2270             // PRIME the pump (load the first Raw(x-bpp) data set
2271             movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2272                                 // (NO NEED to correct position in loop below)
2273 davg8lp:
2274             movq mm0, [edi + ebx]
2275             movq mm3, mm5
2276             movq mm1, [esi + ebx]
2277             add ebx, 8
2278             pand mm3, mm1       // get lsb for each prev_row byte
2279             psrlq mm1, 1        // divide prev_row bytes by 2
2280             pand mm3, mm2       // get LBCarrys for each byte where both
2281                                 // lsb's were == 1
2282             psrlq mm2, 1        // divide raw bytes by 2
2283             pand  mm1, mm4      // clear invalid bit 7 of each byte
2284             paddb mm0, mm3      // add LBCarrys to Avg for each byte
2285             pand  mm2, mm4      // clear invalid bit 7 of each byte
2286             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
2287             paddb mm0, mm2      // add (Raw/2) to Avg for each byte
2288             cmp ebx, MMXLength
2289             movq [edi + ebx - 8], mm0
2290             movq mm2, mm0       // reuse as Raw(x-bpp)
2291             jb davg8lp
2292         } // end _asm block
2293       }
2294       break;
2295       default:                  // bpp greater than 8
2296       {
2297         _asm {
2298             movq mm5, LBCarryMask
2299             // Re-init address pointers and offset
2300             mov ebx, diff       // ebx ==> x = offset to alignment boundary
2301             mov edi, row        // edi ==> Avg(x)
2302             movq mm4, HBClearMask
2303             mov edx, edi
2304             mov esi, prev_row   // esi ==> Prior(x)
2305             sub edx, bpp        // edx ==> Raw(x-bpp)
2306 davgAlp:
2307             movq mm0, [edi + ebx]
2308             movq mm3, mm5
2309             movq mm1, [esi + ebx]
2310             pand mm3, mm1       // get lsb for each prev_row byte
2311             movq mm2, [edx + ebx]
2312             psrlq mm1, 1        // divide prev_row bytes by 2
2313             pand mm3, mm2       // get LBCarrys for each byte where both
2314                                 // lsb's were == 1
2315             psrlq mm2, 1        // divide raw bytes by 2
2316             pand  mm1, mm4      // clear invalid bit 7 of each byte
2317             paddb mm0, mm3      // add LBCarrys to Avg for each byte
2318             pand  mm2, mm4      // clear invalid bit 7 of each byte
2319             paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
2320             add ebx, 8
2321             paddb mm0, mm2      // add (Raw/2) to Avg for each byte
2322             cmp ebx, MMXLength
2323             movq [edi + ebx - 8], mm0
2324             jb davgAlp
2325         } // end _asm block
2326       }
2327       break;
2328    }                         // end switch ( bpp )
2329 
2330    _asm {
2331          // MMX acceleration complete now do clean-up
2332          // Check if any remaining bytes left to decode
2333          mov ebx, MMXLength    // ebx ==> x = offset bytes remaining after MMX
2334          mov edi, row          // edi ==> Avg(x)
2335          cmp ebx, FullLength   // Test if offset at end of array
2336          jnb davgend
2337          // Do Avg decode for remaining bytes
2338          mov esi, prev_row     // esi ==> Prior(x)
2339          mov edx, edi
2340          xor ecx, ecx          // zero ecx before using cl & cx in loop below
2341          sub edx, bpp          // edx ==> Raw(x-bpp)
2342 davglp2:
2343          // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2344          xor eax, eax
2345          mov cl, [esi + ebx]   // load cl with Prior(x)
2346          mov al, [edx + ebx]   // load al with Raw(x-bpp)
2347          add ax, cx
2348          inc ebx
2349          shr ax, 1              // divide by 2
2350          add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
2351          cmp ebx, FullLength    // Check if at end of array
2352          mov [edi+ebx-1], al    // Write back Raw(x);
2353                           // mov does not affect flags; -1 to offset inc ebx
2354          jb davglp2
2355 davgend:
2356          emms             // End MMX instructions; prep for possible FP instrs.
2357    } // end _asm block
2358 }
2359 
2360 // Optimized code for PNG Paeth filter decoder
2361 void /* PRIVATE */
png_read_filter_row_mmx_paeth(png_row_infop row_info,png_bytep row,png_bytep prev_row)2362 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2363                               png_bytep prev_row)
2364 {
2365    png_uint_32 FullLength;
2366    png_uint_32 MMXLength;
2367    //png_uint_32 len;
2368    int bpp;
2369    int diff;
2370    //int ptemp;
2371    int patemp, pbtemp, pctemp;
2372 
2373    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2374    FullLength  = row_info->rowbytes; // # of bytes to filter
2375    _asm
2376    {
2377          xor ebx, ebx        // ebx ==> x offset
2378          mov edi, row
2379          xor edx, edx        // edx ==> x-bpp offset
2380          mov esi, prev_row
2381          xor eax, eax
2382 
2383          // Compute the Raw value for the first bpp bytes
2384          // Note: the formula works out to be always
2385          //   Paeth(x) = Raw(x) + Prior(x)      where x < bpp
2386 dpthrlp:
2387          mov al, [edi + ebx]
2388          add al, [esi + ebx]
2389          inc ebx
2390          cmp ebx, bpp
2391          mov [edi + ebx - 1], al
2392          jb dpthrlp
2393          // get # of bytes to alignment
2394          mov diff, edi         // take start of row
2395          add diff, ebx         // add bpp
2396          xor ecx, ecx
2397          add diff, 0xf         // add 7 + 8 to incr past alignment boundary
2398          and diff, 0xfffffff8  // mask to alignment boundary
2399          sub diff, edi         // subtract from start ==> value ebx at alignment
2400          jz dpthgo
2401          // fix alignment
2402 dpthlp1:
2403          xor eax, eax
2404          // pav = p - a = (a + b - c) - a = b - c
2405          mov al, [esi + ebx]   // load Prior(x) into al
2406          mov cl, [esi + edx]   // load Prior(x-bpp) into cl
2407          sub eax, ecx          // subtract Prior(x-bpp)
2408          mov patemp, eax       // Save pav for later use
2409          xor eax, eax
2410          // pbv = p - b = (a + b - c) - b = a - c
2411          mov al, [edi + edx]   // load Raw(x-bpp) into al
2412          sub eax, ecx          // subtract Prior(x-bpp)
2413          mov ecx, eax
2414          // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2415          add eax, patemp       // pcv = pav + pbv
2416          // pc = abs(pcv)
2417          test eax, 0x80000000
2418          jz dpthpca
2419          neg eax               // reverse sign of neg values
2420 dpthpca:
2421          mov pctemp, eax       // save pc for later use
2422          // pb = abs(pbv)
2423          test ecx, 0x80000000
2424          jz dpthpba
2425          neg ecx               // reverse sign of neg values
2426 dpthpba:
2427          mov pbtemp, ecx       // save pb for later use
2428          // pa = abs(pav)
2429          mov eax, patemp
2430          test eax, 0x80000000
2431          jz dpthpaa
2432          neg eax               // reverse sign of neg values
2433 dpthpaa:
2434          mov patemp, eax       // save pa for later use
2435          // test if pa <= pb
2436          cmp eax, ecx
2437          jna dpthabb
2438          // pa > pb; now test if pb <= pc
2439          cmp ecx, pctemp
2440          jna dpthbbc
2441          // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2442          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2443          jmp dpthpaeth
2444 dpthbbc:
2445          // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2446          mov cl, [esi + ebx]   // load Prior(x) into cl
2447          jmp dpthpaeth
2448 dpthabb:
2449          // pa <= pb; now test if pa <= pc
2450          cmp eax, pctemp
2451          jna dpthabc
2452          // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2453          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2454          jmp dpthpaeth
2455 dpthabc:
2456          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2457          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
2458 dpthpaeth:
2459          inc ebx
2460          inc edx
2461          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2462          add [edi + ebx - 1], cl
2463          cmp ebx, diff
2464          jb dpthlp1
2465 dpthgo:
2466          mov ecx, FullLength
2467          mov eax, ecx
2468          sub eax, ebx          // subtract alignment fix
2469          and eax, 0x00000007   // calc bytes over mult of 8
2470          sub ecx, eax          // drop over bytes from original length
2471          mov MMXLength, ecx
2472    } // end _asm block
2473    // Now do the math for the rest of the row
2474    switch ( bpp )
2475    {
2476       case 3:
2477       {
2478          ActiveMask.use = 0x0000000000ffffff;
2479          ActiveMaskEnd.use = 0xffff000000000000;
2480          ShiftBpp.use = 24;    // == bpp(3) * 8
2481          ShiftRem.use = 40;    // == 64 - 24
2482          _asm
2483          {
2484             mov ebx, diff
2485             mov edi, row
2486             mov esi, prev_row
2487             pxor mm0, mm0
2488             // PRIME the pump (load the first Raw(x-bpp) data set
2489             movq mm1, [edi+ebx-8]
2490 dpth3lp:
2491             psrlq mm1, ShiftRem     // shift last 3 bytes to 1st 3 bytes
2492             movq mm2, [esi + ebx]   // load b=Prior(x)
2493             punpcklbw mm1, mm0      // Unpack High bytes of a
2494             movq mm3, [esi+ebx-8]   // Prep c=Prior(x-bpp) bytes
2495             punpcklbw mm2, mm0      // Unpack High bytes of b
2496             psrlq mm3, ShiftRem     // shift last 3 bytes to 1st 3 bytes
2497             // pav = p - a = (a + b - c) - a = b - c
2498             movq mm4, mm2
2499             punpcklbw mm3, mm0      // Unpack High bytes of c
2500             // pbv = p - b = (a + b - c) - b = a - c
2501             movq mm5, mm1
2502             psubw mm4, mm3
2503             pxor mm7, mm7
2504             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2505             movq mm6, mm4
2506             psubw mm5, mm3
2507 
2508             // pa = abs(p-a) = abs(pav)
2509             // pb = abs(p-b) = abs(pbv)
2510             // pc = abs(p-c) = abs(pcv)
2511             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2512             paddw mm6, mm5
2513             pand mm0, mm4       // Only pav bytes < 0 in mm7
2514             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2515             psubw mm4, mm0
2516             pand mm7, mm5       // Only pbv bytes < 0 in mm0
2517             psubw mm4, mm0
2518             psubw mm5, mm7
2519             pxor mm0, mm0
2520             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2521             pand mm0, mm6       // Only pav bytes < 0 in mm7
2522             psubw mm5, mm7
2523             psubw mm6, mm0
2524             //  test pa <= pb
2525             movq mm7, mm4
2526             psubw mm6, mm0
2527             pcmpgtw mm7, mm5    // pa > pb?
2528             movq mm0, mm7
2529             // use mm7 mask to merge pa & pb
2530             pand mm5, mm7
2531             // use mm0 mask copy to merge a & b
2532             pand mm2, mm0
2533             pandn mm7, mm4
2534             pandn mm0, mm1
2535             paddw mm7, mm5
2536             paddw mm0, mm2
2537             //  test  ((pa <= pb)? pa:pb) <= pc
2538             pcmpgtw mm7, mm6       // pab > pc?
2539             pxor mm1, mm1
2540             pand mm3, mm7
2541             pandn mm7, mm0
2542             paddw mm7, mm3
2543             pxor mm0, mm0
2544             packuswb mm7, mm1
2545             movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
2546             pand mm7, ActiveMask
2547             movq mm2, mm3           // load b=Prior(x) step 1
2548             paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2549             punpcklbw mm3, mm0      // Unpack High bytes of c
2550             movq [edi + ebx], mm7   // write back updated value
2551             movq mm1, mm7           // Now mm1 will be used as Raw(x-bpp)
2552             // Now do Paeth for 2nd set of bytes (3-5)
2553             psrlq mm2, ShiftBpp     // load b=Prior(x) step 2
2554             punpcklbw mm1, mm0      // Unpack High bytes of a
2555             pxor mm7, mm7
2556             punpcklbw mm2, mm0      // Unpack High bytes of b
2557             // pbv = p - b = (a + b - c) - b = a - c
2558             movq mm5, mm1
2559             // pav = p - a = (a + b - c) - a = b - c
2560             movq mm4, mm2
2561             psubw mm5, mm3
2562             psubw mm4, mm3
2563             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2564             //       pav + pbv = pbv + pav
2565             movq mm6, mm5
2566             paddw mm6, mm4
2567 
2568             // pa = abs(p-a) = abs(pav)
2569             // pb = abs(p-b) = abs(pbv)
2570             // pc = abs(p-c) = abs(pcv)
2571             pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
2572             pcmpgtw mm7, mm4       // Create mask pav bytes < 0
2573             pand mm0, mm5          // Only pbv bytes < 0 in mm0
2574             pand mm7, mm4          // Only pav bytes < 0 in mm7
2575             psubw mm5, mm0
2576             psubw mm4, mm7
2577             psubw mm5, mm0
2578             psubw mm4, mm7
2579             pxor mm0, mm0
2580             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2581             pand mm0, mm6          // Only pav bytes < 0 in mm7
2582             psubw mm6, mm0
2583             //  test pa <= pb
2584             movq mm7, mm4
2585             psubw mm6, mm0
2586             pcmpgtw mm7, mm5       // pa > pb?
2587             movq mm0, mm7
2588             // use mm7 mask to merge pa & pb
2589             pand mm5, mm7
2590             // use mm0 mask copy to merge a & b
2591             pand mm2, mm0
2592             pandn mm7, mm4
2593             pandn mm0, mm1
2594             paddw mm7, mm5
2595             paddw mm0, mm2
2596             //  test  ((pa <= pb)? pa:pb) <= pc
2597             pcmpgtw mm7, mm6       // pab > pc?
2598             movq mm2, [esi + ebx]  // load b=Prior(x)
2599             pand mm3, mm7
2600             pandn mm7, mm0
2601             pxor mm1, mm1
2602             paddw mm7, mm3
2603             pxor mm0, mm0
2604             packuswb mm7, mm1
2605             movq mm3, mm2           // load c=Prior(x-bpp) step 1
2606             pand mm7, ActiveMask
2607             punpckhbw mm2, mm0      // Unpack High bytes of b
2608             psllq mm7, ShiftBpp     // Shift bytes to 2nd group of 3 bytes
2609              // pav = p - a = (a + b - c) - a = b - c
2610             movq mm4, mm2
2611             paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2612             psllq mm3, ShiftBpp     // load c=Prior(x-bpp) step 2
2613             movq [edi + ebx], mm7   // write back updated value
2614             movq mm1, mm7
2615             punpckhbw mm3, mm0      // Unpack High bytes of c
2616             psllq mm1, ShiftBpp     // Shift bytes
2617                                     // Now mm1 will be used as Raw(x-bpp)
2618             // Now do Paeth for 3rd, and final, set of bytes (6-7)
2619             pxor mm7, mm7
2620             punpckhbw mm1, mm0      // Unpack High bytes of a
2621             psubw mm4, mm3
2622             // pbv = p - b = (a + b - c) - b = a - c
2623             movq mm5, mm1
2624             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2625             movq mm6, mm4
2626             psubw mm5, mm3
2627             pxor mm0, mm0
2628             paddw mm6, mm5
2629 
2630             // pa = abs(p-a) = abs(pav)
2631             // pb = abs(p-b) = abs(pbv)
2632             // pc = abs(p-c) = abs(pcv)
2633             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2634             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2635             pand mm0, mm4       // Only pav bytes < 0 in mm7
2636             pand mm7, mm5       // Only pbv bytes < 0 in mm0
2637             psubw mm4, mm0
2638             psubw mm5, mm7
2639             psubw mm4, mm0
2640             psubw mm5, mm7
2641             pxor mm0, mm0
2642             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2643             pand mm0, mm6       // Only pav bytes < 0 in mm7
2644             psubw mm6, mm0
2645             //  test pa <= pb
2646             movq mm7, mm4
2647             psubw mm6, mm0
2648             pcmpgtw mm7, mm5    // pa > pb?
2649             movq mm0, mm7
2650             // use mm0 mask copy to merge a & b
2651             pand mm2, mm0
2652             // use mm7 mask to merge pa & pb
2653             pand mm5, mm7
2654             pandn mm0, mm1
2655             pandn mm7, mm4
2656             paddw mm0, mm2
2657             paddw mm7, mm5
2658             //  test  ((pa <= pb)? pa:pb) <= pc
2659             pcmpgtw mm7, mm6    // pab > pc?
2660             pand mm3, mm7
2661             pandn mm7, mm0
2662             paddw mm7, mm3
2663             pxor mm1, mm1
2664             packuswb mm1, mm7
2665             // Step ebx to next set of 8 bytes and repeat loop til done
2666             add ebx, 8
2667             pand mm1, ActiveMaskEnd
2668             paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2669 
2670             cmp ebx, MMXLength
2671             pxor mm0, mm0              // pxor does not affect flags
2672             movq [edi + ebx - 8], mm1  // write back updated value
2673                                  // mm1 will be used as Raw(x-bpp) next loop
2674                            // mm3 ready to be used as Prior(x-bpp) next loop
2675             jb dpth3lp
2676          } // end _asm block
2677       }
2678       break;
2679 
2680       case 6:
2681       case 7:
2682       case 5:
2683       {
2684          ActiveMask.use  = 0x00000000ffffffff;
2685          ActiveMask2.use = 0xffffffff00000000;
2686          ShiftBpp.use = bpp << 3;    // == bpp * 8
2687          ShiftRem.use = 64 - ShiftBpp.use;
2688          _asm
2689          {
2690             mov ebx, diff
2691             mov edi, row
2692             mov esi, prev_row
2693             // PRIME the pump (load the first Raw(x-bpp) data set
2694             movq mm1, [edi+ebx-8]
2695             pxor mm0, mm0
2696 dpth6lp:
2697             // Must shift to position Raw(x-bpp) data
2698             psrlq mm1, ShiftRem
2699             // Do first set of 4 bytes
2700             movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
2701             punpcklbw mm1, mm0      // Unpack Low bytes of a
2702             movq mm2, [esi + ebx]   // load b=Prior(x)
2703             punpcklbw mm2, mm0      // Unpack Low bytes of b
2704             // Must shift to position Prior(x-bpp) data
2705             psrlq mm3, ShiftRem
2706             // pav = p - a = (a + b - c) - a = b - c
2707             movq mm4, mm2
2708             punpcklbw mm3, mm0      // Unpack Low bytes of c
2709             // pbv = p - b = (a + b - c) - b = a - c
2710             movq mm5, mm1
2711             psubw mm4, mm3
2712             pxor mm7, mm7
2713             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2714             movq mm6, mm4
2715             psubw mm5, mm3
2716             // pa = abs(p-a) = abs(pav)
2717             // pb = abs(p-b) = abs(pbv)
2718             // pc = abs(p-c) = abs(pcv)
2719             pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2720             paddw mm6, mm5
2721             pand mm0, mm4       // Only pav bytes < 0 in mm7
2722             pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2723             psubw mm4, mm0
2724             pand mm7, mm5       // Only pbv bytes < 0 in mm0
2725             psubw mm4, mm0
2726             psubw mm5, mm7
2727             pxor mm0, mm0
2728             pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2729             pand mm0, mm6       // Only pav bytes < 0 in mm7
2730             psubw mm5, mm7
2731             psubw mm6, mm0
2732             //  test pa <= pb
2733             movq mm7, mm4
2734             psubw mm6, mm0
2735             pcmpgtw mm7, mm5    // pa > pb?
2736             movq mm0, mm7
2737             // use mm7 mask to merge pa & pb
2738             pand mm5, mm7
2739             // use mm0 mask copy to merge a & b
2740             pand mm2, mm0
2741             pandn mm7, mm4
2742             pandn mm0, mm1
2743             paddw mm7, mm5
2744             paddw mm0, mm2
2745             //  test  ((pa <= pb)? pa:pb) <= pc
2746             pcmpgtw mm7, mm6    // pab > pc?
2747             pxor mm1, mm1
2748             pand mm3, mm7
2749             pandn mm7, mm0
2750             paddw mm7, mm3
2751             pxor mm0, mm0
2752             packuswb mm7, mm1
2753             movq mm3, [esi + ebx - 8]  // load c=Prior(x-bpp)
2754             pand mm7, ActiveMask
2755             psrlq mm3, ShiftRem
2756             movq mm2, [esi + ebx]      // load b=Prior(x) step 1
2757             paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
2758             movq mm6, mm2
2759             movq [edi + ebx], mm7      // write back updated value
2760             movq mm1, [edi+ebx-8]
2761             psllq mm6, ShiftBpp
2762             movq mm5, mm7
2763             psrlq mm1, ShiftRem
2764             por mm3, mm6
2765             psllq mm5, ShiftBpp
2766             punpckhbw mm3, mm0         // Unpack High bytes of c
2767             por mm1, mm5
2768             // Do second set of 4 bytes
2769             punpckhbw mm2, mm0         // Unpack High bytes of b
2770             punpckhbw mm1, mm0         // Unpack High bytes of a
2771             // pav = p - a = (a + b - c) - a = b - c
2772             movq mm4, mm2
2773             // pbv = p - b = (a + b - c) - b = a - c
2774             movq mm5, mm1
2775             psubw mm4, mm3
2776             pxor mm7, mm7
2777             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2778             movq mm6, mm4
2779             psubw mm5, mm3
2780             // pa = abs(p-a) = abs(pav)
2781             // pb = abs(p-b) = abs(pbv)
2782             // pc = abs(p-c) = abs(pcv)
2783             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2784             paddw mm6, mm5
2785             pand mm0, mm4          // Only pav bytes < 0 in mm7
2786             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2787             psubw mm4, mm0
2788             pand mm7, mm5          // Only pbv bytes < 0 in mm0
2789             psubw mm4, mm0
2790             psubw mm5, mm7
2791             pxor mm0, mm0
2792             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2793             pand mm0, mm6          // Only pav bytes < 0 in mm7
2794             psubw mm5, mm7
2795             psubw mm6, mm0
2796             //  test pa <= pb
2797             movq mm7, mm4
2798             psubw mm6, mm0
2799             pcmpgtw mm7, mm5       // pa > pb?
2800             movq mm0, mm7
2801             // use mm7 mask to merge pa & pb
2802             pand mm5, mm7
2803             // use mm0 mask copy to merge a & b
2804             pand mm2, mm0
2805             pandn mm7, mm4
2806             pandn mm0, mm1
2807             paddw mm7, mm5
2808             paddw mm0, mm2
2809             //  test  ((pa <= pb)? pa:pb) <= pc
2810             pcmpgtw mm7, mm6           // pab > pc?
2811             pxor mm1, mm1
2812             pand mm3, mm7
2813             pandn mm7, mm0
2814             pxor mm1, mm1
2815             paddw mm7, mm3
2816             pxor mm0, mm0
2817             // Step ebx to next set of 8 bytes and repeat loop til done
2818             add ebx, 8
2819             packuswb mm1, mm7
2820             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
2821             cmp ebx, MMXLength
2822             movq [edi + ebx - 8], mm1      // write back updated value
2823                                 // mm1 will be used as Raw(x-bpp) next loop
2824             jb dpth6lp
2825          } // end _asm block
2826       }
2827       break;
2828 
2829       case 4:
2830       {
2831          ActiveMask.use  = 0x00000000ffffffff;
2832          _asm {
2833             mov ebx, diff
2834             mov edi, row
2835             mov esi, prev_row
2836             pxor mm0, mm0
2837             // PRIME the pump (load the first Raw(x-bpp) data set
2838             movq mm1, [edi+ebx-8]    // Only time should need to read
2839                                      //  a=Raw(x-bpp) bytes
2840 dpth4lp:
2841             // Do first set of 4 bytes
2842             movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
2843             punpckhbw mm1, mm0       // Unpack Low bytes of a
2844             movq mm2, [esi + ebx]    // load b=Prior(x)
2845             punpcklbw mm2, mm0       // Unpack High bytes of b
2846             // pav = p - a = (a + b - c) - a = b - c
2847             movq mm4, mm2
2848             punpckhbw mm3, mm0       // Unpack High bytes of c
2849             // pbv = p - b = (a + b - c) - b = a - c
2850             movq mm5, mm1
2851             psubw mm4, mm3
2852             pxor mm7, mm7
2853             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2854             movq mm6, mm4
2855             psubw mm5, mm3
2856             // pa = abs(p-a) = abs(pav)
2857             // pb = abs(p-b) = abs(pbv)
2858             // pc = abs(p-c) = abs(pcv)
2859             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2860             paddw mm6, mm5
2861             pand mm0, mm4          // Only pav bytes < 0 in mm7
2862             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2863             psubw mm4, mm0
2864             pand mm7, mm5          // Only pbv bytes < 0 in mm0
2865             psubw mm4, mm0
2866             psubw mm5, mm7
2867             pxor mm0, mm0
2868             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2869             pand mm0, mm6          // Only pav bytes < 0 in mm7
2870             psubw mm5, mm7
2871             psubw mm6, mm0
2872             //  test pa <= pb
2873             movq mm7, mm4
2874             psubw mm6, mm0
2875             pcmpgtw mm7, mm5       // pa > pb?
2876             movq mm0, mm7
2877             // use mm7 mask to merge pa & pb
2878             pand mm5, mm7
2879             // use mm0 mask copy to merge a & b
2880             pand mm2, mm0
2881             pandn mm7, mm4
2882             pandn mm0, mm1
2883             paddw mm7, mm5
2884             paddw mm0, mm2
2885             //  test  ((pa <= pb)? pa:pb) <= pc
2886             pcmpgtw mm7, mm6       // pab > pc?
2887             pxor mm1, mm1
2888             pand mm3, mm7
2889             pandn mm7, mm0
2890             paddw mm7, mm3
2891             pxor mm0, mm0
2892             packuswb mm7, mm1
2893             movq mm3, [esi + ebx]      // load c=Prior(x-bpp)
2894             pand mm7, ActiveMask
2895             movq mm2, mm3              // load b=Prior(x) step 1
2896             paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
2897             punpcklbw mm3, mm0         // Unpack High bytes of c
2898             movq [edi + ebx], mm7      // write back updated value
2899             movq mm1, mm7              // Now mm1 will be used as Raw(x-bpp)
2900             // Do second set of 4 bytes
2901             punpckhbw mm2, mm0         // Unpack Low bytes of b
2902             punpcklbw mm1, mm0         // Unpack Low bytes of a
2903             // pav = p - a = (a + b - c) - a = b - c
2904             movq mm4, mm2
2905             // pbv = p - b = (a + b - c) - b = a - c
2906             movq mm5, mm1
2907             psubw mm4, mm3
2908             pxor mm7, mm7
2909             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2910             movq mm6, mm4
2911             psubw mm5, mm3
2912             // pa = abs(p-a) = abs(pav)
2913             // pb = abs(p-b) = abs(pbv)
2914             // pc = abs(p-c) = abs(pcv)
2915             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2916             paddw mm6, mm5
2917             pand mm0, mm4          // Only pav bytes < 0 in mm7
2918             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2919             psubw mm4, mm0
2920             pand mm7, mm5          // Only pbv bytes < 0 in mm0
2921             psubw mm4, mm0
2922             psubw mm5, mm7
2923             pxor mm0, mm0
2924             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2925             pand mm0, mm6          // Only pav bytes < 0 in mm7
2926             psubw mm5, mm7
2927             psubw mm6, mm0
2928             //  test pa <= pb
2929             movq mm7, mm4
2930             psubw mm6, mm0
2931             pcmpgtw mm7, mm5       // pa > pb?
2932             movq mm0, mm7
2933             // use mm7 mask to merge pa & pb
2934             pand mm5, mm7
2935             // use mm0 mask copy to merge a & b
2936             pand mm2, mm0
2937             pandn mm7, mm4
2938             pandn mm0, mm1
2939             paddw mm7, mm5
2940             paddw mm0, mm2
2941             //  test  ((pa <= pb)? pa:pb) <= pc
2942             pcmpgtw mm7, mm6       // pab > pc?
2943             pxor mm1, mm1
2944             pand mm3, mm7
2945             pandn mm7, mm0
2946             pxor mm1, mm1
2947             paddw mm7, mm3
2948             pxor mm0, mm0
2949             // Step ebx to next set of 8 bytes and repeat loop til done
2950             add ebx, 8
2951             packuswb mm1, mm7
2952             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
2953             cmp ebx, MMXLength
2954             movq [edi + ebx - 8], mm1      // write back updated value
2955                                 // mm1 will be used as Raw(x-bpp) next loop
2956             jb dpth4lp
2957          } // end _asm block
2958       }
2959       break;
2960       case 8:                          // bpp == 8
2961       {
2962          ActiveMask.use  = 0x00000000ffffffff;
2963          _asm {
2964             mov ebx, diff
2965             mov edi, row
2966             mov esi, prev_row
2967             pxor mm0, mm0
2968             // PRIME the pump (load the first Raw(x-bpp) data set
2969             movq mm1, [edi+ebx-8]      // Only time should need to read
2970                                        //  a=Raw(x-bpp) bytes
2971 dpth8lp:
2972             // Do first set of 4 bytes
2973             movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
2974             punpcklbw mm1, mm0         // Unpack Low bytes of a
2975             movq mm2, [esi + ebx]      // load b=Prior(x)
2976             punpcklbw mm2, mm0         // Unpack Low bytes of b
2977             // pav = p - a = (a + b - c) - a = b - c
2978             movq mm4, mm2
2979             punpcklbw mm3, mm0         // Unpack Low bytes of c
2980             // pbv = p - b = (a + b - c) - b = a - c
2981             movq mm5, mm1
2982             psubw mm4, mm3
2983             pxor mm7, mm7
2984             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2985             movq mm6, mm4
2986             psubw mm5, mm3
2987             // pa = abs(p-a) = abs(pav)
2988             // pb = abs(p-b) = abs(pbv)
2989             // pc = abs(p-c) = abs(pcv)
2990             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2991             paddw mm6, mm5
2992             pand mm0, mm4          // Only pav bytes < 0 in mm7
2993             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2994             psubw mm4, mm0
2995             pand mm7, mm5          // Only pbv bytes < 0 in mm0
2996             psubw mm4, mm0
2997             psubw mm5, mm7
2998             pxor mm0, mm0
2999             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
3000             pand mm0, mm6          // Only pav bytes < 0 in mm7
3001             psubw mm5, mm7
3002             psubw mm6, mm0
3003             //  test pa <= pb
3004             movq mm7, mm4
3005             psubw mm6, mm0
3006             pcmpgtw mm7, mm5       // pa > pb?
3007             movq mm0, mm7
3008             // use mm7 mask to merge pa & pb
3009             pand mm5, mm7
3010             // use mm0 mask copy to merge a & b
3011             pand mm2, mm0
3012             pandn mm7, mm4
3013             pandn mm0, mm1
3014             paddw mm7, mm5
3015             paddw mm0, mm2
3016             //  test  ((pa <= pb)? pa:pb) <= pc
3017             pcmpgtw mm7, mm6       // pab > pc?
3018             pxor mm1, mm1
3019             pand mm3, mm7
3020             pandn mm7, mm0
3021             paddw mm7, mm3
3022             pxor mm0, mm0
3023             packuswb mm7, mm1
3024             movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
3025             pand mm7, ActiveMask
3026             movq mm2, [esi + ebx]    // load b=Prior(x)
3027             paddb mm7, [edi + ebx]   // add Paeth predictor with Raw(x)
3028             punpckhbw mm3, mm0       // Unpack High bytes of c
3029             movq [edi + ebx], mm7    // write back updated value
3030             movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
3031 
3032             // Do second set of 4 bytes
3033             punpckhbw mm2, mm0       // Unpack High bytes of b
3034             punpckhbw mm1, mm0       // Unpack High bytes of a
3035             // pav = p - a = (a + b - c) - a = b - c
3036             movq mm4, mm2
3037             // pbv = p - b = (a + b - c) - b = a - c
3038             movq mm5, mm1
3039             psubw mm4, mm3
3040             pxor mm7, mm7
3041             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3042             movq mm6, mm4
3043             psubw mm5, mm3
3044             // pa = abs(p-a) = abs(pav)
3045             // pb = abs(p-b) = abs(pbv)
3046             // pc = abs(p-c) = abs(pcv)
3047             pcmpgtw mm0, mm4       // Create mask pav bytes < 0
3048             paddw mm6, mm5
3049             pand mm0, mm4          // Only pav bytes < 0 in mm7
3050             pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
3051             psubw mm4, mm0
3052             pand mm7, mm5          // Only pbv bytes < 0 in mm0
3053             psubw mm4, mm0
3054             psubw mm5, mm7
3055             pxor mm0, mm0
3056             pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
3057             pand mm0, mm6          // Only pav bytes < 0 in mm7
3058             psubw mm5, mm7
3059             psubw mm6, mm0
3060             //  test pa <= pb
3061             movq mm7, mm4
3062             psubw mm6, mm0
3063             pcmpgtw mm7, mm5       // pa > pb?
3064             movq mm0, mm7
3065             // use mm7 mask to merge pa & pb
3066             pand mm5, mm7
3067             // use mm0 mask copy to merge a & b
3068             pand mm2, mm0
3069             pandn mm7, mm4
3070             pandn mm0, mm1
3071             paddw mm7, mm5
3072             paddw mm0, mm2
3073             //  test  ((pa <= pb)? pa:pb) <= pc
3074             pcmpgtw mm7, mm6       // pab > pc?
3075             pxor mm1, mm1
3076             pand mm3, mm7
3077             pandn mm7, mm0
3078             pxor mm1, mm1
3079             paddw mm7, mm3
3080             pxor mm0, mm0
3081             // Step ex to next set of 8 bytes and repeat loop til done
3082             add ebx, 8
3083             packuswb mm1, mm7
3084             paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
3085             cmp ebx, MMXLength
3086             movq [edi + ebx - 8], mm1      // write back updated value
3087                             // mm1 will be used as Raw(x-bpp) next loop
3088             jb dpth8lp
3089          } // end _asm block
3090       }
3091       break;
3092 
3093       case 1:                // bpp = 1
3094       case 2:                // bpp = 2
3095       default:               // bpp > 8
3096       {
3097          _asm {
3098             mov ebx, diff
3099             cmp ebx, FullLength
3100             jnb dpthdend
3101             mov edi, row
3102             mov esi, prev_row
3103             // Do Paeth decode for remaining bytes
3104             mov edx, ebx
3105             xor ecx, ecx        // zero ecx before using cl & cx in loop below
3106             sub edx, bpp        // Set edx = ebx - bpp
3107 dpthdlp:
3108             xor eax, eax
3109             // pav = p - a = (a + b - c) - a = b - c
3110             mov al, [esi + ebx]        // load Prior(x) into al
3111             mov cl, [esi + edx]        // load Prior(x-bpp) into cl
3112             sub eax, ecx                 // subtract Prior(x-bpp)
3113             mov patemp, eax                 // Save pav for later use
3114             xor eax, eax
3115             // pbv = p - b = (a + b - c) - b = a - c
3116             mov al, [edi + edx]        // load Raw(x-bpp) into al
3117             sub eax, ecx                 // subtract Prior(x-bpp)
3118             mov ecx, eax
3119             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3120             add eax, patemp                 // pcv = pav + pbv
3121             // pc = abs(pcv)
3122             test eax, 0x80000000
3123             jz dpthdpca
3124             neg eax                     // reverse sign of neg values
3125 dpthdpca:
3126             mov pctemp, eax             // save pc for later use
3127             // pb = abs(pbv)
3128             test ecx, 0x80000000
3129             jz dpthdpba
3130             neg ecx                     // reverse sign of neg values
3131 dpthdpba:
3132             mov pbtemp, ecx             // save pb for later use
3133             // pa = abs(pav)
3134             mov eax, patemp
3135             test eax, 0x80000000
3136             jz dpthdpaa
3137             neg eax                     // reverse sign of neg values
3138 dpthdpaa:
3139             mov patemp, eax             // save pa for later use
3140             // test if pa <= pb
3141             cmp eax, ecx
3142             jna dpthdabb
3143             // pa > pb; now test if pb <= pc
3144             cmp ecx, pctemp
3145             jna dpthdbbc
3146             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3147             mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3148             jmp dpthdpaeth
3149 dpthdbbc:
3150             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3151             mov cl, [esi + ebx]        // load Prior(x) into cl
3152             jmp dpthdpaeth
3153 dpthdabb:
3154             // pa <= pb; now test if pa <= pc
3155             cmp eax, pctemp
3156             jna dpthdabc
3157             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3158             mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3159             jmp dpthdpaeth
3160 dpthdabc:
3161             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3162             mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3163 dpthdpaeth:
3164             inc ebx
3165             inc edx
3166             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3167             add [edi + ebx - 1], cl
3168             cmp ebx, FullLength
3169             jb dpthdlp
3170 dpthdend:
3171          } // end _asm block
3172       }
3173       return;                   // No need to go further with this one
3174    }                         // end switch ( bpp )
3175    _asm
3176    {
3177          // MMX acceleration complete now do clean-up
3178          // Check if any remaining bytes left to decode
3179          mov ebx, MMXLength
3180          cmp ebx, FullLength
3181          jnb dpthend
3182          mov edi, row
3183          mov esi, prev_row
3184          // Do Paeth decode for remaining bytes
3185          mov edx, ebx
3186          xor ecx, ecx         // zero ecx before using cl & cx in loop below
3187          sub edx, bpp         // Set edx = ebx - bpp
3188 dpthlp2:
3189          xor eax, eax
3190          // pav = p - a = (a + b - c) - a = b - c
3191          mov al, [esi + ebx]  // load Prior(x) into al
3192          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3193          sub eax, ecx         // subtract Prior(x-bpp)
3194          mov patemp, eax      // Save pav for later use
3195          xor eax, eax
3196          // pbv = p - b = (a + b - c) - b = a - c
3197          mov al, [edi + edx]  // load Raw(x-bpp) into al
3198          sub eax, ecx         // subtract Prior(x-bpp)
3199          mov ecx, eax
3200          // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3201          add eax, patemp      // pcv = pav + pbv
3202          // pc = abs(pcv)
3203          test eax, 0x80000000
3204          jz dpthpca2
3205          neg eax              // reverse sign of neg values
3206 dpthpca2:
3207          mov pctemp, eax      // save pc for later use
3208          // pb = abs(pbv)
3209          test ecx, 0x80000000
3210          jz dpthpba2
3211          neg ecx              // reverse sign of neg values
3212 dpthpba2:
3213          mov pbtemp, ecx      // save pb for later use
3214          // pa = abs(pav)
3215          mov eax, patemp
3216          test eax, 0x80000000
3217          jz dpthpaa2
3218          neg eax              // reverse sign of neg values
3219 dpthpaa2:
3220          mov patemp, eax      // save pa for later use
3221          // test if pa <= pb
3222          cmp eax, ecx
3223          jna dpthabb2
3224          // pa > pb; now test if pb <= pc
3225          cmp ecx, pctemp
3226          jna dpthbbc2
3227          // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3228          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3229          jmp dpthpaeth2
3230 dpthbbc2:
3231          // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3232          mov cl, [esi + ebx]        // load Prior(x) into cl
3233          jmp dpthpaeth2
3234 dpthabb2:
3235          // pa <= pb; now test if pa <= pc
3236          cmp eax, pctemp
3237          jna dpthabc2
3238          // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3239          mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3240          jmp dpthpaeth2
3241 dpthabc2:
3242          // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3243          mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3244 dpthpaeth2:
3245          inc ebx
3246          inc edx
3247          // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3248          add [edi + ebx - 1], cl
3249          cmp ebx, FullLength
3250          jb dpthlp2
3251 dpthend:
3252          emms             // End MMX instructions; prep for possible FP instrs.
3253    } // end _asm block
3254 }
3255 
3256 // Optimized code for PNG Sub filter decoder
     //
     // Undoes the Sub filter in place: each byte of row from offset bpp onward
     // has the byte bpp positions to its left added to it (byte-wise, wrapping).
     //   row_info - read for pixel_depth and rowbytes
     //   row      - row buffer, modified in place
     // Phases:
     //   1. a byte loop over the first diff bytes, ending where rp + ebx is
     //      8-byte aligned;
     //   2. an MMX loop chosen by bpp that handles 8 bytes per step up to
     //      MMXLength;
     //   3. a byte loop from MMXLength to FullLength, followed by emms.
     // NOTE(review): the phase-1 loop and every MMX loop test their bound only
     // after the first pass, so a row shorter than the code expects would be
     // modified past FullLength; presumably callers only dispatch here for
     // sufficiently long rows -- confirm.
3257 void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info,png_bytep row)3258 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3259 {
3260    //int test;
3261    int bpp;                  // bytes per pixel (pixel_depth rounded up)
3262    png_uint_32 FullLength;   // rowbytes - bpp: number of bytes to update
3263    png_uint_32 MMXLength;    // offset at which the 8-byte MMX loops stop
3264    int diff;                 // bytes handled by the alignment loop (8..15)
3265 
3266    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3267    FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
        // Phase 1: byte-wise decode up to the alignment boundary, then compute
        // MMXLength.  Throughout, ebx is the offset shared by lp (esi) and
        // rp (edi).
3268    _asm {
3269         mov edi, row
3270         mov esi, edi               // lp = row
3271         add edi, bpp               // rp = row + bpp
3272         xor eax, eax
3273         // get # of bytes to alignment
3274         mov diff, edi               // take start of row
3275         add diff, 0xf               // add 7 + 8 to incr past
3276                                         // alignment boundary
3277         xor ebx, ebx
3278         and diff, 0xfffffff8        // mask to alignment boundary
3279         sub diff, edi               // subtract from start ==> value
3280                                         //  ebx at alignment
             // ((rp + 15) & ~7) - rp is always in 8..15, so the jz below is
             // never taken and the byte loop always runs.
3281         jz dsubgo
3282         // fix alignment
3283 dsublp1:
3284         mov al, [esi+ebx]
3285         add [edi+ebx], al
3286         inc ebx
3287         cmp ebx, diff
3288         jb dsublp1
3289 dsubgo:
             // MMXLength = FullLength - ((FullLength - ebx) & 7), i.e. the
             // span from ebx to MMXLength is a multiple of 8 bytes.
3290         mov ecx, FullLength
3291         mov edx, ecx
3292         sub edx, ebx                  // subtract alignment fix
3293         and edx, 0x00000007           // calc bytes over mult of 8
3294         sub ecx, edx                  // drop over bytes from length
3295         mov MMXLength, ecx
3296    } // end _asm block
3297 
3298    // Now do the math for the rest of the row
3299    switch ( bpp )
3300    {
3301         case 3:
3302         {
              // Mask selects bytes 3..5 of a quadword (the 2nd 3-byte group);
              // shifting it left by 24 bits leaves bytes 6..7 (the partial
              // 3rd group) in mm6.
3303          ActiveMask.use  = 0x0000ffffff000000;
3304          ShiftBpp.use = 24;       // == 3 * 8
3305          ShiftRem.use  = 40;      // == 64 - 24
3306          _asm {
3307             mov edi, row
3308             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
3309             mov esi, edi              // lp = row
3310             add edi, bpp          // rp = row + bpp
3311             movq mm6, mm7
3312             mov ebx, diff
3313             psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
3314                                   // byte group
3315             // PRIME the pump (load the first Raw(x-bpp) data set
3316             movq mm1, [edi+ebx-8]
3317 dsub3lp:
3318             psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
3319                           // no need for mask; shift clears inactive bytes
3320             // Add 1st active group
3321             movq mm0, [edi+ebx]
3322             paddb mm0, mm1
3323             // Add 2nd active group
3324             movq mm1, mm0         // mov updated Raws to mm1
3325             psllq mm1, ShiftBpp   // shift data to position correctly
3326             pand mm1, mm7         // mask to use only 2nd active group
3327             paddb mm0, mm1
3328             // Add 3rd active group
3329             movq mm1, mm0         // mov updated Raws to mm1
3330             psllq mm1, ShiftBpp   // shift data to position correctly
3331             pand mm1, mm6         // mask to use only 3rd active group
3332             add ebx, 8
3333             paddb mm0, mm1
3334             cmp ebx, MMXLength    // flags from this cmp are consumed by jb below
3335             movq [edi+ebx-8], mm0     // Write updated Raws back to array
3336             // Prep for doing 1st add at top of loop
3337             movq mm1, mm0
3338             jb dsub3lp
3339          } // end _asm block
3340       }
3341       break;
3342 
3343       case 1:
3344       {
3345          // Placed here just in case this is a duplicate of the
3346          // non-MMX code for the SUB filter in png_read_filter_row below
3347          //
3348          //         png_bytep rp;
3349          //         png_bytep lp;
3350          //         png_uint_32 i;
3351          //         bpp = (row_info->pixel_depth + 7) >> 3;
3352          //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3353          //            i < row_info->rowbytes; i++, rp++, lp++)
3354          //      {
3355          //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3356          //      }
              // Plain byte loop from diff to FullLength; no MMX instruction is
              // executed on this path, and the function returns right after it
              // without reaching the trailing clean-up block.
3357          _asm {
3358             mov ebx, diff
3359             mov edi, row
3360             cmp ebx, FullLength
3361             jnb dsub1end
3362             mov esi, edi          // lp = row
3363             xor eax, eax
3364             add edi, bpp      // rp = row + bpp
3365 dsub1lp:
3366             mov al, [esi+ebx]
3367             add [edi+ebx], al
3368             inc ebx
3369             cmp ebx, FullLength
3370             jb dsub1lp
3371 dsub1end:
3372          } // end _asm block
3373       }
3374       return;
3375 
3376       case 6:
3377       case 7:
3378       case 4:
3379       case 5:
3380       {
              // Shift counts are derived from bpp; two adds per quadword are
              // performed (previous-quadword carry-in, then one left-shifted
              // self-add).
3381          ShiftBpp.use = bpp << 3;
3382          ShiftRem.use = 64 - ShiftBpp.use;
3383          _asm {
3384             mov edi, row
3385             mov ebx, diff
3386             mov esi, edi               // lp = row
3387             add edi, bpp           // rp = row + bpp
3388             // PRIME the pump (load the first Raw(x-bpp) data set
3389             movq mm1, [edi+ebx-8]
3390 dsub4lp:
3391             psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3392                           // no need for mask; shift clears inactive bytes
3393             movq mm0, [edi+ebx]
3394             paddb mm0, mm1
3395             // Add 2nd active group
3396             movq mm1, mm0          // mov updated Raws to mm1
3397             psllq mm1, ShiftBpp    // shift data to position correctly
3398                                    // there is no need for any mask
3399                                    // since shift clears inactive bits/bytes
3400             add ebx, 8
3401             paddb mm0, mm1
3402             cmp ebx, MMXLength
3403             movq [edi+ebx-8], mm0
3404             movq mm1, mm0          // Prep for doing 1st add at top of loop
3405             jb dsub4lp
3406          } // end _asm block
3407       }
3408       break;
3409 
3410       case 2:
3411       {
              // Mask selects bytes 2..3 (2nd pixel); mm6 and mm5 are the same
              // mask shifted left by 16 and 32 bits (3rd and 4th pixels).
3412          ActiveMask.use  = 0x00000000ffff0000;
3413          ShiftBpp.use = 16;       // == 2 * 8
3414          ShiftRem.use = 48;       // == 64 - 16
3415          _asm {
3416             movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
3417             mov ebx, diff
3418             movq mm6, mm7
3419             mov edi, row
3420             psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active
3421                                     //  byte group
3422             mov esi, edi            // lp = row
3423             movq mm5, mm6
3424             add edi, bpp            // rp = row + bpp
3425             psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active
3426                                     //  byte group
3427             // PRIME the pump (load the first Raw(x-bpp) data set
3428             movq mm1, [edi+ebx-8]
3429 dsub2lp:
3430             // Add 1st active group
3431             psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
3432                                     // no need for mask; shift clears inactive
3433                                     //  bytes
3434             movq mm0, [edi+ebx]
3435             paddb mm0, mm1
3436             // Add 2nd active group
3437             movq mm1, mm0           // mov updated Raws to mm1
3438             psllq mm1, ShiftBpp     // shift data to position correctly
3439             pand mm1, mm7           // mask to use only 2nd active group
3440             paddb mm0, mm1
3441             // Add 3rd active group
3442             movq mm1, mm0           // mov updated Raws to mm1
3443             psllq mm1, ShiftBpp     // shift data to position correctly
3444             pand mm1, mm6           // mask to use only 3rd active group
3445             paddb mm0, mm1
3446             // Add 4th active group
3447             movq mm1, mm0           // mov updated Raws to mm1
3448             psllq mm1, ShiftBpp     // shift data to position correctly
3449             pand mm1, mm5           // mask to use only 4th active group
3450             add ebx, 8
3451             paddb mm0, mm1
3452             cmp ebx, MMXLength
3453             movq [edi+ebx-8], mm0   // Write updated Raws back to array
3454             movq mm1, mm0           // Prep for doing 1st add at top of loop
3455             jb dsub2lp
3456          } // end _asm block
3457       }
3458       break;
3459       case 8:
3460       {
              // With bpp == 8 the previous quadword is exactly Raw(x-bpp), so
              // each quadword is just added to the one before it.
3461          _asm {
3462             mov edi, row
3463             mov ebx, diff
3464             mov esi, edi            // lp = row
3465             add edi, bpp            // rp = row + bpp
3466             mov ecx, MMXLength
3467             movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
3468                                     // Raw(x-bpp) data set
3469             and ecx, 0x0000003f     // calc bytes over mult of 64
                 // NOTE(review): ecx now holds MMXLength & 0x3f (< 64) and is
                 // the bound tested at the bottom of dsub8lp.  After the first
                 // pass ebx >= 64, so the unrolled loop runs exactly once and
                 // dsub8lpA handles the rest 8 bytes at a time.  The first pass
                 // also touches 64 bytes unconditionally.
3470 dsub8lp:
3471             movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
3472             paddb mm0, mm7
3473             movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
3474             movq [edi+ebx], mm0    // Write Raw(x) for 1st 8 bytes
3475                                    // Now mm0 will be used as Raw(x-bpp) for
3476                                    // the 2nd group of 8 bytes.  This will be
3477                                    // repeated for each group of 8 bytes with
3478                                    // the 8th group being used as the Raw(x-bpp)
3479                                    // for the 1st group of the next loop.
3480             paddb mm1, mm0
3481             movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
3482             movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
3483             paddb mm2, mm1
3484             movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
3485             movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
3486             paddb mm3, mm2
3487             movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
3488             movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
3489             paddb mm4, mm3
3490             movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
3491             movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
3492             paddb mm5, mm4
3493             movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
3494             movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
3495             paddb mm6, mm5
3496             movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
3497             movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
3498             add ebx, 64
3499             paddb mm7, mm6
3500             cmp ebx, ecx
3501             movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
3502             jb dsub8lp
3503             cmp ebx, MMXLength
3504             jnb dsub8lt8
3505 dsub8lpA:
3506             movq mm0, [edi+ebx]
3507             add ebx, 8
3508             paddb mm0, mm7
3509             cmp ebx, MMXLength
3510             movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
3511             movq mm7, mm0           // Move calculated Raw(x) data to mm7 to
3512                                     // be the new Raw(x-bpp) for the next loop
3513             jb dsub8lpA
3514 dsub8lt8:
3515          } // end _asm block
3516       }
3517       break;
3518 
3519       default:                // bpp greater than 8 bytes
3520       {
              // Each step adds the 8 bytes at lp + ebx to the 8 bytes at
              // rp + ebx; no shifting or carry register is used here.
3521          _asm {
3522             mov ebx, diff
3523             mov edi, row
3524             mov esi, edi           // lp = row
3525             add edi, bpp           // rp = row + bpp
3526 dsubAlp:
3527             movq mm0, [edi+ebx]
3528             movq mm1, [esi+ebx]
3529             add ebx, 8
3530             paddb mm0, mm1
3531             cmp ebx, MMXLength
3532             movq [edi+ebx-8], mm0  // mov does not affect flags; -8 to offset
3533                                    //  add ebx
3534             jb dsubAlp
3535          } // end _asm block
3536       }
3537       break;
3538 
3539    } // end switch ( bpp )
3540 
        // Phase 3: byte-wise decode of the bytes between MMXLength and
        // FullLength (if any), then leave MMX state with emms.
3541    _asm {
3542         mov ebx, MMXLength
3543         mov edi, row
3544         cmp ebx, FullLength
3545         jnb dsubend
3546         mov esi, edi               // lp = row
3547         xor eax, eax
3548         add edi, bpp               // rp = row + bpp
3549 dsublp2:
3550         mov al, [esi+ebx]
3551         add [edi+ebx], al
3552         inc ebx
3553         cmp ebx, FullLength
3554         jb dsublp2
3555 dsubend:
3556         emms             // End MMX instructions; prep for possible FP instrs.
3557    } // end _asm block
3558 }
3559 
3560 // Optimized code for PNG Up filter decoder
     //
     // Undoes the Up filter in place: each byte of row has the byte at the
     // same offset in prev_row added to it (byte-wise, wrapping).
     //   row_info - read for rowbytes only
     //   row      - row buffer, modified in place
     //   prev_row - read-only source of the bytes to add
     // Phases: a byte loop until row + ebx is 8-byte aligned, a loop unrolled
     // to 64 bytes per pass, a loop of 8 bytes per pass, and a final byte
     // loop for the last < 8 bytes.  ebx is the running offset into both rows.
     // NOTE(review): the 64-byte loop tests its bound only after the first
     // pass, so it always processes at least 64 bytes; presumably callers only
     // dispatch here for sufficiently long rows -- confirm.
3561 void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info,png_bytep row,png_bytep prev_row)3562 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3563    png_bytep prev_row)
3564 {
3565    png_uint_32 len;
3566    len  = row_info->rowbytes;       // # of bytes to filter
3567    _asm {
3568       mov edi, row
3569       // get # of bytes to alignment
3570       mov ecx, edi
3571       xor ebx, ebx
3572       add ecx, 0x7
3573       xor eax, eax
3574       and ecx, 0xfffffff8
3575       mov esi, prev_row
3576       sub ecx, edi             // ecx = 0..7 bytes needed to align row
3577       jz dupgo
3578       // fix alignment
3579 duplp1:
3580       mov al, [edi+ebx]
3581       add al, [esi+ebx]
3582       inc ebx
3583       cmp ebx, ecx
3584       mov [edi + ebx-1], al  // mov does not affect flags; -1 to offset inc ebx
3585       jb duplp1
3586 dupgo:
           // ecx = end offset of the 64-byte loop, edx = bytes left after it
3587       mov ecx, len
3588       mov edx, ecx
3589       sub edx, ebx                  // subtract alignment fix
3590       and edx, 0x0000003f           // calc bytes over mult of 64
3591       sub ecx, edx                  // drop over bytes from length
3592       // Unrolled loop - use all MMX registers and interleave to reduce
3593       // number of branch instructions (loops) and reduce partial stalls
3594 duploop:
3595       movq mm1, [esi+ebx]
3596       movq mm0, [edi+ebx]
3597       movq mm3, [esi+ebx+8]
3598       paddb mm0, mm1
3599       movq mm2, [edi+ebx+8]
3600       movq [edi+ebx], mm0
3601       paddb mm2, mm3
3602       movq mm5, [esi+ebx+16]
3603       movq [edi+ebx+8], mm2
3604       movq mm4, [edi+ebx+16]
3605       movq mm7, [esi+ebx+24]
3606       paddb mm4, mm5
3607       movq mm6, [edi+ebx+24]
3608       movq [edi+ebx+16], mm4
3609       paddb mm6, mm7
3610       movq mm1, [esi+ebx+32]
3611       movq [edi+ebx+24], mm6
3612       movq mm0, [edi+ebx+32]
3613       movq mm3, [esi+ebx+40]
3614       paddb mm0, mm1
3615       movq mm2, [edi+ebx+40]
3616       movq [edi+ebx+32], mm0
3617       paddb mm2, mm3
3618       movq mm5, [esi+ebx+48]
3619       movq [edi+ebx+40], mm2
3620       movq mm4, [edi+ebx+48]
3621       movq mm7, [esi+ebx+56]
3622       paddb mm4, mm5
3623       movq mm6, [edi+ebx+56]
3624       movq [edi+ebx+48], mm4
3625       add ebx, 64
3626       paddb mm6, mm7
3627       cmp ebx, ecx
3628       movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3629                                      // -8 to offset add ebx
3630       jb duploop
3631 
3632       cmp edx, 0                     // Test for bytes over mult of 64
3633       jz dupend
3634 
3635 
3636       // 2 lines added by lcreeve at netins.net
3637       // (mail 11 Jul 98 in png-implement list)
3638       cmp edx, 8 //test for less than 8 bytes
3639       jb duplt8
3640 
3641 
           // Re-split the remainder: ecx = end offset of the 8-byte loop,
           // edx = trailing bytes (< 8) for the final byte loop.
3642       add ecx, edx
3643       and edx, 0x00000007           // calc bytes over mult of 8
3644       sub ecx, edx                  // drop over bytes from length
3645       jz duplt8
3646       // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3647 duplpA:
3648       movq mm1, [esi+ebx]
3649       movq mm0, [edi+ebx]
3650       add ebx, 8
3651       paddb mm0, mm1
3652       cmp ebx, ecx
3653       movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3654       jb duplpA
3655       cmp edx, 0            // Test for bytes over mult of 8
3656       jz dupend
3657 duplt8:
3658       xor eax, eax
3659       add ecx, edx          // move over byte count into counter
3660       // Loop using x86 registers to update remaining bytes
3661 duplp2:
3662       mov al, [edi + ebx]
3663       add al, [esi + ebx]
3664       inc ebx
3665       cmp ebx, ecx
3666       mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3667       jb duplp2
3668 dupend:
3669       // Conversion of filtered row completed
3670       emms          // End MMX instructions; prep for possible FP instrs.
3671    } // end _asm block
3672 }
3673 
3674 
3675 // Optimized png_read_filter_row routines
3676 void /* PRIVATE */
png_read_filter_row(png_structp png_ptr,png_row_infop row_info,png_bytep row,png_bytep prev_row,int filter)3677 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3678    row, png_bytep prev_row, int filter)
3679 {
3680 #ifdef PNG_DEBUG
3681    char filnm[10];
3682 #endif
3683 
3684    if (mmx_supported == 2) {
3685 #if !defined(PNG_1_0_X)
3686        /* this should have happened in png_init_mmx_flags() already */
3687        png_warning(png_ptr, "asm_flags may not have been initialized");
3688 #endif
3689        png_mmx_support();
3690    }
3691 
3692 #ifdef PNG_DEBUG
3693    png_debug(1, "in png_read_filter_row\n");
3694    switch (filter)
3695    {
3696       case 0: sprintf(filnm, "none");
3697          break;
3698 #if !defined(PNG_1_0_X)
3699       case 1: sprintf(filnm, "sub-%s",
3700         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3701          break;
3702       case 2: sprintf(filnm, "up-%s",
3703         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3704          break;
3705       case 3: sprintf(filnm, "avg-%s",
3706         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3707          break;
3708       case 4: sprintf(filnm, "Paeth-%s",
3709         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3710          break;
3711 #else
3712       case 1: sprintf(filnm, "sub");
3713          break;
3714       case 2: sprintf(filnm, "up");
3715          break;
3716       case 3: sprintf(filnm, "avg");
3717          break;
3718       case 4: sprintf(filnm, "Paeth");
3719          break;
3720 #endif
3721       default: sprintf(filnm, "unknw");
3722          break;
3723    }
3724    png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3725    png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3726       (int)((row_info->pixel_depth + 7) >> 3));
3727    png_debug1(0,"len=%8d, ", row_info->rowbytes);
3728 #endif /* PNG_DEBUG */
3729 
3730    switch (filter)
3731    {
3732       case PNG_FILTER_VALUE_NONE:
3733          break;
3734 
3735       case PNG_FILTER_VALUE_SUB:
3736       {
3737 #if !defined(PNG_1_0_X)
3738          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3739              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3740              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3741 #else
3742          if (mmx_supported)
3743 #endif
3744          {
3745             png_read_filter_row_mmx_sub(row_info, row);
3746          }
3747          else
3748          {
3749             png_uint_32 i;
3750             png_uint_32 istop = row_info->rowbytes;
3751             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3752             png_bytep rp = row + bpp;
3753             png_bytep lp = row;
3754 
3755             for (i = bpp; i < istop; i++)
3756             {
3757                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3758                rp++;
3759             }
3760          }
3761          break;
3762       }
3763 
3764       case PNG_FILTER_VALUE_UP:
3765       {
3766 #if !defined(PNG_1_0_X)
3767          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3768              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3769              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3770 #else
3771          if (mmx_supported)
3772 #endif
3773          {
3774             png_read_filter_row_mmx_up(row_info, row, prev_row);
3775          }
3776          else
3777          {
3778             png_uint_32 i;
3779             png_uint_32 istop = row_info->rowbytes;
3780             png_bytep rp = row;
3781             png_bytep pp = prev_row;
3782 
3783             for (i = 0; i < istop; ++i)
3784             {
3785                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3786                rp++;
3787             }
3788          }
3789          break;
3790       }
3791 
3792       case PNG_FILTER_VALUE_AVG:
3793       {
3794 #if !defined(PNG_1_0_X)
3795          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3796              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3797              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3798 #else
3799          if (mmx_supported)
3800 #endif
3801          {
3802             png_read_filter_row_mmx_avg(row_info, row, prev_row);
3803          }
3804          else
3805          {
3806             png_uint_32 i;
3807             png_bytep rp = row;
3808             png_bytep pp = prev_row;
3809             png_bytep lp = row;
3810             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3811             png_uint_32 istop = row_info->rowbytes - bpp;
3812 
3813             for (i = 0; i < bpp; i++)
3814             {
3815                *rp = (png_byte)(((int)(*rp) +
3816                   ((int)(*pp++) >> 1)) & 0xff);
3817                rp++;
3818             }
3819 
3820             for (i = 0; i < istop; i++)
3821             {
3822                *rp = (png_byte)(((int)(*rp) +
3823                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3824                rp++;
3825             }
3826          }
3827          break;
3828       }
3829 
3830       case PNG_FILTER_VALUE_PAETH:
3831       {
3832 #if !defined(PNG_1_0_X)
3833          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3834              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3835              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3836 #else
3837          if (mmx_supported)
3838 #endif
3839          {
3840             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3841          }
3842          else
3843          {
3844             png_uint_32 i;
3845             png_bytep rp = row;
3846             png_bytep pp = prev_row;
3847             png_bytep lp = row;
3848             png_bytep cp = prev_row;
3849             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3850             png_uint_32 istop=row_info->rowbytes - bpp;
3851 
3852             for (i = 0; i < bpp; i++)
3853             {
3854                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3855                rp++;
3856             }
3857 
3858             for (i = 0; i < istop; i++)   // use leftover rp,pp
3859             {
3860                int a, b, c, pa, pb, pc, p;
3861 
3862                a = *lp++;
3863                b = *pp++;
3864                c = *cp++;
3865 
3866                p = b - c;
3867                pc = a - c;
3868 
3869 #ifdef PNG_USE_ABS
3870                pa = abs(p);
3871                pb = abs(pc);
3872                pc = abs(p + pc);
3873 #else
3874                pa = p < 0 ? -p : p;
3875                pb = pc < 0 ? -pc : pc;
3876                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3877 #endif
3878 
3879                /*
3880                   if (pa <= pb && pa <= pc)
3881                      p = a;
3882                   else if (pb <= pc)
3883                      p = b;
3884                   else
3885                      p = c;
3886                 */
3887 
3888                p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3889 
3890                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3891                rp++;
3892             }
3893          }
3894          break;
3895       }
3896 
3897       default:
3898          png_warning(png_ptr, "Ignoring bad row filter type");
3899          *row=0;
3900          break;
3901    }
3902 }
3903 
3904 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */
3905