/* filter_vsx_intrinsics.c - PowerPC optimised filter functions
 *
 * Copyright (c) 2017 Glenn Randers-Pehrson
 * Written by Vadim Barkov, 2017.
 * Last changed in libpng 1.6.29 [March 16, 2017]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
#include <stdio.h>
#include <stdint.h>
#include "../pngpriv.h"

#ifdef PNG_READ_SUPPORTED

/* This code requires -maltivec and -mvsx on the command line: */
#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <altivec.h>

#if PNG_POWERPC_VSX_OPT > 0

#ifndef __VSX__
#  error "This code requires VSX support (POWER7 and later). Please provide -mvsx compiler flag."
#endif

#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data)
#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data)


/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
 * They're positioned like this:
 *    prev:  c b
 *    row:   a d
 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
 * whichever of a, b, or c is closest to p=a+b-c.
 * (This is taken from ../intel/filter_sse2_intrinsics.c.)
 */
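
/* For reference, a scalar form of the Paeth predictor described above looks
 * roughly like this (an illustrative sketch only, not part of the build):
 *
 *    int paeth_predictor(int a, int b, int c)
 *    {
 *       int p = a + b - c;
 *       int pa = abs(p - a);
 *       int pb = abs(p - b);
 *       int pc = abs(p - c);
 *       if (pa <= pb && pa <= pc) return a;
 *       return (pb <= pc) ? b : c;
 *    }
 *
 * pa, pb and pc are the distances from the estimate p to each neighbour, the
 * nearest neighbour wins, and ties are broken in the order a, b, c.  The
 * vector code below computes the same pa/pb/pc values in 16-bit lanes.
 */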

#define vsx_declare_common_vars(row_info,row,prev_row,offset) \
   png_byte i;\
   png_bytep rp = row + offset;\
   png_const_bytep pp = prev_row;\
   png_size_t unaligned_top = 16 - (((png_size_t)rp % 16));\
   png_size_t istop;\
   if(unaligned_top == 16)\
      unaligned_top = 0;\
   istop = row_info->rowbytes;\
   if(unaligned_top < istop)\
      istop -= unaligned_top;\
   else{\
      unaligned_top = istop;\
      istop = 0;\
   }

void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vsx_declare_common_vars(row_info,row,prev_row,0)

   /* AltiVec loads and stores require 16-byte alignment, but the input row
    * may not be aligned, so the leading unaligned bytes are filtered with
    * ordinary scalar code first.
    */
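   /* Illustrative example (values assumed, not from the source): if rp starts
    * at an address with rp % 16 == 12, vsx_declare_common_vars() above sets
    * unaligned_top = 16 - 12 = 4, so the first 4 bytes are filtered one at a
    * time here, and the remaining istop bytes start on a 16-byte boundary and
    * can be handled 16 at a time by the vector loop below.
    */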
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      rp_vec = vec_add(rp_vec,pp_vec);

      vec_st(rp_vec,0,rp);

      pp += 16;
      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
   {
      /* If the row length is not a multiple of 16, process the
       * remaining bytes with scalar code.
       */
      for (i = 0; i < istop; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
         rp++;
      }
   }
}

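/* The constants below are permute control vectors for vec_perm().  Each byte
 * of a control vector selects one byte from the 32-byte concatenation of the
 * two source operands: indices 0..15 pick from the first operand and indices
 * 16..31 from the second.  The second operand is always VSX_CHAR_ZERO here,
 * so an index of 16 simply produces a zero byte.  The "LEFTSHIFTED" tables
 * move an already-filtered pixel into the position of the following pixel,
 * and the "NOT_SHIFTED" tables extract a pixel in place; the _4 and _3
 * suffixes correspond to 4 and 3 bytes per pixel.
 */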
static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11};

static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16};

static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15};

static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16};

static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
#ifdef __LITTLE_ENDIAN__

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16};

#elif defined(__BIG_ENDIAN__)

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16};

#endif

#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp)
#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp)
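
/* vsx_char_to_short() widens one pixel's bytes to unsigned 16-bit lanes
 * (zero-filling the other half of each lane via VSX_CHAR_ZERO), and
 * vsx_short_to_char() packs the low bytes of those lanes back into place.
 * Illustrative example (values assumed): with bpp = 4 and offset = 1, input
 * bytes 4..7 become four 16-bit lanes holding {b4, b5, b6, b7}, which the
 * Paeth code below needs so that the signed differences b-c and a-c cannot
 * wrap in 8 bits.
 */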

#ifdef PNG_USE_ABS
#  define vsx_abs(number) abs(number)
#else
#  define vsx_abs(number) (((number) > 0) ? (number) : -(number))
#endif

void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* AltiVec loads and stores require 16-byte alignment, but the input row
    * may not be aligned, so the leading unaligned bytes are filtered with
    * ordinary scalar code first.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
         rp++;
      }

}

void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* AltiVec loads and stores require 16-byte alignment, but the input row
    * may not be aligned, so the leading unaligned bytes are filtered with
    * ordinary scalar code first.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);
      rp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, the last byte of this block must be
       * processed separately with scalar code.
       */
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
}

void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* AltiVec loads and stores require 16-byte alignment, but the input row
    * may not be aligned, so the leading unaligned bytes are filtered with
    * ordinary scalar code first.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);

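      /* vec_avg() rounds up ((x + y + 1) >> 1), but the PNG Average filter
       * needs floor((x + y) / 2).  Subtracting ((x ^ y) & 1) converts the
       * rounded-up average into the floored one, because the two differ by
       * exactly one when x and y have different low bits.
       */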
      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}

void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
  const png_byte bpp = 3;

  vector unsigned char rp_vec;
  vector unsigned char pp_vec;
  vector unsigned char pp_part_vec;
  vector unsigned char rp_part_vec;
  vector unsigned char avg_vec;

  vsx_declare_common_vars(row_info,row,prev_row,bpp)
  rp -= bpp;
  if(istop >= bpp)
     istop -= bpp;

  for (i = 0; i < bpp; i++)
  {
     *rp = (png_byte)(((int)(*rp) +
        ((int)(*pp++) / 2 )) & 0xff);

     rp++;
  }

  /* AltiVec loads and stores require 16-byte alignment, but the input row
   * may not be aligned, so the leading unaligned bytes are filtered with
   * ordinary scalar code first.
   */
  for (i = 0; i < unaligned_top; i++)
  {
     *rp = (png_byte)(((int)(*rp) +
        (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

     rp++;
  }

  /* Using SIMD while we can */
  while( istop >= 16 )
  {
     for(i=0;i < bpp ; i++)
     {
        *rp = (png_byte)(((int)(*rp) +
           (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

        rp++;
     }
     rp -= bpp;
     pp -= bpp;

     vec_ld_unaligned(pp_vec,pp);
     rp_vec = vec_ld(0,rp);

     rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
     pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3);
     avg_vec = vec_avg(rp_part_vec,pp_part_vec);
     avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
     rp_vec = vec_add(rp_vec,avg_vec);

     rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
     pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3);
     avg_vec = vec_avg(rp_part_vec,pp_part_vec);
     avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
     rp_vec = vec_add(rp_vec,avg_vec);

     rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
     pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3);
     avg_vec = vec_avg(rp_part_vec,pp_part_vec);
     avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
     rp_vec = vec_add(rp_vec,avg_vec);

     rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
     pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3);
     avg_vec = vec_avg(rp_part_vec,pp_part_vec);
     avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
     rp_vec = vec_add(rp_vec,avg_vec);

     vec_st(rp_vec,0,rp);

     rp += 15;
     pp += 15;
     istop -= 16;

     /* Since 16 % bpp = 16 % 3 = 1, the last byte of this block must be
      * processed separately with scalar code.
      */
     *rp = (png_byte)(((int)(*rp) +
        (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
     rp++;
  }

  if(istop > 0)
     for (i = 0; i < istop % 16; i++)
     {
        *rp = (png_byte)(((int)(*rp) +
           (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

        rp++;
     }
}

/* Bytewise c ? t : e. */
#define if_then_else(c,t,e) vec_sel(e,t,c)

#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
      c = *(pp - bpp);\
      a = *(rp - bpp);\
      b = *pp++;\
      p = b - c;\
      pc = a - c;\
      pa = vsx_abs(p);\
      pb = vsx_abs(pc);\
      pc = vsx_abs(p + pc);\
      if (pb < pa) pa = pb, a = b;\
      if (pc < pa) a = c;\
      a += *rp;\
      *rp++ = (png_byte)a;\
      }
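
/* Note: the strict '<' comparisons above make ties favour a over b and both
 * over c, matching the tie-breaking order (a, then b, then c) of the Paeth
 * predictor.  The vector code below keeps the same order through its nested
 * if_then_else() selections on the vec_cmpeq() masks.
 */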

void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   const png_byte bpp = 4;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as 'up'
    * because there is only one candidate predictor for the first pixel).
    */
   for(i = 0; i < bpp ; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for(i = 0; i < unaligned_top ; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while( istop >= 16)
   {
      for(i = 0; i < bpp ; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

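      /* Each channel is widened to signed 16-bit lanes before computing
       * pa/pb/pc: the differences b-c and a-c can be negative and
       * |(b-c) + (a-c)| can exceed 255, so this arithmetic would overflow
       * in 8 bits.
       */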
      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4)));

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}

void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
  const png_byte bpp = 3;

  int a, b, c, pa, pb, pc, p;
  vector unsigned char rp_vec;
  vector unsigned char pp_vec;
  vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
  vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

  vsx_declare_common_vars(row_info,row,prev_row,bpp)
  rp -= bpp;
  if(istop >= bpp)
     istop -= bpp;

  /* Process the first pixel in the row completely (this is the same as 'up'
   * because there is only one candidate predictor for the first pixel).
   */
  for(i = 0; i < bpp ; i++)
  {
     *rp = (png_byte)( *rp + *pp);
     rp++;
     pp++;
  }

  for(i = 0; i < unaligned_top ; i++)
  {
     vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
  }

  while( istop >= 16)
  {
     for(i = 0; i < bpp ; i++)
     {
        vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
     }

     rp -= bpp;
     pp -= bpp;
     rp_vec = vec_ld(0,rp);
     vec_ld_unaligned(pp_vec,pp);

     a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
     b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3);
     c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
     pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
     pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
     pc_vec = vec_add(pa_vec,pb_vec);
     pa_vec = vec_abs(pa_vec);
     pb_vec = vec_abs(pb_vec);
     pc_vec = vec_abs(pc_vec);
     smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
     nearest_vec =  if_then_else(
           vec_cmpeq(pa_vec,smallest_vec),
           a_vec,
           if_then_else(
             vec_cmpeq(pb_vec,smallest_vec),
             b_vec,
             c_vec
             )
           );
     rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3)));

     a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
     b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3);
     c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
     pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
     pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
     pc_vec = vec_add(pa_vec,pb_vec);
     pa_vec = vec_abs(pa_vec);
     pb_vec = vec_abs(pb_vec);
     pc_vec = vec_abs(pc_vec);
     smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
     nearest_vec =  if_then_else(
           vec_cmpeq(pa_vec,smallest_vec),
           a_vec,
           if_then_else(
             vec_cmpeq(pb_vec,smallest_vec),
             b_vec,
             c_vec
             )
           );
     rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3)));

     a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
     b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3);
     c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
     pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
     pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
     pc_vec = vec_add(pa_vec,pb_vec);
     pa_vec = vec_abs(pa_vec);
     pb_vec = vec_abs(pb_vec);
     pc_vec = vec_abs(pc_vec);
     smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
     nearest_vec =  if_then_else(
           vec_cmpeq(pa_vec,smallest_vec),
           a_vec,
           if_then_else(
             vec_cmpeq(pb_vec,smallest_vec),
             b_vec,
             c_vec
             )
           );
     rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3)));

     a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
     b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3);
     c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
     pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
     pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
     pc_vec = vec_add(pa_vec,pb_vec);
     pa_vec = vec_abs(pa_vec);
     pb_vec = vec_abs(pb_vec);
     pc_vec = vec_abs(pc_vec);
     smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
     nearest_vec =  if_then_else(
           vec_cmpeq(pa_vec,smallest_vec),
           a_vec,
           if_then_else(
             vec_cmpeq(pb_vec,smallest_vec),
             b_vec,
             c_vec
             )
           );
     rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3)));

     vec_st(rp_vec,0,rp);

     rp += 15;
     pp += 15;
     istop -= 16;

     /* Since 16 % bpp = 16 % 3 = 1, the last byte of this block must be
      * processed separately with scalar code.
      */
     vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
  }

  if(istop > 0)
     for (i = 0; i < istop % 16; i++)
     {
        vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
     }
}

#endif /* PNG_POWERPC_VSX_OPT > 0 */
#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */