/* filter_vsx_intrinsics.c - PowerPC optimised filter functions
 *
 * Copyright (c) 2017 Glenn Randers-Pehrson
 * Written by Vadim Barkov, 2017.
 * Last changed in libpng 1.6.29 [March 16, 2017]
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
#include <stdio.h>
#include <stdint.h>
#include "../pngpriv.h"

#ifdef PNG_READ_SUPPORTED

/* This code requires -maltivec and -mvsx on the command line: */
#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <altivec.h>

#if PNG_POWERPC_VSX_OPT > 0

#ifndef __VSX__
#  error "This code requires VSX support (POWER7 and later). Please provide the -mvsx compiler flag."
#endif

/* vec_ld/vec_st require 16-byte aligned addresses, so unaligned accesses
 * go through the VSX load/store intrinsics, which have no such restriction.
 */
#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data)
#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data)

/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
 * They're positioned like this:
 *    prev:  c b
 *    row:   a d
 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
 * whichever of a, b, or c is closest to p=a+b-c.
 * (This is taken from ../intel/filter_sse2_intrinsics.c)
 */
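
/* For reference, a minimal scalar sketch of the Paeth predictor described
 * above, matching the PNG specification (illustration only, not part of the
 * build; paeth_predict is a hypothetical helper, and the SIMD code below
 * computes the same result many bytes at a time):
 *
 *    static int paeth_predict(int a, int b, int c)
 *    {
 *       int p  = a + b - c;     // initial estimate
 *       int pa = abs(p - a);    // distance of the estimate from a
 *       int pb = abs(p - b);    // ... from b
 *       int pc = abs(p - c);    // ... from c
 *       // Break ties in the order a, b, c.
 *       if (pa <= pb && pa <= pc) return a;
 *       if (pb <= pc) return b;
 *       return c;
 *    }
 */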

#define vsx_declare_common_vars(row_info,row,prev_row,offset) \
   png_byte i;\
   png_bytep rp = row + offset;\
   png_const_bytep pp = prev_row;\
   png_size_t unaligned_top = 16 - (((png_size_t)rp % 16));\
   png_size_t istop;\
   if (unaligned_top == 16)\
      unaligned_top = 0;\
   istop = row_info->rowbytes;\
   if (unaligned_top < istop)\
      istop -= unaligned_top;\
   else\
   {\
      unaligned_top = istop;\
      istop = 0;\
   }

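/* Worked example of the alignment computation above: if rp == 0x1005, then
 * (png_size_t)rp % 16 == 5, so unaligned_top == 11; the first 11 bytes of
 * the row are handled by scalar code and SIMD processing starts at 0x1010,
 * which is 16-byte aligned. (The address is illustrative only.)
 */
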
void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vsx_declare_common_vars(row_info,row,prev_row,0)

   /* Altivec operations require 16-byte aligned data,
    * but the input can be unaligned, so the leading
    * unaligned bytes are processed with scalar code.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
      rp++;
   }

   /* Use SIMD while at least one full vector remains */
   while (istop >= 16)
   {
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      rp_vec = vec_add(rp_vec,pp_vec);

      vec_st(rp_vec,0,rp);

      pp += 16;
      rp += 16;
      istop -= 16;
   }

   if (istop > 0)
   {
      /* If the byte count of the row is not divisible by 16,
       * process the remaining bytes with scalar code.
       */
      for (i = 0; i < istop; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
         rp++;
      }
   }
}

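/* The constants below are selector vectors for vec_perm: each element is a
 * byte index, where 0..15 selects from the first source vector and 16..31
 * from the second. The second source is always VSX_CHAR_ZERO here, so an
 * index of 16 produces a zero byte. The _4 selectors move whole 4-byte
 * pixels and the _3 selectors whole 3-byte pixels.
 */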
static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11};

static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16};

static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15};

static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16};

static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
#ifdef __LITTLE_ENDIAN__

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16};

#elif defined(__BIG_ENDIAN__)

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16};

#endif

#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp)
#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp)
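
/* Example of the widening macros above: vsx_char_to_short(v,1,4) zero-
 * extends bytes 4..7 of v into four 16-bit lanes (the selector tables are
 * endian-specific so the extension is correct either way), and
 * vsx_short_to_char(v,1,4) narrows those lanes back into bytes 4..7.
 * The 16-bit lanes give the Paeth code headroom for signed differences
 * that fall outside the 0..255 range of a byte.
 */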

#ifdef PNG_USE_ABS
#  define vsx_abs(number) abs(number)
#else
/* Parenthesised so the macro expands safely inside larger expressions. */
#  define vsx_abs(number) (((number) > 0) ? (number) : -(number))
#endif

void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* Altivec operations require 16-byte aligned data,
    * but the input can be unaligned, so the leading
    * unaligned bytes are processed with scalar code.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Use SIMD while at least one full vector remains */
   while (istop >= 16)
   {
      /* Resolve the first pixel of the vector with scalar code, then
       * propagate it across the remaining pixels with shifted adds.
       */
      for (i = 0; i < bpp; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      istop -= 16;
   }

   if (istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
         rp++;
      }
}

void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* Altivec operations require 16-byte aligned data,
    * but the input can be unaligned, so the leading
    * unaligned bytes are processed with scalar code.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Use SIMD while at least one full vector remains */
   while (istop >= 16)
   {
      /* Resolve the first pixel of the vector with scalar code, then
       * propagate it across the following pixels with shifted adds.
       */
      for (i = 0; i < bpp; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);
      rp += 15; /* 5 complete 3-byte pixels occupy 15 bytes of the vector */
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, the last byte of the vector must
       * be processed manually.
       */
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   if (istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
}

void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if (istop >= bpp)
      istop -= bpp;

   /* The first pixel has no left neighbour, so only half of the
    * previous row's byte is added.
    */
   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* Altivec operations require 16-byte aligned data,
    * but the input can be unaligned, so the leading
    * unaligned bytes are processed with scalar code.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Use SIMD while at least one full vector remains */
   while (istop >= 16)
   {
      for (i = 0; i < bpp; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);

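      /* vec_avg() computes the rounded average (a+b+1)>>1, but the PNG
       * Avg filter needs the truncated average (a+b)>>1. Subtracting
       * (a^b)&1, which is 1 exactly when a+b is odd, converts the
       * former into the latter.
       */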
      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if (istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}

void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   const png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if (istop >= bpp)
      istop -= bpp;

   /* The first pixel has no left neighbour, so only half of the
    * previous row's byte is added.
    */
   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* Altivec operations require 16-byte aligned data,
    * but the input can be unaligned, so the leading
    * unaligned bytes are processed with scalar code.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Use SIMD while at least one full vector remains */
   while (istop >= 16)
   {
      for (i = 0; i < bpp; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);

      /* As in the 4-bpp case, vec_avg() rounds up, so the truncated
       * average is restored by subtracting (a^b)&1.
       */
      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 15; /* 5 complete 3-byte pixels occupy 15 bytes of the vector */
      pp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, the last byte of the vector must
       * be processed manually.
       */
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
      rp++;
   }

   if (istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}

/* Bytewise c ? t : e. */
#define if_then_else(c,t,e) vec_sel(e,t,c)

/* One scalar Paeth step, using the usual reduction: with p = a+b-c,
 * pa = |p-a| = |b-c|, pb = |p-b| = |a-c|, pc = |p-c| = |(b-c)+(a-c)|;
 * ties are broken in the order a, b, c.
 */
#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
      c = *(pp - bpp);\
      a = *(rp - bpp);\
      b = *pp++;\
      p = b - c;\
      pc = a - c;\
      pa = vsx_abs(p);\
      pb = vsx_abs(pc);\
      pc = vsx_abs(p + pc);\
      if (pb < pa) pa = pb, a = b;\
      if (pc < pa) a = c;\
      a += *rp;\
      *rp++ = (png_byte)a;\
   }

void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   const png_byte bpp = 4;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if (istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as
    * 'up' because there is only one candidate predictor for the first
    * pixel).
    */
   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for (i = 0; i < unaligned_top; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while (istop >= 16)
   {
      for (i = 0; i < bpp; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

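      /* Each pixel is widened to 16-bit lanes before pa/pb/pc are
       * computed, since the signed differences can fall outside the
       * 0..255 range of a byte. The pixels must still be resolved one
       * group at a time, because each prediction depends on the
       * just-reconstructed pixel to its left.
       */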
      a_vec = vsx_char_to_short(vec_perm(rp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED1_4),1,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_NOT_SHIFTED1_4),1,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED1_4),1,4);
      pa_vec = (vector signed short) vec_sub(b_vec, c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
         vec_cmpeq(pa_vec,smallest_vec),
         a_vec,
         if_then_else(
            vec_cmpeq(pb_vec,smallest_vec),
            b_vec,
            c_vec
         )
      );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED2_4),2,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_NOT_SHIFTED2_4),2,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED2_4),2,4);
      pa_vec = (vector signed short) vec_sub(b_vec, c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
         vec_cmpeq(pa_vec,smallest_vec),
         a_vec,
         if_then_else(
            vec_cmpeq(pb_vec,smallest_vec),
            b_vec,
            c_vec
         )
      );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED3_4),3,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_NOT_SHIFTED3_4),3,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED3_4),3,4);
      pa_vec = (vector signed short) vec_sub(b_vec, c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
         vec_cmpeq(pa_vec,smallest_vec),
         a_vec,
         if_then_else(
            vec_cmpeq(pb_vec,smallest_vec),
            b_vec,
            c_vec
         )
      );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4)));

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if (istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}

void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   const png_byte bpp = 3;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if (istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as
    * 'up' because there is only one candidate predictor for the first
    * pixel).
    */
   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for (i = 0; i < unaligned_top; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while (istop >= 16)
   {
      for (i = 0; i < bpp; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      a_vec = vsx_char_to_short(vec_perm(rp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED1_3),1,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_NOT_SHIFTED1_3),1,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED1_3),1,3);
      pa_vec = (vector signed short) vec_sub(b_vec, c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
         vec_cmpeq(pa_vec,smallest_vec),
         a_vec,
         if_then_else(
            vec_cmpeq(pb_vec,smallest_vec),
            b_vec,
            c_vec
         )
      );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED2_3),2,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_NOT_SHIFTED2_3),2,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED2_3),2,3);
      pa_vec = (vector signed short) vec_sub(b_vec, c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
         vec_cmpeq(pa_vec,smallest_vec),
         a_vec,
         if_then_else(
            vec_cmpeq(pb_vec,smallest_vec),
            b_vec,
            c_vec
         )
      );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED3_3),3,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_NOT_SHIFTED3_3),3,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED3_3),3,3);
      pa_vec = (vector signed short) vec_sub(b_vec, c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
         vec_cmpeq(pa_vec,smallest_vec),
         a_vec,
         if_then_else(
            vec_cmpeq(pb_vec,smallest_vec),
            b_vec,
            c_vec
         )
      );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED4_3),4,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_NOT_SHIFTED4_3),4,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec, VSX_CHAR_ZERO, VSX_LEFTSHIFTED4_3),4,3);
      pa_vec = (vector signed short) vec_sub(b_vec, c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
         vec_cmpeq(pa_vec,smallest_vec),
         a_vec,
         if_then_else(
            vec_cmpeq(pb_vec,smallest_vec),
            b_vec,
            c_vec
         )
      );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3)));

      vec_st(rp_vec,0,rp);

      rp += 15; /* 5 complete 3-byte pixels occupy 15 bytes of the vector */
      pp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, the last byte of the vector must
       * be processed manually.
       */
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   if (istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}

#endif /* PNG_POWERPC_VSX_OPT > 0 */
#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */
768