/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* CHROMA UPSAMPLING */

#include "jsimd_altivec.h"

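/*
 * Fancy (triangle filter) h2v1 upsampling: each output sample is
 * (3 * nearest input sample + next-nearest input sample + bias) >> 2, with a
 * bias of 1 for even output columns and 2 for odd output columns.  The
 * leftmost and rightmost input columns act as their own neighbors.
 */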
void
jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
    out;
  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
    next0l, next0h, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
    last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
    last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
    next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
    next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
#if __BIG_ENDIAN__
    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
#else
    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
#endif
  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

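    /* If the row width is not a multiple of 16, duplicate the rightmost
       sample into the padding column so that the vector loop always reads a
       valid right-hand neighbor for the last real column. */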
    if (downsampled_width & 15)
      inptr[downsampled_width] = inptr[downsampled_width - 1];

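    /* Set up the first vector: column 0 acts as its own left-hand neighbor,
       so p_last0 is this0 shifted right by one position with the first
       element duplicated. */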
    this0 = vec_ld(0, inptr);
    p_last0 = vec_perm(this0, this0, last_index_col0);
    last0 = this0;

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr += 16, outptr += 32) {

      if (downsampled_width - incol > 0) {
        p_last0 = vec_perm(last0, this0, last_index);
        last0 = this0;
      }

      if (incol <= 16)
        p_next0 = vec_perm(this0, this0, next_index_lastcol);
      else {
        next0 = vec_ld(16, inptr);
        p_next0 = vec_perm(this0, next0, next_index);
      }

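      /* this0l/this0h = 3 * this0, widened to 16 bits.  vec_mule()/vec_mulo()
         compute the even- and odd-numbered products, which the merges
         re-interleave into column order. */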
      this0e = (__vector short)vec_mule(this0, pb_three);
      this0o = (__vector short)vec_mulo(this0, pb_three);
      this0l = vec_mergeh(this0e, this0o);
      this0h = vec_mergel(this0e, this0o);

      last0l = (__vector short)VEC_UNPACKHU(p_last0);
      last0h = (__vector short)VEC_UNPACKLU(p_last0);
      last0l = vec_add(last0l, pw_one);

      next0l = (__vector short)VEC_UNPACKHU(p_next0);
      next0h = (__vector short)VEC_UNPACKLU(p_next0);
      next0l = vec_add(next0l, pw_two);

      outle = vec_add(this0l, last0l);
      outlo = vec_add(this0l, next0l);
      outle = vec_sr(outle, (__vector unsigned short)pw_two);
      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

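      /* Interleave the even-column (outle) and odd-column (outlo) results
         and pack them back down to bytes; merge_pack_index selects the
         low-order byte of each 16-bit value. */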
      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr);

      if (incol > 8) {
        last0h = vec_add(last0h, pw_one);
        next0h = vec_add(next0h, pw_two);

        outhe = vec_add(this0h, last0h);
        outho = vec_add(this0h, next0h);
        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
        outho = vec_sr(outho, (__vector unsigned short)pw_two);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr);
      }

      this0 = next0;
    }
  }
}

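/*
 * Fancy h2v2 (2x2) upsampling is done in two passes, like the scalar
 * algorithm: a vertical pass forms column sums (3 * nearer row + farther
 * row), and a horizontal pass computes each output sample as
 * (3 * this column sum + neighboring column sum + bias) >> 4, with a bias
 * of 8 for even output columns and 7 for odd output columns.
 */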
void
jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char this_1, this0, this1, out;
  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    lastcolsum_1h, lastcolsum1h,
    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    nextcolsum_1l = {0}, nextcolsum_1h = {0},
    nextcolsum1l = {0}, nextcolsum1h = {0},
    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    tmpl, tmph, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) },
    last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
    last_index = {14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
    next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
    next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
#if __BIG_ENDIAN__
    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
#else
    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
#endif
  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
  __vector unsigned short pw_four = { __8X(4) };

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    if (downsampled_width & 15) {
      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    }

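    /* Vertical pass for the first vector of the row: thiscolsum_1* holds
       3 * this row + the row above, and thiscolsum1* holds 3 * this row +
       the row below.  The p_lastcolsum* vectors are the column sums shifted
       right by one column, with column 0 acting as its own left-hand
       neighbor. */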
    this0 = vec_ld(0, inptr0);
    this0l = (__vector short)VEC_UNPACKHU(this0);
    this0h = (__vector short)VEC_UNPACKLU(this0);
    this0l = vec_mladd(this0l, pw_three, pw_zero);
    this0h = vec_mladd(this0h, pw_three, pw_zero);

    this_1 = vec_ld(0, inptr_1);
    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    thiscolsum_1l = vec_add(this0l, this_1l);
    thiscolsum_1h = vec_add(this0h, this_1h);
    lastcolsum_1h = thiscolsum_1h;
    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

    this1 = vec_ld(0, inptr1);
    this1l = (__vector short)VEC_UNPACKHU(this1);
    this1h = (__vector short)VEC_UNPACKLU(this1);
    thiscolsum1l = vec_add(this0l, this1l);
    thiscolsum1h = vec_add(this0h, this1h);
    lastcolsum1h = thiscolsum1h;
    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
         outptr0 += 32, outptr1 += 32) {

      if (downsampled_width - incol > 0) {
        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
      }

      if (incol <= 16) {
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                   next_index_lastcol);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                  next_index_lastcol);
      } else {
        this0 = vec_ld(16, inptr0);
        this0l = (__vector short)VEC_UNPACKHU(this0);
        this0h = (__vector short)VEC_UNPACKLU(this0);
        this0l = vec_mladd(this0l, pw_three, pw_zero);
        this0h = vec_mladd(this0h, pw_three, pw_zero);

        this_1 = vec_ld(16, inptr_1);
        this_1l = (__vector short)VEC_UNPACKHU(this_1);
        this_1h = (__vector short)VEC_UNPACKLU(this_1);
        nextcolsum_1l = vec_add(this0l, this_1l);
        nextcolsum_1h = vec_add(this0h, this_1h);
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

        this1 = vec_ld(16, inptr1);
        this1l = (__vector short)VEC_UNPACKHU(this1);
        this1h = (__vector short)VEC_UNPACKLU(this1);
        nextcolsum1l = vec_add(this0l, this1l);
        nextcolsum1h = vec_add(this0h, this1h);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
      }

      /* Process the upper row */

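      /* Even output columns: (3 * thiscolsum + lastcolsum + 8) >> 4
         Odd output columns:  (3 * thiscolsum + nextcolsum + 7) >> 4
         (the lower row below uses the same formula with its own sums) */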
      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum_1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum_1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr0);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum_1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum_1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr0);
      }

      /* Process the lower row */

      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr1);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr1);
      }

      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
    }
  }
}


/* These are rarely used (mainly just for decompressing YCCK images) */

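/*
 * Simple h2v1 upsampling: each input sample is duplicated horizontally with
 * vec_mergeh()/vec_mergel().  The column count is rounded up to a multiple
 * of 32 output samples, so a few columns beyond output_width may be written.
 */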
void
jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
                             JDIMENSION output_width,
                             JSAMPARRAY input_data,
                             JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char in, inl, inh;

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    for (incol = (output_width + 31) & (~31); incol > 0;
         incol -= 64, inptr += 32, outptr += 64) {

      in = vec_ld(0, inptr);
      inl = vec_mergeh(in, in);
      inh = vec_mergel(in, in);

      vec_st(inl, 0, outptr);
      vec_st(inh, 16, outptr);

      if (incol > 32) {
        in = vec_ld(16, inptr);
        inl = vec_mergeh(in, in);
        inh = vec_mergel(in, in);

        vec_st(inl, 32, outptr);
        vec_st(inh, 48, outptr);
      }
    }
  }
}

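/*
 * Simple h2v2 upsampling: the horizontally duplicated vectors are stored to
 * two consecutive output rows, which duplicates each input row vertically as
 * well.
 */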
void
jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
                             JDIMENSION output_width,
                             JSAMPARRAY input_data,
                             JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char in, inl, inh;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr = input_data[inrow];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    for (incol = (output_width + 31) & (~31); incol > 0;
         incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {

      in = vec_ld(0, inptr);
      inl = vec_mergeh(in, in);
      inh = vec_mergel(in, in);

      vec_st(inl, 0, outptr0);
      vec_st(inl, 0, outptr1);

      vec_st(inh, 16, outptr0);
      vec_st(inh, 16, outptr1);

      if (incol > 32) {
        in = vec_ld(16, inptr);
        inl = vec_mergeh(in, in);
        inh = vec_mergel(in, in);

        vec_st(inl, 32, outptr0);
        vec_st(inl, 32, outptr1);

        vec_st(inh, 48, outptr0);
        vec_st(inh, 48, outptr1);
      }
    }
  }
}