/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* CHROMA UPSAMPLING */

#include "jsimd_altivec.h"

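/*
 * Fancy (triangle filter) h2v1 upsampling.  Each input sample expands to two
 * output samples weighted toward the nearer neighbor:
 *
 *   even output = (3 * this + last + 1) >> 2
 *   odd output  = (3 * this + next + 2) >> 2
 *
 * which mirrors the scalar algorithm in jdsample.c.  Each iteration of the
 * inner loop below consumes 16 input samples and produces 32 output samples.
 */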
void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
    out;
  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
    next0l, next0h, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
    last_index_col0 =
      {  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14 },
    last_index =
      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
    next_index =
      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
    next_index_lastcol =
      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15 },
#if __BIG_ENDIAN__
    merge_pack_index =
      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
#else
    merge_pack_index =
      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

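    /* If the width is not a multiple of 16, duplicate the last valid column
     * into the byte that follows it, so that the "next pixel" used for the
     * final valid column produces the correct right-edge value.
     */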
    if (downsampled_width & 15)
      inptr[downsampled_width] = inptr[downsampled_width - 1];

    this0 = vec_ld(0, inptr);
    p_last0 = vec_perm(this0, this0, last_index_col0);
    last0 = this0;

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr += 16, outptr += 32) {

      if (downsampled_width - incol > 0) {
        p_last0 = vec_perm(last0, this0, last_index);
        last0 = this0;
      }

      if (incol <= 16)
        p_next0 = vec_perm(this0, this0, next_index_lastcol);
      else {
        next0 = vec_ld(16, inptr);
        p_next0 = vec_perm(this0, next0, next_index);
      }

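      /* Multiply the even- and odd-numbered bytes of this0 by 3, producing
       * 16-bit products, then merge them back into column order as two
       * vectors of eight 16-bit values (low and high halves).
       */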
      this0e = (__vector short)vec_mule(this0, pb_three);
      this0o = (__vector short)vec_mulo(this0, pb_three);
      this0l = vec_mergeh(this0e, this0o);
      this0h = vec_mergel(this0e, this0o);

      last0l = (__vector short)VEC_UNPACKHU(p_last0);
      last0h = (__vector short)VEC_UNPACKLU(p_last0);
      last0l = vec_add(last0l, pw_one);

      next0l = (__vector short)VEC_UNPACKHU(p_next0);
      next0h = (__vector short)VEC_UNPACKLU(p_next0);
      next0l = vec_add(next0l, pw_two);

      outle = vec_add(this0l, last0l);
      outlo = vec_add(this0l, next0l);
      outle = vec_sr(outle, (__vector unsigned short)pw_two);
      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

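      /* merge_pack_index selects the low-order byte of each 16-bit result,
       * interleaving the even (last-neighbor) and odd (next-neighbor) outputs
       * into their final positions.  The byte offsets differ between big- and
       * little-endian builds because the low byte sits at a different offset
       * within each halfword.
       */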
      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr);

      if (incol > 8) {
        last0h = vec_add(last0h, pw_one);
        next0h = vec_add(next0h, pw_two);

        outhe = vec_add(this0h, last0h);
        outho = vec_add(this0h, next0h);
        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
        outho = vec_sr(outho, (__vector unsigned short)pw_two);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr);
      }

      this0 = next0;
    }
  }
}

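/*
 * Fancy (triangle filter) h2v2 upsampling.  Each input row contributes to two
 * output rows.  A vertical weighted sum (3 * this row + adjacent row) is
 * formed first; the horizontal pass then computes
 *
 *   even output = (3 * thiscolsum + lastcolsum + 8) >> 4
 *   odd output  = (3 * thiscolsum + nextcolsum + 7) >> 4
 *
 * mirroring the scalar algorithm in jdsample.c.
 */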
void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
                                       JDIMENSION downsampled_width,
                                       JSAMPARRAY input_data,
                                       JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char this_1, this0, this1, out;
  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    lastcolsum_1h, lastcolsum1h,
    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    tmpl, tmph, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) },
    last_index_col0 =
      {  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13 },
    last_index =
      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
    next_index =
      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
    next_index_lastcol =
      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15 },
#if __BIG_ENDIAN__
    merge_pack_index =
      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
#else
    merge_pack_index =
      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
#endif
  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
  __vector unsigned short pw_four = { __8X(4) };

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

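    /* If the width is not a multiple of 16, duplicate the last valid column
     * of all three input rows, so that the right-edge computation in the
     * final vector iteration is well defined.
     */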
    if (downsampled_width & 15) {
      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    }

    this0 = vec_ld(0, inptr0);
    this0l = (__vector short)VEC_UNPACKHU(this0);
    this0h = (__vector short)VEC_UNPACKLU(this0);
    this0l = vec_mladd(this0l, pw_three, pw_zero);
    this0h = vec_mladd(this0h, pw_three, pw_zero);

    this_1 = vec_ld(0, inptr_1);
    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    thiscolsum_1l = vec_add(this0l, this_1l);
    thiscolsum_1h = vec_add(this0h, this_1h);
    lastcolsum_1h = thiscolsum_1h;
    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

    this1 = vec_ld(0, inptr1);
    this1l = (__vector short)VEC_UNPACKHU(this1);
    this1h = (__vector short)VEC_UNPACKLU(this1);
    thiscolsum1l = vec_add(this0l, this1l);
    thiscolsum1h = vec_add(this0h, this1h);
    lastcolsum1h = thiscolsum1h;
    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
         outptr0 += 32, outptr1 += 32) {

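      /* For every vector after the first one in the row, the "last column"
       * sums are carried over from the previous iteration.  The "next column"
       * sums either repeat the final column (last vector of the row) or are
       * computed from the next 16 input samples of each row.
       */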
      if (downsampled_width - incol > 0) {
        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
      }

      if (incol <= 16) {
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                   next_index_lastcol);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                  next_index_lastcol);
      } else {
        this0 = vec_ld(16, inptr0);
        this0l = (__vector short)VEC_UNPACKHU(this0);
        this0h = (__vector short)VEC_UNPACKLU(this0);
        this0l = vec_mladd(this0l, pw_three, pw_zero);
        this0h = vec_mladd(this0h, pw_three, pw_zero);

        this_1 = vec_ld(16, inptr_1);
        this_1l = (__vector short)VEC_UNPACKHU(this_1);
        this_1h = (__vector short)VEC_UNPACKLU(this_1);
        nextcolsum_1l = vec_add(this0l, this_1l);
        nextcolsum_1h = vec_add(this0h, this_1h);
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

        this1 = vec_ld(16, inptr1);
        this1l = (__vector short)VEC_UNPACKHU(this1);
        this1h = (__vector short)VEC_UNPACKLU(this1);
        nextcolsum1l = vec_add(this0l, this1l);
        nextcolsum1h = vec_add(this0h, this1h);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
      }

      /* Process the upper row */

      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum_1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum_1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr0);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum_1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum_1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr0);
      }

      /* Process the lower row */

      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr1);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr1);
      }

      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
    }
  }
}


/* These are rarely used (mainly just for decompressing YCCK images) */

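/*
 * Plain h2v1 upsampling: each input sample is simply replicated horizontally.
 * vec_mergeh()/vec_mergel() interleave each 16-byte input vector with itself,
 * producing 32 output bytes per input vector.
 */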
void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
                                 JDIMENSION output_width,
                                 JSAMPARRAY input_data,
                                 JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char in, inl, inh;

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

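    /* Round the output width up to a multiple of 32 samples (16 input
     * samples) so that only whole vectors are processed; the sample buffers
     * are assumed to be padded enough to absorb the extra stores.
     */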
    for (incol = (output_width + 31) & (~31); incol > 0;
         incol -= 64, inptr += 32, outptr += 64) {

      in = vec_ld(0, inptr);
      inl = vec_mergeh(in, in);
      inh = vec_mergel(in, in);

      vec_st(inl, 0, outptr);
      vec_st(inh, 16, outptr);

      if (incol > 32) {
        in = vec_ld(16, inptr);
        inl = vec_mergeh(in, in);
        inh = vec_mergel(in, in);

        vec_st(inl, 32, outptr);
        vec_st(inh, 48, outptr);
      }
    }
  }
}

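/*
 * Plain h2v2 upsampling: like h2v1 above, but each horizontally replicated
 * row is written to two consecutive output rows.
 */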
void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
                                 JDIMENSION output_width,
                                 JSAMPARRAY input_data,
                                 JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char in, inl, inh;

  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr = input_data[inrow];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    for (incol = (output_width + 31) & (~31); incol > 0;
         incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {

      in = vec_ld(0, inptr);
      inl = vec_mergeh(in, in);
      inh = vec_mergel(in, in);

      vec_st(inl, 0, outptr0);
      vec_st(inl, 0, outptr1);

      vec_st(inh, 16, outptr0);
      vec_st(inh, 16, outptr1);

      if (incol > 32) {
        in = vec_ld(16, inptr);
        inl = vec_mergeh(in, in);
        inh = vec_mergel(in, in);

        vec_st(inl, 32, outptr0);
        vec_st(inl, 32, outptr1);

        vec_st(inh, 48, outptr0);
        vec_st(inh, 48, outptr1);
      }
    }
  }
}