1 /*
2 * AltiVec optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
5 *
6 * This software is provided 'as-is', without any express or implied
7 * warranty. In no event will the authors be held liable for any damages
8 * arising from the use of this software.
9 *
10 * Permission is granted to anyone to use this software for any purpose,
11 * including commercial applications, and to alter it and redistribute it
12 * freely, subject to the following restrictions:
13 *
14 * 1. The origin of this software must not be misrepresented; you must not
15 * claim that you wrote the original software. If you use this software
16 * in a product, an acknowledgment in the product documentation would be
17 * appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 * misrepresented as being the original software.
20 * 3. This notice may not be removed or altered from any source distribution.
21 */
22
23 /* CHROMA UPSAMPLING */
24
25 #include "jsimd_altivec.h"
26
27
/*
 * Fancy (smooth) 2:1 horizontal upsampling.
 *
 * For each input sample, two output samples are produced using a
 * triangle filter.  As the arithmetic below shows, each 16-bit lane
 * computes:
 *   even output = (3 * this + previous + 1) >> 2
 *   odd  output = (3 * this + next     + 2) >> 2
 * The +1/+2 biases come from pw_one/pw_two; the >>2 is the vec_sr by 2.
 *
 * max_v_samp_factor  = number of rows to process
 * downsampled_width  = width of each input row in samples
 * input_data         = input rows
 * *output_data_ptr   = output rows (each twice as wide as the input)
 */
void
jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr, outptr;
  int inrow, incol;

  __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
    out;
  __vector short this0e, this0o, this0l, this0h, last0l, last0h,
    next0l, next0h, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
    /* First column: the "previous" neighbor of sample 0 is sample 0 itself */
    last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
    /* Shift window left by one: take byte 15 of the previous vector,
       bytes 0..14 of the current one */
    last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
    /* Shift window right by one: bytes 1..15 of this vector, byte 0 of next */
    next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
    /* Last column: the "next" neighbor of the final sample is itself */
    next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
#if __BIG_ENDIAN__
    /* Interleave the low bytes of the even/odd 16-bit results */
    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
#else
    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
#endif
  __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };

  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    inptr = input_data[inrow];
    outptr = output_data[inrow];

    /* If the row is not a multiple of 16 samples, duplicate the last sample
       so the final (partial) vector load reads well-defined data. */
    if (downsampled_width & 15)
      inptr[downsampled_width] = inptr[downsampled_width - 1];

    this0 = vec_ld(0, inptr);
    p_last0 = vec_perm(this0, this0, last_index_col0);
    last0 = this0;

    /* Process 16 input samples (32 output samples) per iteration */
    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr += 16, outptr += 32) {

      /* After the first iteration, the "previous" window spans the
         previous and current vectors. */
      if (downsampled_width - incol > 0) {
        p_last0 = vec_perm(last0, this0, last_index);
        last0 = this0;
      }

      if (incol <= 16)
        /* Last vector of the row: replicate the final sample as "next" */
        p_next0 = vec_perm(this0, this0, next_index_lastcol);
      else {
        next0 = vec_ld(16, inptr);
        p_next0 = vec_perm(this0, next0, next_index);
      }

      /* 3 * this, widened to 16-bit (even/odd lanes, then re-merged into
         low/high halves in sample order) */
      this0e = (__vector short)vec_mule(this0, pb_three);
      this0o = (__vector short)vec_mulo(this0, pb_three);
      this0l = vec_mergeh(this0e, this0o);
      this0h = vec_mergel(this0e, this0o);

      last0l = (__vector short)VEC_UNPACKHU(p_last0);
      last0h = (__vector short)VEC_UNPACKLU(p_last0);
      last0l = vec_add(last0l, pw_one);   /* rounding bias for even outputs */

      next0l = (__vector short)VEC_UNPACKHU(p_next0);
      next0h = (__vector short)VEC_UNPACKLU(p_next0);
      next0l = vec_add(next0l, pw_two);   /* rounding bias for odd outputs */

      /* (3 * this + prev + 1) >> 2 and (3 * this + next + 2) >> 2 */
      outle = vec_add(this0l, last0l);
      outlo = vec_add(this0l, next0l);
      outle = vec_sr(outle, (__vector unsigned short)pw_two);
      outlo = vec_sr(outlo, (__vector unsigned short)pw_two);

      /* Interleave even/odd results back into byte order and store */
      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr);

      /* High half only needed when more than 8 input columns remain */
      if (incol > 8) {
        last0h = vec_add(last0h, pw_one);
        next0h = vec_add(next0h, pw_two);

        outhe = vec_add(this0h, last0h);
        outho = vec_add(this0h, next0h);
        outhe = vec_sr(outhe, (__vector unsigned short)pw_two);
        outho = vec_sr(outho, (__vector unsigned short)pw_two);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr);
      }

      this0 = next0;
    }
  }
}
122
123
/*
 * Fancy (smooth) 2:1 horizontal and 2:1 vertical upsampling.
 *
 * Each input sample expands to a 2x2 block of output samples using a
 * triangle filter in both directions.  The arithmetic below proceeds in
 * two stages:
 *   1. Vertical:   colsum = 3 * this_row + neighbor_row   (16-bit lanes)
 *   2. Horizontal: out = (3 * colsum + neighbor_colsum + bias) >> 4,
 *      with bias 8 for even output columns and 7 for odd ones.
 * The column sums for the row above (suffix "_1") and the row below
 * (suffix "1") are carried across loop iterations so each 16-sample
 * vector is only computed once.
 *
 * max_v_samp_factor  = number of output rows to produce
 * downsampled_width  = width of each input row in samples
 * input_data         = input rows (rows inrow-1 and inrow+1 are also read)
 * *output_data_ptr   = output rows (each twice as wide as the input)
 */
void
jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
                                   JDIMENSION downsampled_width,
                                   JSAMPARRAY input_data,
                                   JSAMPARRAY *output_data_ptr)
{
  JSAMPARRAY output_data = *output_data_ptr;
  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
  int inrow, outrow, incol;

  __vector unsigned char this_1, this0, this1, out;
  __vector short this_1l, this_1h, this0l, this0h, this1l, this1h,
    lastcolsum_1h, lastcolsum1h,
    p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
    thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
    nextcolsum_1l = {0}, nextcolsum_1h = {0},
    nextcolsum1l = {0}, nextcolsum1h = {0},
    p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
    tmpl, tmph, outle, outhe, outlo, outho;

  /* Constants */
  __vector unsigned char pb_zero = { __16X(0) },
    /* 16-bit-lane permutes (byte pairs).  First column: the "previous"
       neighbor of lane 0 is lane 0 itself. */
    last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
    /* Shift lane window left by one (lane 7 of previous vector,
       lanes 0..6 of current) */
    last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
    /* Shift lane window right by one */
    next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
    /* Last column: the "next" neighbor of the final lane is itself */
    next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
#if __BIG_ENDIAN__
    /* Interleave the low bytes of the even/odd 16-bit results */
    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
#else
    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
#endif
  __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
    pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
  __vector unsigned short pw_four = { __8X(4) };

  /* Each input row produces two output rows */
  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {

    inptr_1 = input_data[inrow - 1];
    inptr0 = input_data[inrow];
    inptr1 = input_data[inrow + 1];
    outptr0 = output_data[outrow++];
    outptr1 = output_data[outrow++];

    /* If the rows are not a multiple of 16 samples, duplicate the last
       sample so the final (partial) vector loads read well-defined data. */
    if (downsampled_width & 15) {
      inptr_1[downsampled_width] = inptr_1[downsampled_width - 1];
      inptr0[downsampled_width] = inptr0[downsampled_width - 1];
      inptr1[downsampled_width] = inptr1[downsampled_width - 1];
    }

    /* Prime the pipeline: 3 * this_row, widened to 16-bit low/high halves */
    this0 = vec_ld(0, inptr0);
    this0l = (__vector short)VEC_UNPACKHU(this0);
    this0h = (__vector short)VEC_UNPACKLU(this0);
    this0l = vec_mladd(this0l, pw_three, pw_zero);
    this0h = vec_mladd(this0h, pw_three, pw_zero);

    /* Column sums with the row above; first-column "previous" window
       replicates lane 0 */
    this_1 = vec_ld(0, inptr_1);
    this_1l = (__vector short)VEC_UNPACKHU(this_1);
    this_1h = (__vector short)VEC_UNPACKLU(this_1);
    thiscolsum_1l = vec_add(this0l, this_1l);
    thiscolsum_1h = vec_add(this0h, this_1h);
    lastcolsum_1h = thiscolsum_1h;
    p_lastcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1l, last_index_col0);
    p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);

    /* Same for the row below */
    this1 = vec_ld(0, inptr1);
    this1l = (__vector short)VEC_UNPACKHU(this1);
    this1h = (__vector short)VEC_UNPACKLU(this1);
    thiscolsum1l = vec_add(this0l, this1l);
    thiscolsum1h = vec_add(this0h, this1h);
    lastcolsum1h = thiscolsum1h;
    p_lastcolsum1l = vec_perm(thiscolsum1l, thiscolsum1l, last_index_col0);
    p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);

    /* Process 16 input samples (32 output samples per row) per iteration */
    for (incol = downsampled_width; incol > 0;
         incol -= 16, inptr_1 += 16, inptr0 += 16, inptr1 += 16,
         outptr0 += 32, outptr1 += 32) {

      /* After the first iteration, shift the "previous column" windows
         using the column sums saved from the last iteration. */
      if (downsampled_width - incol > 0) {
        p_lastcolsum_1l = vec_perm(lastcolsum_1h, thiscolsum_1l, last_index);
        p_lastcolsum_1h = vec_perm(thiscolsum_1l, thiscolsum_1h, last_index);
        p_lastcolsum1l = vec_perm(lastcolsum1h, thiscolsum1l, last_index);
        p_lastcolsum1h = vec_perm(thiscolsum1l, thiscolsum1h, last_index);
        lastcolsum_1h = thiscolsum_1h;  lastcolsum1h = thiscolsum1h;
      }

      if (incol <= 16) {
        /* Last vector of the row: replicate the final column sum */
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, thiscolsum_1h,
                                   next_index_lastcol);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, thiscolsum1h,
                                  next_index_lastcol);
      } else {
        /* Compute the next vector's column sums so the "next column"
           windows can straddle the boundary. */
        this0 = vec_ld(16, inptr0);
        this0l = (__vector short)VEC_UNPACKHU(this0);
        this0h = (__vector short)VEC_UNPACKLU(this0);
        this0l = vec_mladd(this0l, pw_three, pw_zero);
        this0h = vec_mladd(this0h, pw_three, pw_zero);

        this_1 = vec_ld(16, inptr_1);
        this_1l = (__vector short)VEC_UNPACKHU(this_1);
        this_1h = (__vector short)VEC_UNPACKLU(this_1);
        nextcolsum_1l = vec_add(this0l, this_1l);
        nextcolsum_1h = vec_add(this0h, this_1h);
        p_nextcolsum_1l = vec_perm(thiscolsum_1l, thiscolsum_1h, next_index);
        p_nextcolsum_1h = vec_perm(thiscolsum_1h, nextcolsum_1l, next_index);

        this1 = vec_ld(16, inptr1);
        this1l = (__vector short)VEC_UNPACKHU(this1);
        this1h = (__vector short)VEC_UNPACKLU(this1);
        nextcolsum1l = vec_add(this0l, this1l);
        nextcolsum1h = vec_add(this0h, this1h);
        p_nextcolsum1l = vec_perm(thiscolsum1l, thiscolsum1h, next_index);
        p_nextcolsum1h = vec_perm(thiscolsum1h, nextcolsum1l, next_index);
      }

      /* Process the upper row */

      /* even = (3 * colsum + prev_colsum + 8) >> 4,
         odd  = (3 * colsum + next_colsum + 7) >> 4 */
      tmpl = vec_mladd(thiscolsum_1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum_1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum_1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      /* Interleave even/odd results back into byte order and store */
      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr0);

      /* High half only needed when more than 8 input columns remain */
      if (incol > 8) {
        tmph = vec_mladd(thiscolsum_1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum_1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum_1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr0);
      }

      /* Process the lower row (same filter, row-below column sums) */

      tmpl = vec_mladd(thiscolsum1l, pw_three, pw_zero);
      outle = vec_add(tmpl, p_lastcolsum1l);
      outle = vec_add(outle, pw_eight);
      outle = vec_sr(outle, pw_four);

      outlo = vec_add(tmpl, p_nextcolsum1l);
      outlo = vec_add(outlo, pw_seven);
      outlo = vec_sr(outlo, pw_four);

      out = vec_perm((__vector unsigned char)outle,
                     (__vector unsigned char)outlo, merge_pack_index);
      vec_st(out, 0, outptr1);

      if (incol > 8) {
        tmph = vec_mladd(thiscolsum1h, pw_three, pw_zero);
        outhe = vec_add(tmph, p_lastcolsum1h);
        outhe = vec_add(outhe, pw_eight);
        outhe = vec_sr(outhe, pw_four);

        outho = vec_add(tmph, p_nextcolsum1h);
        outho = vec_add(outho, pw_seven);
        outho = vec_sr(outho, pw_four);

        out = vec_perm((__vector unsigned char)outhe,
                       (__vector unsigned char)outho, merge_pack_index);
        vec_st(out, 16, outptr1);
      }

      /* Roll the pipeline forward one vector */
      thiscolsum_1l = nextcolsum_1l;  thiscolsum_1h = nextcolsum_1h;
      thiscolsum1l = nextcolsum1l;  thiscolsum1h = nextcolsum1h;
    }
  }
}
305
306
307 /* These are rarely used (mainly just for decompressing YCCK images) */
308
309 void
jsimd_h2v1_upsample_altivec(int max_v_samp_factor,JDIMENSION output_width,JSAMPARRAY input_data,JSAMPARRAY * output_data_ptr)310 jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
311 JDIMENSION output_width,
312 JSAMPARRAY input_data,
313 JSAMPARRAY *output_data_ptr)
314 {
315 JSAMPARRAY output_data = *output_data_ptr;
316 JSAMPROW inptr, outptr;
317 int inrow, incol;
318
319 __vector unsigned char in, inl, inh;
320
321 for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
322 inptr = input_data[inrow];
323 outptr = output_data[inrow];
324
325 for (incol = (output_width + 31) & (~31); incol > 0;
326 incol -= 64, inptr += 32, outptr += 64) {
327
328 in = vec_ld(0, inptr);
329 inl = vec_mergeh(in, in);
330 inh = vec_mergel(in, in);
331
332 vec_st(inl, 0, outptr);
333 vec_st(inh, 16, outptr);
334
335 if (incol > 32) {
336 in = vec_ld(16, inptr);
337 inl = vec_mergeh(in, in);
338 inh = vec_mergel(in, in);
339
340 vec_st(inl, 32, outptr);
341 vec_st(inh, 48, outptr);
342 }
343 }
344 }
345 }
346
347
348 void
jsimd_h2v2_upsample_altivec(int max_v_samp_factor,JDIMENSION output_width,JSAMPARRAY input_data,JSAMPARRAY * output_data_ptr)349 jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
350 JDIMENSION output_width,
351 JSAMPARRAY input_data,
352 JSAMPARRAY *output_data_ptr)
353 {
354 JSAMPARRAY output_data = *output_data_ptr;
355 JSAMPROW inptr, outptr0, outptr1;
356 int inrow, outrow, incol;
357
358 __vector unsigned char in, inl, inh;
359
360 for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
361
362 inptr = input_data[inrow];
363 outptr0 = output_data[outrow++];
364 outptr1 = output_data[outrow++];
365
366 for (incol = (output_width + 31) & (~31); incol > 0;
367 incol -= 64, inptr += 32, outptr0 += 64, outptr1 += 64) {
368
369 in = vec_ld(0, inptr);
370 inl = vec_mergeh(in, in);
371 inh = vec_mergel(in, in);
372
373 vec_st(inl, 0, outptr0);
374 vec_st(inl, 0, outptr1);
375
376 vec_st(inh, 16, outptr0);
377 vec_st(inh, 16, outptr1);
378
379 if (incol > 32) {
380 in = vec_ld(16, inptr);
381 inl = vec_mergeh(in, in);
382 inh = vec_mergel(in, in);
383
384 vec_st(inl, 32, outptr0);
385 vec_st(inl, 32, outptr1);
386
387 vec_st(inh, 48, outptr0);
388 vec_st(inh, 48, outptr1);
389 }
390 }
391 }
392 }
393