1 /*
2 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26
27
28 /*
29 * FUNCTIONS
30 * mlib_v_ImageChannelInsert_U8_12_D1
31 * mlib_v_ImageChannelInsert_U8_13_D1
32 * mlib_v_ImageChannelInsert_U8_14_D1
33 *
34 * ARGUMENT
35 * src pointer to source image data
36 * dst pointer to destination image data
37 * slb source image line stride in bytes
38 * dlb destination image line stride in bytes
39 * dsize image data size in pixels
40 * xsize image width in pixels
41 * ysize image height in lines
42 * cmask channel mask
43 *
44 * DESCRIPTION
45 * Copy the 1-channel source image into the selected channel
46 * of the destination image -- VIS version low level functions.
47 *
48 * NOTE
49 * These functions are separated from mlib_v_ImageChannelInsert.c
50 * for loop unrolling and structure clarity.
51 */
52
53 #include "vis_proto.h"
54 #include "mlib_image.h"
55 #include "mlib_v_ImageChannelInsert.h"
56
57 /***************************************************************/
/*
 * INSERT_U8_12(sd0, dd0, dd1) -- channel duplicate.
 *
 *   sd0   8 bytes of 1-channel source data (read only; evaluated more
 *         than once, so pass a plain variable)
 *   dd0   receives vis_fpmerge() of the high half of sd0 with itself
 *   dd1   receives vis_fpmerge() of the low  half of sd0 with itself
 *
 * Merging a half with itself places every source byte in two adjacent
 * byte lanes, so whichever lane of a 2-channel pixel is later selected
 * by the partial-store mask already holds the right value.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a
 * single statement and stays correct under an unbraced if/else.
 */
#define INSERT_U8_12(sd0, dd0, dd1)     /* channel duplicate */        \
  do {                                                                  \
    (dd0) = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0));            \
    (dd1) = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0));            \
  } while (0)
61
62 /***************************************************************/
63 /* insert one channel to a 2-channel image.
64 */
65
/*
 * mlib_v_ImageChannelInsert_U8_12_D1
 *
 * Copy a 1-channel U8 line into one channel of a 2-channel U8 line.
 *
 *   src    pointer to source data (1 byte per pixel), any alignment
 *   dst    pointer to destination data (2 bytes per pixel), any alignment
 *   dsize  number of pixels to process; values <= 0 are a no-op
 *   cmask  channel mask selecting the destination channel; it is
 *          replicated every 2 bits to build the byte mask handed to
 *          vis_pst_8 (presumably 2 or 1 -- confirm against the caller)
 *
 * Source bytes are duplicated with INSERT_U8_12 and written with masked
 * 8-byte partial stores, so only the selected channel bytes of dst are
 * modified.  Four code paths handle the relative alignment of the
 * duplicated source stream (soff * 2) against the destination (doff).
 *
 * NOTE: source data is loaded in whole aligned 8-byte words, so bytes
 * outside [src, src + dsize) that share those words may be read.
 */
void mlib_v_ImageChannelInsert_U8_12_D1(const mlib_u8 *src,
                                        mlib_u8       *dst,
                                        mlib_s32      dsize,
                                        mlib_s32      cmask)
{
  mlib_u8 *sa, *da;
  mlib_u8 *dend, *dend2;                /* end points in dst */
  mlib_d64 *dp;                         /* 8-byte aligned start points in dst */
  mlib_d64 *sp;                         /* 8-byte aligned start point in src */
  mlib_d64 sd0, sd1;                    /* 8-byte source data */
  mlib_d64 dd0, dd1, dd2, dd3;          /* 8-byte destination data */
  mlib_s32 soff;                        /* offset of address in src */
  mlib_s32 doff;                        /* offset of address in dst */
  mlib_s32 off;                         /* offset of src over dst */
  mlib_s32 emask;                       /* edge mask */
  mlib_s32 bmask;                       /* channel mask */
  mlib_s32 i, n;

  /*
   * Nothing to do for an empty line.  Without this guard dend would lie
   * before da; when dst is 8-byte aligned the two addresses fall into
   * different 8-byte words and the edge mask can no longer be relied on
   * to suppress the first partial store.
   */
  if (dsize <= 0) {
    return;
  }

  /* replicate the 2-bit channel mask over the 4 pixels of an 8-byte word */
  bmask = cmask | (cmask << 2) | (cmask << 4) | (cmask << 6);

  sa = (void *)src;
  da = dst;

  /* prepare the source address */
  sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp = (mlib_d64 *) ((mlib_addr) da & (~7));
  doff = ((mlib_addr) da & 7);
  dend = da + dsize * 2 - 1;            /* last byte of the dst line */
  dend2 = dend - 15;                    /* last start of a full 16-byte group */

  /* calculate the src's offset over dst */
  off = soff * 2 - doff;

  /*
   * dp is aligned down from da; if da is odd the two channel lanes
   * swap places inside the aligned word, so flip the byte mask.
   */
  if (doff % 2 != 0) {
    bmask = (~bmask) & 0xff;
  }

  if (off == 0) {                       /* src and dst have same alignment */

    /* load 8 bytes */
    sd0 = *sp++;

    /* insert, including some garbage at the start point */
    INSERT_U8_12(sd0, dd0, dd1);

    /* store 16 bytes result */
    emask = vis_edge8(da, dend);
    vis_pst_8(dd0, dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      vis_pst_8(dd1, dp++, emask & bmask);
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        sd0 = *sp++;
        INSERT_U8_12(sd0, dd0, dd1);
        vis_pst_8(dd0, dp++, bmask);
        vis_pst_8(dd1, dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      sd0 = *sp++;
      INSERT_U8_12(sd0, dd0, dd1);
      emask = vis_edge8(dp, dend);
      vis_pst_8(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask & bmask);
      }
    }
  }
  else if (off < 0) {                   /* duplicated src lags behind dst */
    vis_alignaddr((void *)0, off);

    /* generate edge mask for the start point */
    emask = vis_edge8(da, dend);

    /* load 8 bytes */
    sd0 = *sp++;

    /* insert and store 16 bytes */
    INSERT_U8_12(sd0, dd0, dd1);
    vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        dd2 = dd1;                      /* carry the previous tail word */
        sd0 = *sp++;
        INSERT_U8_12(sd0, dd0, dd1);
        vis_pst_8(vis_faligndata(dd2, dd0), dp++, bmask);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      dd2 = dd1;
      sd0 = *sp++;
      INSERT_U8_12(sd0, dd0, dd1);
      vis_pst_8(vis_faligndata(dd2, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
      }
    }
  }
  else if (off < 8) {                   /* src leads dst by less than a word */
    vis_alignaddr((void *)0, off);

    /* generate edge mask for the start point */
    emask = vis_edge8(da, dend);

    /* load 16 bytes */
    sd0 = *sp++;
    sd1 = *sp++;

    /* insert and store 16 bytes */
    INSERT_U8_12(sd0, dd0, dd1);
    INSERT_U8_12(sd1, dd2, dd3);
    vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        dd0 = dd2;                      /* slide the 4-word window by two */
        dd1 = dd3;
        sd1 = *sp++;
        INSERT_U8_12(sd1, dd2, dd3);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      dd0 = dd2;
      dd1 = dd3;
      sd1 = *sp++;
      INSERT_U8_12(sd1, dd2, dd3);
      vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
      }
    }
  }
  else {                                /* (off >= 8) */
    vis_alignaddr((void *)0, off);

    /* generate edge mask for the start point */
    emask = vis_edge8(da, dend);

    /* load 16 bytes */
    sd0 = *sp++;
    sd1 = *sp++;

    /* insert and store 16 bytes */
    INSERT_U8_12(sd0, dd0, dd1);
    INSERT_U8_12(sd1, dd2, dd3);
    vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 16 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        dd0 = dd2;                      /* slide the 4-word window by two */
        dd1 = dd3;
        sd1 = *sp++;
        INSERT_U8_12(sd1, dd2, dd3);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
        vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      emask = vis_edge8(dp, dend);
      dd0 = dd2;
      dd1 = dd3;
      sd1 = *sp++;
      INSERT_U8_12(sd1, dd2, dd3);
      vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
      }
    }
  }
}
288
289 /***************************************************************/
/*
 * LOAD_INSERT_STORE_U8(channeld)
 *
 * Load the next 8 source bytes and store them one at a time, channeld
 * bytes apart, into the destination.
 *
 *   channeld  byte distance between consecutive destination samples
 *
 * The macro works on variables of the expanding function, which must
 * have these in scope under exactly these names:
 *   off       byte offset of the source inside its aligned 8-byte word
 *   sp        pointer to the next aligned source word (advanced by one)
 *   sd0, sd1  two-word source window; sd1 must already hold the word
 *             loaded last
 *   sd        scratch 8-byte register
 *   da        destination byte pointer (advanced by 8 * channeld)
 *
 * Steps: the alignment offset is set to off and the 8 wanted bytes are
 * extracted from the sd0:sd1 window; the offset is then set to 1 so
 * that each vis_faligndata(sd, sd) steps sd by one byte ahead of the
 * single-byte store vis_st_u8.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a
 * single statement and stays correct under an unbraced if/else.
 */
#define LOAD_INSERT_STORE_U8(channeld)                                  \
  do {                                                                  \
    vis_alignaddr((void *)0, off);                                      \
    sd0 = sd1;                                                          \
    sd1 = *sp++;                                                        \
    sd  = vis_faligndata(sd0, sd1);                                     \
    vis_alignaddr((void *)0, 1);                                        \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
    vis_st_u8(sd = vis_faligndata(sd, sd), da); da += (channeld);       \
  } while (0)
304
305 /***************************************************************/
/*
 * mlib_v_ImageChannelInsert_U8_13_D1
 *
 * Copy a 1-channel U8 line into one channel of a 3-channel U8 line.
 *
 *   src    pointer to source data (1 byte per pixel), any alignment
 *   dst    pointer to destination data (3 bytes per pixel)
 *   dsize  number of pixels to process; values <= 0 are a no-op
 *   cmask  channel mask; 4, 2, 1 select channel 0, 1, 2 respectively.
 *          Values <= 0 are rejected (no-op) because the channel index
 *          is computed as 2 / cmask.
 *
 * Destination bytes are written one at a time with vis_st_u8, three
 * bytes apart, so the other two channels of dst are never touched.
 *
 * NOTE: source data is loaded in whole aligned 8-byte words and one
 * word is always fetched ahead, so bytes outside [src, src + dsize)
 * may be read.
 *
 * The local names sd0, sd1, sd, sp, da and off are used by name inside
 * LOAD_INSERT_STORE_U8 and must not be renamed.
 */
void mlib_v_ImageChannelInsert_U8_13_D1(const mlib_u8 *src,
                                        mlib_u8       *dst,
                                        mlib_s32      dsize,
                                        mlib_s32      cmask)
{
  mlib_u8 *sa, *da;
  mlib_u8 *dend;                        /* end point in destination */
  mlib_d64 *sp;                         /* 8-byte aligned start points in src */
  mlib_d64 sd0, sd1, sd;                /* 8-byte registers for source data */
  mlib_s32 off;                         /* offset of address alignment in src */
  mlib_s32 i;

  /*
   * Empty line: nothing to copy, and no reason to touch src at all.
   * cmask <= 0 would divide by zero or step before dst below.
   */
  if (dsize <= 0 || cmask <= 0) {
    return;
  }

  /* prepare the src address */
  sa = (void *)src;
  sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
  off = (mlib_addr) sa & 7;

  /* prepare the dst address */
  da = dst + (2 / cmask);               /* 4,2,1 -> 0,1,2 */
  dend = da + dsize * 3 - 1;

  /* prime the two-word source window */
  sd1 = *sp++;

  /* full groups of 8 pixels */
#pragma pipeloop(0)
  for (i = 0; i < dsize / 8; i++) {
    LOAD_INSERT_STORE_U8(3);
  }

  /* right end handling: at most 7 pixels (dsize % 8) remain here */
  if ((mlib_addr) da <= (mlib_addr) dend) {

    vis_alignaddr((void *)0, off);
    sd0 = sd1;
    sd1 = *sp++;
    sd = vis_faligndata(sd0, sd1);

    /* step sd by one byte per pixel and store it, 3 bytes apart */
    vis_alignaddr((void *)0, 1);
    for (; (mlib_addr) da <= (mlib_addr) dend; da += 3) {
      vis_st_u8(sd = vis_faligndata(sd, sd), da);
    }
  }
}
370
371 /***************************************************************/
/*
 * INSERT_U8_14(sd0, dd0, dd1, dd2, dd3)
 *
 *   sd0        8 bytes of 1-channel source data (read only; evaluated
 *              more than once, so pass a plain variable)
 *   dd0..dd3   receive the 32 expanded bytes, in order
 *
 * Two rounds of vis_fpmerge() of a half-word with itself: the first
 * round doubles every source byte (into sda / sdb), the second doubles
 * again, so every source byte ends up in four adjacent byte lanes and
 * any lane of a 4-channel pixel selected by the partial-store mask
 * holds the right value.
 *
 * The macro uses two mlib_d64 scratch variables named sda and sdb that
 * must be declared in the expanding function.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a
 * single statement and stays correct under an unbraced if/else.
 */
#define INSERT_U8_14(sd0, dd0, dd1, dd2, dd3)                           \
  do {                                                                  \
    sda   = vis_fpmerge(vis_read_hi(sd0), vis_read_hi(sd0));            \
    sdb   = vis_fpmerge(vis_read_lo(sd0), vis_read_lo(sd0));            \
    (dd0) = vis_fpmerge(vis_read_hi(sda), vis_read_hi(sda));            \
    (dd1) = vis_fpmerge(vis_read_lo(sda), vis_read_lo(sda));            \
    (dd2) = vis_fpmerge(vis_read_hi(sdb), vis_read_hi(sdb));            \
    (dd3) = vis_fpmerge(vis_read_lo(sdb), vis_read_lo(sdb));            \
  } while (0)
379
380 /***************************************************************/
/*
 * mlib_v_ImageChannelInsert_U8_14_D1
 *
 * Copy a 1-channel U8 line into one channel of a 4-channel U8 line.
 *
 *   src    pointer to source data (1 byte per pixel), any alignment
 *   dst    pointer to destination data (4 bytes per pixel), any alignment
 *   dsize  number of pixels to process; values <= 0 are a no-op
 *   cmask  channel mask selecting the destination channel; it is
 *          replicated every 4 bits and shifted by the destination
 *          offset to build the byte mask handed to vis_pst_8
 *          (presumably one of 8, 4, 2, 1 -- confirm against the caller)
 *
 * Source bytes are first realigned to an 8-byte boundary, quadrupled
 * with INSERT_U8_14, and then written with masked 8-byte partial stores
 * so that only the selected channel bytes of dst are modified.
 *
 * NOTE: source data is loaded in whole aligned 8-byte words and one
 * word is always fetched ahead, so bytes outside [src, src + dsize)
 * may be read.
 *
 * sda and sdb are scratch registers used by name inside INSERT_U8_14.
 */
void mlib_v_ImageChannelInsert_U8_14_D1(const mlib_u8 *src,
                                        mlib_u8       *dst,
                                        mlib_s32      dsize,
                                        mlib_s32      cmask)
{
  mlib_u8 *sa, *da;
  mlib_u8 *dend, *dend2;                /* end points in dst */
  mlib_d64 *dp;                         /* 8-byte aligned start points in dst */
  mlib_d64 *sp;                         /* 8-byte aligned start point in src */
  mlib_d64 sd0, sd1, sd;                /* 8-byte source data */
  mlib_d64 sda, sdb;                    /* scratch for INSERT_U8_14 */
  mlib_d64 dd0, dd1, dd2, dd3, dd4;     /* 8-byte destination data */
  mlib_s32 soff;                        /* offset of address in src */
  mlib_s32 doff;                        /* offset of address in dst */
  mlib_s32 emask;                       /* edge mask */
  mlib_s32 bmask;                       /* channel mask */
  mlib_s32 i, n;

  /*
   * Nothing to do for an empty line.  Without this guard dend would lie
   * before da; when dst is 8-byte aligned the two addresses fall into
   * different 8-byte words and the edge mask can no longer be relied on
   * to suppress the first partial store.
   */
  if (dsize <= 0) {
    return;
  }

  sa = (void *)src;
  da = dst;

  /* replicate the 4-bit channel mask; the extra copy feeds the shift below */
  bmask = cmask | (cmask << 4) | (cmask << 8);

  /* prepare the source address */
  sp = (mlib_d64 *) ((mlib_addr) sa & (~7));
  soff = ((mlib_addr) sa & 7);

  /* prepare the destination addresses */
  dp = (mlib_d64 *) ((mlib_addr) da & (~7));
  doff = ((mlib_addr) da & 7);
  dend = da + dsize * 4 - 1;            /* last byte of the dst line */
  dend2 = dend - 31;                    /* last start of a full 32-byte group */

  /*
   * dp is aligned down from da, so pixel boundaries inside the aligned
   * word are shifted by doff % 4 bytes; shift the byte mask to match.
   */
  bmask = (bmask >> (doff % 4)) & 0xff;

  if (doff == 0) {                      /* dst is 8-byte aligned */

    vis_alignaddr((void *)0, soff);
    sd0 = *sp++;
    sd1 = *sp++;
    sd = vis_faligndata(sd0, sd1);      /* the intermediate is aligned */

    INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

    emask = vis_edge8(da, dend);
    vis_pst_8(dd0, dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) { /* for very small size */
      emask = vis_edge8(dp, dend);
      vis_pst_8(dd1, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd2, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd3, dp++, emask & bmask);
        }
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 32 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        sd0 = sd1;
        sd1 = *sp++;
        sd = vis_faligndata(sd0, sd1);

        INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

        vis_pst_8(dd0, dp++, bmask);
        vis_pst_8(dd1, dp++, bmask);
        vis_pst_8(dd2, dp++, bmask);
        vis_pst_8(dd3, dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      sd0 = sd1;
      sd1 = *sp++;
      sd = vis_faligndata(sd0, sd1);

      INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

      emask = vis_edge8(dp, dend);
      vis_pst_8(dd0, dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(dd1, dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(dd2, dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend) {
            emask = vis_edge8(dp, dend);
            vis_pst_8(dd3, dp++, emask & bmask);
          }
        }
      }
    }
  }
  else {                                /* dst is not 8-byte aligned */
    vis_alignaddr((void *)0, soff);
    sd0 = *sp++;
    sd1 = *sp++;
    sd = vis_faligndata(sd0, sd1);      /* the intermediate is aligned */

    INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

    /* from here on faligndata shifts the expanded data to dst alignment */
    vis_alignaddr((void *)0, -doff);

    emask = vis_edge8(da, dend);
    vis_pst_8(vis_faligndata(dd0, dd0), dp++, emask & bmask);
    if ((mlib_addr) dp <= (mlib_addr) dend) { /* for very small size */
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
        }
      }
    }

    if ((mlib_addr) dp <= (mlib_addr) dend2) {
      n = ((mlib_u8 *) dend2 - (mlib_u8 *) dp) / 32 + 1;

      /* 8-pixel column loop, emask not needed */
#pragma pipeloop(0)
      for (i = 0; i < n; i++) {
        dd4 = dd3;                      /* carry the previous tail word */

        /* the alignment register is shared: switch back to src offset */
        vis_alignaddr((void *)0, soff);
        sd0 = sd1;
        sd1 = *sp++;
        sd = vis_faligndata(sd0, sd1);

        INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

        vis_alignaddr((void *)0, -doff);
        vis_pst_8(vis_faligndata(dd4, dd0), dp++, bmask);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, bmask);
        vis_pst_8(vis_faligndata(dd1, dd2), dp++, bmask);
        vis_pst_8(vis_faligndata(dd2, dd3), dp++, bmask);
      }
    }

    /* end point handling */
    if ((mlib_addr) dp <= (mlib_addr) dend) {
      dd4 = dd3;

      vis_alignaddr((void *)0, soff);
      sd0 = sd1;
      sd1 = *sp++;
      sd = vis_faligndata(sd0, sd1);

      INSERT_U8_14(sd, dd0, dd1, dd2, dd3);

      vis_alignaddr((void *)0, -doff);
      emask = vis_edge8(dp, dend);
      vis_pst_8(vis_faligndata(dd4, dd0), dp++, emask & bmask);
      if ((mlib_addr) dp <= (mlib_addr) dend) {
        emask = vis_edge8(dp, dend);
        vis_pst_8(vis_faligndata(dd0, dd1), dp++, emask & bmask);
        if ((mlib_addr) dp <= (mlib_addr) dend) {
          emask = vis_edge8(dp, dend);
          vis_pst_8(vis_faligndata(dd1, dd2), dp++, emask & bmask);
          if ((mlib_addr) dp <= (mlib_addr) dend) {
            emask = vis_edge8(dp, dend);
            vis_pst_8(vis_faligndata(dd2, dd3), dp++, emask & bmask);
          }
        }
      }
    }
  }
}
560
561
562 /***************************************************************/
563