// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "mat.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include "platform.h"

namespace ncnn {

#if NCNN_PIXEL_ROTATE
// should be a kanna ascii art here in my local branch
// but we shall ask the original art author for permission first ...
// https://www.reddit.com/r/anime/comments/5uxjn4/i_recreated_the_kanna_ascii_art_from_kobayashisan/

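// Rotation helpers. The numeric suffix appears to follow the EXIF orientation
// convention: 1 = identity copy, 2 = horizontal flip (mirror), 3 = rotate 180,
// 4 = vertical flip, 5 = transpose. The _cN suffix is the number of interleaved
// channels per pixel (1/2/3/4). All variants share the same shape: a NEON bulk
// path (intrinsics on aarch64, inline asm on armv7) plus a scalar tail for the
// leftover pixels, and src and dst each honour their own row stride.
//
// Minimal usage sketch for the helpers below (assuming a tightly packed
// single-channel 64x48 image, i.e. stride == width):
//
//   unsigned char in[64 * 48];
//   unsigned char out[64 * 48];
//   kanna_rotate_1_c1(in, 64, 48, 64, out, 64, 48, 64); // plain copy
//   kanna_rotate_2_c1(in, 64, 48, 64, out, 64, 48, 64); // mirror each row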
static void kanna_rotate_1_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
29 {
30 const int srcwgap = srcstride - srcw;
31 const int wgap = stride - w;
32
33 const unsigned char* src0 = src;
34 const unsigned char* src1 = src + srcstride;
35 unsigned char* dst0 = dst;
36 unsigned char* dst1 = dst + stride;
37
38 int y = 0;
39 for (; y + 1 < srch; y += 2)
40 {
41 #if __ARM_NEON
42 int nn = srcw >> 5;
43 int remain = srcw - (nn << 5);
44 #if __aarch64__
45 for (; nn > 0; nn--)
46 {
47 uint8x16_t _src0 = vld1q_u8(src0);
48 uint8x16_t _src0n = vld1q_u8(src0 + 16);
49 vst1q_u8(dst0, _src0);
50 vst1q_u8(dst0 + 16, _src0n);
51
52 uint8x16_t _src1 = vld1q_u8(src1);
53 uint8x16_t _src1n = vld1q_u8(src1 + 16);
54 vst1q_u8(dst1, _src1);
55 vst1q_u8(dst1 + 16, _src1n);
56
57 src0 += 32;
58 src1 += 32;
59 dst0 += 32;
60 dst1 += 32;
61 }
62 #else
63 if (nn > 0)
64 {
65 asm volatile(
66 "0: \n"
67 "pld [%1, #256] \n"
68 "vld1.u8 {d0-d3}, [%1]! \n"
69 "pld [%2, #256] \n"
70 "vld1.u8 {d4-d7}, [%2]! \n"
71 "subs %0, #1 \n"
72 "vst1.u8 {d0-d3}, [%3]! \n"
73 "vst1.u8 {d4-d7}, [%4]! \n"
74 "bne 0b \n"
75 : "=r"(nn), // %0
76 "=r"(src0), // %1
77 "=r"(src1), // %2
78 "=r"(dst0), // %3
79 "=r"(dst1) // %4
80 : "0"(nn),
81 "1"(src0),
82 "2"(src1),
83 "3"(dst0),
84 "4"(dst1)
85 : "cc", "memory", "q0", "q1", "q2", "q3");
86 }
87 #endif // __aarch64__
88 #else
89 int remain = srcw;
90 #endif // __ARM_NEON
91
92 for (; remain > 0; remain--)
93 {
94 *dst0++ = *src0++;
95 *dst1++ = *src1++;
96 }
97
98 src0 += srcwgap + srcstride;
99 src1 += srcwgap + srcstride;
100 dst0 += wgap + stride;
101 dst1 += wgap + stride;
102 }
103
104 for (; y < srch; y++)
105 {
106 #if __ARM_NEON
107 int nn = srcw >> 5;
108 int remain = srcw - (nn << 5);
109 #if __aarch64__
110 for (; nn > 0; nn--)
111 {
112 uint8x16_t _src = vld1q_u8(src0);
113 uint8x16_t _src2 = vld1q_u8(src0 + 16);
114 vst1q_u8(dst0, _src);
115 vst1q_u8(dst0 + 16, _src2);
116
117 src0 += 32;
118 dst0 += 32;
119 }
120 #else
121 if (nn > 0)
122 {
123 asm volatile(
124 "0: \n"
125 "pld [%1, #256] \n"
126 "vld1.u8 {d0-d3}, [%1]! \n"
127 "subs %0, #1 \n"
128 "vst1.u8 {d0-d3}, [%2]! \n"
129 "bne 0b \n"
130 : "=r"(nn), // %0
131 "=r"(src0), // %1
132 "=r"(dst0) // %2
133 : "0"(nn),
134 "1"(src0),
135 "2"(dst0)
136 : "cc", "memory", "q0", "q1");
137 }
138 #endif // __aarch64__
139 #else
140 int remain = srcw;
141 #endif // __ARM_NEON
142
143 for (; remain > 0; remain--)
144 {
145 *dst0++ = *src0++;
146 }
147
148 src0 += srcwgap;
149 dst0 += wgap;
150 }
151 }
152
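// The c2/c3/c4 variants of the plain copy are identical to the c1 version above
// except that the per-row byte count becomes size = srcw * channels.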
static void kanna_rotate_1_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
154 {
155 const int srcwgap = srcstride - srcw * 2;
156 const int wgap = stride - w * 2;
157
158 int size = srcw * 2;
159
160 const unsigned char* src0 = src;
161 const unsigned char* src1 = src + srcstride;
162 unsigned char* dst0 = dst;
163 unsigned char* dst1 = dst + stride;
164
165 int y = 0;
166 for (; y + 1 < srch; y += 2)
167 {
168 #if __ARM_NEON
169 int nn = size >> 5;
170 int remain = size - (nn << 5);
171 #if __aarch64__
172 for (; nn > 0; nn--)
173 {
174 uint8x16_t _src0 = vld1q_u8(src0);
175 uint8x16_t _src0n = vld1q_u8(src0 + 16);
176 vst1q_u8(dst0, _src0);
177 vst1q_u8(dst0 + 16, _src0n);
178
179 uint8x16_t _src1 = vld1q_u8(src1);
180 uint8x16_t _src1n = vld1q_u8(src1 + 16);
181 vst1q_u8(dst1, _src1);
182 vst1q_u8(dst1 + 16, _src1n);
183
184 src0 += 32;
185 src1 += 32;
186 dst0 += 32;
187 dst1 += 32;
188 }
189 #else
190 if (nn > 0)
191 {
192 asm volatile(
193 "0: \n"
194 "pld [%1, #256] \n"
195 "vld1.u8 {d0-d3}, [%1]! \n"
196 "pld [%2, #256] \n"
197 "vld1.u8 {d4-d7}, [%2]! \n"
198 "subs %0, #1 \n"
199 "vst1.u8 {d0-d3}, [%3]! \n"
200 "vst1.u8 {d4-d7}, [%4]! \n"
201 "bne 0b \n"
202 : "=r"(nn), // %0
203 "=r"(src0), // %1
204 "=r"(src1), // %2
205 "=r"(dst0), // %3
206 "=r"(dst1) // %4
207 : "0"(nn),
208 "1"(src0),
209 "2"(src1),
210 "3"(dst0),
211 "4"(dst1)
212 : "cc", "memory", "q0", "q1", "q2", "q3");
213 }
214 #endif // __aarch64__
215 #else
216 int remain = size;
217 #endif // __ARM_NEON
218
219 for (; remain > 0; remain--)
220 {
221 *dst0++ = *src0++;
222 *dst1++ = *src1++;
223 }
224
225 src0 += srcwgap + srcstride;
226 src1 += srcwgap + srcstride;
227 dst0 += wgap + stride;
228 dst1 += wgap + stride;
229 }
230
231 for (; y < srch; y++)
232 {
233 #if __ARM_NEON
234 int nn = size >> 5;
235 int remain = size - (nn << 5);
236 #if __aarch64__
237 for (; nn > 0; nn--)
238 {
239 uint8x16_t _src = vld1q_u8(src0);
240 uint8x16_t _src2 = vld1q_u8(src0 + 16);
241 vst1q_u8(dst0, _src);
242 vst1q_u8(dst0 + 16, _src2);
243
244 src0 += 32;
245 dst0 += 32;
246 }
247 #else
248 if (nn > 0)
249 {
250 asm volatile(
251 "0: \n"
252 "pld [%1, #256] \n"
253 "vld1.u8 {d0-d3}, [%1]! \n"
254 "subs %0, #1 \n"
255 "vst1.u8 {d0-d3}, [%2]! \n"
256 "bne 0b \n"
257 : "=r"(nn), // %0
258 "=r"(src0), // %1
259 "=r"(dst0) // %2
260 : "0"(nn),
261 "1"(src0),
262 "2"(dst0)
263 : "cc", "memory", "q0", "q1");
264 }
265 #endif // __aarch64__
266 #else
267 int remain = size;
268 #endif // __ARM_NEON
269
270 for (; remain > 0; remain--)
271 {
272 *dst0++ = *src0++;
273 }
274
275 src0 += srcwgap;
276 dst0 += wgap;
277 }
278 }
279
static void kanna_rotate_1_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
281 {
282 const int srcwgap = srcstride - srcw * 3;
283 const int wgap = stride - w * 3;
284
285 int size = srcw * 3;
286
287 const unsigned char* src0 = src;
288 const unsigned char* src1 = src + srcstride;
289 unsigned char* dst0 = dst;
290 unsigned char* dst1 = dst + stride;
291
292 int y = 0;
293 for (; y + 1 < srch; y += 2)
294 {
295 #if __ARM_NEON
296 int nn = size >> 5;
297 int remain = size - (nn << 5);
298 #if __aarch64__
299 for (; nn > 0; nn--)
300 {
301 uint8x16_t _src0 = vld1q_u8(src0);
302 uint8x16_t _src0n = vld1q_u8(src0 + 16);
303 vst1q_u8(dst0, _src0);
304 vst1q_u8(dst0 + 16, _src0n);
305
306 uint8x16_t _src1 = vld1q_u8(src1);
307 uint8x16_t _src1n = vld1q_u8(src1 + 16);
308 vst1q_u8(dst1, _src1);
309 vst1q_u8(dst1 + 16, _src1n);
310
311 src0 += 32;
312 src1 += 32;
313 dst0 += 32;
314 dst1 += 32;
315 }
316 #else
317 if (nn > 0)
318 {
319 asm volatile(
320 "0: \n"
321 "pld [%1, #256] \n"
322 "vld1.u8 {d0-d3}, [%1]! \n"
323 "pld [%2, #256] \n"
324 "vld1.u8 {d4-d7}, [%2]! \n"
325 "subs %0, #1 \n"
326 "vst1.u8 {d0-d3}, [%3]! \n"
327 "vst1.u8 {d4-d7}, [%4]! \n"
328 "bne 0b \n"
329 : "=r"(nn), // %0
330 "=r"(src0), // %1
331 "=r"(src1), // %2
332 "=r"(dst0), // %3
333 "=r"(dst1) // %4
334 : "0"(nn),
335 "1"(src0),
336 "2"(src1),
337 "3"(dst0),
338 "4"(dst1)
339 : "cc", "memory", "q0", "q1", "q2", "q3");
340 }
341 #endif // __aarch64__
342 #else
343 int remain = size;
344 #endif // __ARM_NEON
345
346 for (; remain > 0; remain--)
347 {
348 *dst0++ = *src0++;
349 *dst1++ = *src1++;
350 }
351
352 src0 += srcwgap + srcstride;
353 src1 += srcwgap + srcstride;
354 dst0 += wgap + stride;
355 dst1 += wgap + stride;
356 }
357
358 for (; y < srch; y++)
359 {
360 #if __ARM_NEON
361 int nn = size >> 5;
362 int remain = size - (nn << 5);
363 #if __aarch64__
364 for (; nn > 0; nn--)
365 {
366 uint8x16_t _src = vld1q_u8(src0);
367 uint8x16_t _src2 = vld1q_u8(src0 + 16);
368 vst1q_u8(dst0, _src);
369 vst1q_u8(dst0 + 16, _src2);
370
371 src0 += 32;
372 dst0 += 32;
373 }
374 #else
375 if (nn > 0)
376 {
377 asm volatile(
378 "0: \n"
379 "pld [%1, #256] \n"
380 "vld1.u8 {d0-d3}, [%1]! \n"
381 "subs %0, #1 \n"
382 "vst1.u8 {d0-d3}, [%2]! \n"
383 "bne 0b \n"
384 : "=r"(nn), // %0
385 "=r"(src0), // %1
386 "=r"(dst0) // %2
387 : "0"(nn),
388 "1"(src0),
389 "2"(dst0)
390 : "cc", "memory", "q0", "q1");
391 }
392 #endif // __aarch64__
393 #else
394 int remain = size;
395 #endif // __ARM_NEON
396
397 for (; remain > 0; remain--)
398 {
399 *dst0++ = *src0++;
400 }
401
402 src0 += srcwgap;
403 dst0 += wgap;
404 }
405 }
406
static void kanna_rotate_1_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
408 {
409 const int srcwgap = srcstride - srcw * 4;
410 const int wgap = stride - w * 4;
411
412 int size = srcw * 4;
413
414 const unsigned char* src0 = src;
415 const unsigned char* src1 = src + srcstride;
416 unsigned char* dst0 = dst;
417 unsigned char* dst1 = dst + stride;
418
419 int y = 0;
420 for (; y + 1 < srch; y += 2)
421 {
422 #if __ARM_NEON
423 int nn = size >> 5;
424 int remain = size - (nn << 5);
425 #if __aarch64__
426 for (; nn > 0; nn--)
427 {
428 uint8x16_t _src0 = vld1q_u8(src0);
429 uint8x16_t _src0n = vld1q_u8(src0 + 16);
430 vst1q_u8(dst0, _src0);
431 vst1q_u8(dst0 + 16, _src0n);
432
433 uint8x16_t _src1 = vld1q_u8(src1);
434 uint8x16_t _src1n = vld1q_u8(src1 + 16);
435 vst1q_u8(dst1, _src1);
436 vst1q_u8(dst1 + 16, _src1n);
437
438 src0 += 32;
439 src1 += 32;
440 dst0 += 32;
441 dst1 += 32;
442 }
443 #else
444 if (nn > 0)
445 {
446 asm volatile(
447 "0: \n"
448 "pld [%1, #256] \n"
449 "vld1.u8 {d0-d3}, [%1]! \n"
450 "pld [%2, #256] \n"
451 "vld1.u8 {d4-d7}, [%2]! \n"
452 "subs %0, #1 \n"
453 "vst1.u8 {d0-d3}, [%3]! \n"
454 "vst1.u8 {d4-d7}, [%4]! \n"
455 "bne 0b \n"
456 : "=r"(nn), // %0
457 "=r"(src0), // %1
458 "=r"(src1), // %2
459 "=r"(dst0), // %3
460 "=r"(dst1) // %4
461 : "0"(nn),
462 "1"(src0),
463 "2"(src1),
464 "3"(dst0),
465 "4"(dst1)
466 : "cc", "memory", "q0", "q1", "q2", "q3");
467 }
468 #endif // __aarch64__
469 #else
470 int remain = size;
471 #endif // __ARM_NEON
472
473 for (; remain > 0; remain--)
474 {
475 *dst0++ = *src0++;
476 *dst1++ = *src1++;
477 }
478
479 src0 += srcwgap + srcstride;
480 src1 += srcwgap + srcstride;
481 dst0 += wgap + stride;
482 dst1 += wgap + stride;
483 }
484
485 for (; y < srch; y++)
486 {
487 #if __ARM_NEON
488 int nn = size >> 5;
489 int remain = size - (nn << 5);
490 #if __aarch64__
491 for (; nn > 0; nn--)
492 {
493 uint8x16_t _src = vld1q_u8(src0);
494 uint8x16_t _src2 = vld1q_u8(src0 + 16);
495 vst1q_u8(dst0, _src);
496 vst1q_u8(dst0 + 16, _src2);
497
498 src0 += 32;
499 dst0 += 32;
500 }
501 #else
502 if (nn > 0)
503 {
504 asm volatile(
505 "0: \n"
506 "pld [%1, #256] \n"
507 "vld1.u8 {d0-d3}, [%1]! \n"
508 "subs %0, #1 \n"
509 "vst1.u8 {d0-d3}, [%2]! \n"
510 "bne 0b \n"
511 : "=r"(nn), // %0
512 "=r"(src0), // %1
513 "=r"(dst0) // %2
514 : "0"(nn),
515 "1"(src0),
516 "2"(dst0)
517 : "cc", "memory", "q0", "q1");
518 }
519 #endif // __aarch64__
520 #else
521 int remain = size;
522 #endif // __ARM_NEON
523
524 for (; remain > 0; remain--)
525 {
526 *dst0++ = *src0++;
527 }
528
529 src0 += srcwgap;
530 dst0 += wgap;
531 }
532 }
533
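// kanna_rotate_2_cN: horizontal flip (mirror). Each source row is written into the
// same destination row back to front: the NEON path reverses pixel order with vrev64
// and steps dst0 backwards (negative post-increment via r4 on armv7), while the
// scalar tail copies one pixel at a time and decrements dst0.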
static void kanna_rotate_2_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
535 {
536 const int srcwgap = srcstride - srcw;
537 const int wgap = stride + w;
538
539 const unsigned char* src0 = src;
540 unsigned char* dst0 = dst + w - 1;
541
542 int y = 0;
543 for (; y < srch; y++)
544 {
545 #if __ARM_NEON
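// the vector stores still write forwards, so bias dst0 back by 15 bytes here
// (and undo it after the loop) so that each 16-byte store ends at the current tail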
546 dst0 -= 15;
547
548 int nn = srcw >> 4;
549 int remain = srcw - (nn << 4);
550
551 #if __aarch64__
552 for (; nn > 0; nn--)
553 {
554 uint8x8_t _src = vld1_u8(src0);
555 uint8x8_t _src2 = vld1_u8(src0 + 8);
556
557 _src = vrev64_u8(_src);
558 _src2 = vrev64_u8(_src2);
559
560 vst1_u8(dst0, _src2);
561 vst1_u8(dst0 + 8, _src);
562
563 src0 += 16;
564 dst0 -= 16;
565 }
566 #else
567 if (nn > 0)
568 {
569 asm volatile(
570 "mov r4, #-16 \n"
571 "0: \n"
572 "pld [%1, #128] \n"
573 "vld1.u8 {d0-d1}, [%1]! \n"
574 "vrev64.u8 d3, d0 \n"
575 "vrev64.u8 d2, d1 \n"
576 "subs %0, #1 \n"
577 "vst1.u8 {d2-d3}, [%2], r4 \n"
578 "bne 0b \n"
579 : "=r"(nn), // %0
580 "=r"(src0), // %1
581 "=r"(dst0) // %2
582 : "0"(nn),
583 "1"(src0),
584 "2"(dst0)
585 : "cc", "memory", "q0", "q1", "r4");
586 }
587 #endif // __aarch64__
588
589 dst0 += 15;
590 #else
591 int remain = srcw;
592 #endif // __ARM_NEON
593
594 for (; remain > 0; remain--)
595 {
596 *dst0 = *src0;
597
598 src0 += 1;
599 dst0 -= 1;
600 }
601
602 src0 += srcwgap;
603 dst0 += wgap;
604 }
605 }
606
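// The c2/c3/c4 mirror variants use de-interleaving vld2/vld3/vld4 loads and reverse
// each channel plane with vrev64, so pixel order is reversed without swapping the
// channels; dst0 is pre-biased by 7 pixels because each vector store covers 8 pixels.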
static void kanna_rotate_2_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
608 {
609 const int srcwgap = srcstride - srcw * 2;
610 const int wgap = stride + w * 2;
611
612 const unsigned char* src0 = src;
613 unsigned char* dst0 = dst + w * 2 - 2;
614
615 int y = 0;
616 for (; y < srch; y++)
617 {
618 #if __ARM_NEON
619 dst0 -= 7 * 2;
620
621 int nn = srcw >> 4;
622 int remain = srcw - (nn << 4);
623
624 #if __aarch64__
625 for (; nn > 0; nn--)
626 {
627 uint8x8x2_t _src = vld2_u8(src0);
628 uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);
629
630 _src.val[0] = vrev64_u8(_src.val[0]);
631 _src.val[1] = vrev64_u8(_src.val[1]);
632
633 _src2.val[0] = vrev64_u8(_src2.val[0]);
634 _src2.val[1] = vrev64_u8(_src2.val[1]);
635
636 vst2_u8(dst0, _src);
637 vst2_u8(dst0 - 8 * 2, _src2);
638
639 src0 += 16 * 2;
640 dst0 -= 16 * 2;
641 }
642 #else
643 if (nn > 0)
644 {
645 asm volatile(
646 "mov r4, #-16 \n"
647 "0: \n"
648 "pld [%1, #128] \n"
649 "vld2.u8 {d0-d1}, [%1]! \n"
650 "vrev64.u8 d0, d0 \n"
651 "pld [%1, #128] \n"
652 "vld2.u8 {d2-d3}, [%1]! \n"
653 "vrev64.u8 d1, d1 \n"
654 "vrev64.u8 d2, d2 \n"
655 "vst2.u8 {d0-d1}, [%2], r4 \n"
656 "vrev64.u8 d3, d3 \n"
657 "subs %0, #1 \n"
658 "vst2.u8 {d2-d3}, [%2], r4 \n"
659 "bne 0b \n"
660 : "=r"(nn), // %0
661 "=r"(src0), // %1
662 "=r"(dst0) // %2
663 : "0"(nn),
664 "1"(src0),
665 "2"(dst0)
666 : "cc", "memory", "q0", "q1", "r4");
667 }
668 #endif // __aarch64__
669
670 dst0 += 7 * 2;
671 #else
672 int remain = srcw;
673 #endif // __ARM_NEON
674
675 for (; remain > 0; remain--)
676 {
677 dst0[0] = src0[0];
678 dst0[1] = src0[1];
679
680 src0 += 2;
681 dst0 -= 2;
682 }
683
684 src0 += srcwgap;
685 dst0 += wgap;
686 }
687 }
688
static void kanna_rotate_2_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
690 {
691 const int srcwgap = srcstride - srcw * 3;
692 const int wgap = stride + w * 3;
693
694 const unsigned char* src0 = src;
695 unsigned char* dst0 = dst + w * 3 - 3;
696
697 int y = 0;
698 for (; y < srch; y++)
699 {
700 #if __ARM_NEON
701 dst0 -= 7 * 3;
702
703 int nn = srcw >> 4;
704 int remain = srcw - (nn << 4);
705
706 #if __aarch64__
707 for (; nn > 0; nn--)
708 {
709 uint8x8x3_t _src = vld3_u8(src0);
710 uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);
711
712 _src.val[0] = vrev64_u8(_src.val[0]);
713 _src.val[1] = vrev64_u8(_src.val[1]);
714 _src.val[2] = vrev64_u8(_src.val[2]);
715
716 _src2.val[0] = vrev64_u8(_src2.val[0]);
717 _src2.val[1] = vrev64_u8(_src2.val[1]);
718 _src2.val[2] = vrev64_u8(_src2.val[2]);
719
720 vst3_u8(dst0, _src);
721 vst3_u8(dst0 - 8 * 3, _src2);
722
723 src0 += 16 * 3;
724 dst0 -= 16 * 3;
725 }
726 #else
727 if (nn > 0)
728 {
729 asm volatile(
730 "mov r4, #-24 \n"
731 "0: \n"
732 "pld [%1, #192] \n"
733 "vld3.u8 {d0-d2}, [%1]! \n"
734 "vrev64.u8 d0, d0 \n"
735 "vrev64.u8 d1, d1 \n"
736 "pld [%1, #192] \n"
737 "vld3.u8 {d4-d6}, [%1]! \n"
738 "vrev64.u8 d2, d2 \n"
739 "vrev64.u8 d4, d4 \n"
740 "vst3.u8 {d0-d2}, [%2], r4 \n"
741 "vrev64.u8 d5, d5 \n"
742 "vrev64.u8 d6, d6 \n"
743 "subs %0, #1 \n"
744 "vst3.u8 {d4-d6}, [%2], r4 \n"
745 "bne 0b \n"
746 : "=r"(nn), // %0
747 "=r"(src0), // %1
748 "=r"(dst0) // %2
749 : "0"(nn),
750 "1"(src0),
751 "2"(dst0)
752 : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
753 }
754 #endif // __aarch64__
755
756 dst0 += 7 * 3;
757 #else
758 int remain = srcw;
759 #endif // __ARM_NEON
760
761 for (; remain > 0; remain--)
762 {
763 dst0[0] = src0[0];
764 dst0[1] = src0[1];
765 dst0[2] = src0[2];
766
767 src0 += 3;
768 dst0 -= 3;
769 }
770
771 src0 += srcwgap;
772 dst0 += wgap;
773 }
774 }
775
static void kanna_rotate_2_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
777 {
778 const int srcwgap = srcstride - srcw * 4;
779 const int wgap = stride + w * 4;
780
781 const unsigned char* src0 = src;
782 unsigned char* dst0 = dst + w * 4 - 4;
783
784 int y = 0;
785 for (; y < srch; y++)
786 {
787 #if __ARM_NEON
788 dst0 -= 7 * 4;
789
790 int nn = srcw >> 4;
791 int remain = srcw - (nn << 4);
792
793 #if __aarch64__
794 for (; nn > 0; nn--)
795 {
796 uint8x8x4_t _src = vld4_u8(src0);
797 uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);
798
799 _src.val[0] = vrev64_u8(_src.val[0]);
800 _src.val[1] = vrev64_u8(_src.val[1]);
801 _src.val[2] = vrev64_u8(_src.val[2]);
802 _src.val[3] = vrev64_u8(_src.val[3]);
803
804 _src2.val[0] = vrev64_u8(_src2.val[0]);
805 _src2.val[1] = vrev64_u8(_src2.val[1]);
806 _src2.val[2] = vrev64_u8(_src2.val[2]);
807 _src2.val[3] = vrev64_u8(_src2.val[3]);
808
809 vst4_u8(dst0, _src);
810 vst4_u8(dst0 - 8 * 4, _src2);
811
812 src0 += 16 * 4;
813 dst0 -= 16 * 4;
814 }
815 #else
816 if (nn > 0)
817 {
818 asm volatile(
819 "mov r4, #-32 \n"
820 "0: \n"
821 "pld [%1, #256] \n"
822 "vld4.u8 {d0-d3}, [%1]! \n"
823 "vrev64.u8 d0, d0 \n"
824 "vrev64.u8 d1, d1 \n"
825 "vrev64.u8 d2, d2 \n"
826 "pld [%1, #256] \n"
827 "vld4.u8 {d4-d7}, [%1]! \n"
828 "vrev64.u8 d3, d3 \n"
829 "vrev64.u8 d4, d4 \n"
830 "vrev64.u8 d5, d5 \n"
831 "vst4.u8 {d0-d3}, [%2], r4 \n"
832 "vrev64.u8 d6, d6 \n"
833 "vrev64.u8 d7, d7 \n"
834 "subs %0, #1 \n"
835 "vst4.u8 {d4-d7}, [%2], r4 \n"
836 "bne 0b \n"
837 : "=r"(nn), // %0
838 "=r"(src0), // %1
839 "=r"(dst0) // %2
840 : "0"(nn),
841 "1"(src0),
842 "2"(dst0)
843 : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
844 }
845 #endif // __aarch64__
846
847 dst0 += 7 * 4;
848 #else
849 int remain = srcw;
850 #endif // __ARM_NEON
851
852 for (; remain > 0; remain--)
853 {
854 dst0[0] = src0[0];
855 dst0[1] = src0[1];
856 dst0[2] = src0[2];
857 dst0[3] = src0[3];
858
859 src0 += 4;
860 dst0 -= 4;
861 }
862
863 src0 += srcwgap;
864 dst0 += wgap;
865 }
866 }
867
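// kanna_rotate_3_cN: rotate 180 degrees. Same per-row pixel reversal as the mirror
// above, but writing starts at the very last destination pixel and the destination
// rows are walked bottom-up (dst0 -= wgap at the end of each row).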
static void kanna_rotate_3_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
869 {
870 const int srcwgap = srcstride - srcw;
871 const int wgap = stride - w;
872
873 // point to the last dst pixel
874 unsigned char* dstend = dst + stride * h - wgap;
875
876 const unsigned char* src0 = src;
877 unsigned char* dst0 = dstend - 1;
878
879 int y = 0;
880 for (; y < srch; y++)
881 {
882 #if __ARM_NEON
883 dst0 -= 15;
884
885 int nn = srcw >> 4;
886 int remain = srcw - (nn << 4);
887
888 #if __aarch64__
889 for (; nn > 0; nn--)
890 {
891 uint8x8_t _src = vld1_u8(src0);
892 uint8x8_t _src2 = vld1_u8(src0 + 8);
893
894 _src = vrev64_u8(_src);
895 _src2 = vrev64_u8(_src2);
896
897 vst1_u8(dst0, _src2);
898 vst1_u8(dst0 + 8, _src);
899
900 src0 += 16;
901 dst0 -= 16;
902 }
903 #else
904 if (nn > 0)
905 {
906 asm volatile(
907 "mov r4, #-16 \n"
908 "0: \n"
909 "pld [%1, #128] \n"
910 "vld1.u8 {d0-d1}, [%1]! \n"
911 "vrev64.u8 d3, d0 \n"
912 "vrev64.u8 d2, d1 \n"
913 "subs %0, #1 \n"
914 "vst1.u8 {d2-d3}, [%2], r4 \n"
915 "bne 0b \n"
916 : "=r"(nn), // %0
917 "=r"(src0), // %1
918 "=r"(dst0) // %2
919 : "0"(nn),
920 "1"(src0),
921 "2"(dst0)
922 : "cc", "memory", "q0", "q1", "r4");
923 }
924 #endif // __aarch64__
925
926 dst0 += 15;
927 #else
928 int remain = srcw;
929 #endif // __ARM_NEON
930
931 for (; remain > 0; remain--)
932 {
933 *dst0 = *src0;
934
935 src0 += 1;
936 dst0 -= 1;
937 }
938
939 src0 += srcwgap;
940 dst0 -= wgap;
941 }
942 }
943
static void kanna_rotate_3_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
945 {
946 const int srcwgap = srcstride - srcw * 2;
947 const int wgap = stride - w * 2;
948
949 // point to the last dst pixel
950 unsigned char* dstend = dst + stride * h - wgap;
951
952 const unsigned char* src0 = src;
953 unsigned char* dst0 = dstend - 2;
954
955 int y = 0;
956 for (; y < srch; y++)
957 {
958 #if __ARM_NEON
959 dst0 -= 7 * 2;
960
961 int nn = srcw >> 4;
962 int remain = srcw - (nn << 4);
963
964 #if __aarch64__
965 for (; nn > 0; nn--)
966 {
967 uint8x8x2_t _src = vld2_u8(src0);
968 uint8x8x2_t _src2 = vld2_u8(src0 + 8 * 2);
969
970 _src.val[0] = vrev64_u8(_src.val[0]);
971 _src.val[1] = vrev64_u8(_src.val[1]);
972
973 _src2.val[0] = vrev64_u8(_src2.val[0]);
974 _src2.val[1] = vrev64_u8(_src2.val[1]);
975
976 vst2_u8(dst0, _src);
977 vst2_u8(dst0 - 8 * 2, _src2);
978
979 src0 += 16 * 2;
980 dst0 -= 16 * 2;
981 }
982 #else
983 if (nn > 0)
984 {
985 asm volatile(
986 "mov r4, #-16 \n"
987 "0: \n"
988 "pld [%1, #128] \n"
989 "vld2.u8 {d0-d1}, [%1]! \n"
990 "vrev64.u8 d0, d0 \n"
991 "pld [%1, #128] \n"
992 "vld2.u8 {d2-d3}, [%1]! \n"
993 "vrev64.u8 d1, d1 \n"
994 "vrev64.u8 d2, d2 \n"
995 "vst2.u8 {d0-d1}, [%2], r4 \n"
996 "vrev64.u8 d3, d3 \n"
997 "subs %0, #1 \n"
998 "vst2.u8 {d2-d3}, [%2], r4 \n"
999 "bne 0b \n"
1000 : "=r"(nn), // %0
1001 "=r"(src0), // %1
1002 "=r"(dst0) // %2
1003 : "0"(nn),
1004 "1"(src0),
1005 "2"(dst0)
1006 : "cc", "memory", "q0", "q1", "r4");
1007 }
1008 #endif // __aarch64__
1009
1010 dst0 += 7 * 2;
1011 #else
1012 int remain = srcw;
1013 #endif // __ARM_NEON
1014
1015 for (; remain > 0; remain--)
1016 {
1017 dst0[0] = src0[0];
1018 dst0[1] = src0[1];
1019
1020 src0 += 2;
1021 dst0 -= 2;
1022 }
1023
1024 src0 += srcwgap;
1025 dst0 -= wgap;
1026 }
1027 }
1028
static void kanna_rotate_3_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
1030 {
1031 const int srcwgap = srcstride - srcw * 3;
1032 const int wgap = stride - w * 3;
1033
1034 // point to the last dst pixel
1035 unsigned char* dstend = dst + stride * h - wgap;
1036
1037 const unsigned char* src0 = src;
1038 unsigned char* dst0 = dstend - 3;
1039
1040 int y = 0;
1041 for (; y < srch; y++)
1042 {
1043 #if __ARM_NEON
1044 dst0 -= 7 * 3;
1045
1046 int nn = srcw >> 4;
1047 int remain = srcw - (nn << 4);
1048
1049 #if __aarch64__
1050 for (; nn > 0; nn--)
1051 {
1052 uint8x8x3_t _src = vld3_u8(src0);
1053 uint8x8x3_t _src2 = vld3_u8(src0 + 8 * 3);
1054
1055 _src.val[0] = vrev64_u8(_src.val[0]);
1056 _src.val[1] = vrev64_u8(_src.val[1]);
1057 _src.val[2] = vrev64_u8(_src.val[2]);
1058
1059 _src2.val[0] = vrev64_u8(_src2.val[0]);
1060 _src2.val[1] = vrev64_u8(_src2.val[1]);
1061 _src2.val[2] = vrev64_u8(_src2.val[2]);
1062
1063 vst3_u8(dst0, _src);
1064 vst3_u8(dst0 - 8 * 3, _src2);
1065
1066 src0 += 16 * 3;
1067 dst0 -= 16 * 3;
1068 }
1069 #else
1070 if (nn > 0)
1071 {
1072 asm volatile(
1073 "mov r4, #-24 \n"
1074 "0: \n"
1075 "pld [%1, #192] \n"
1076 "vld3.u8 {d0-d2}, [%1]! \n"
1077 "vrev64.u8 d0, d0 \n"
1078 "vrev64.u8 d1, d1 \n"
1079 "pld [%1, #192] \n"
1080 "vld3.u8 {d4-d6}, [%1]! \n"
1081 "vrev64.u8 d2, d2 \n"
1082 "vrev64.u8 d4, d4 \n"
1083 "vst3.u8 {d0-d2}, [%2], r4 \n"
1084 "vrev64.u8 d5, d5 \n"
1085 "vrev64.u8 d6, d6 \n"
1086 "subs %0, #1 \n"
1087 "vst3.u8 {d4-d6}, [%2], r4 \n"
1088 "bne 0b \n"
1089 : "=r"(nn), // %0
1090 "=r"(src0), // %1
1091 "=r"(dst0) // %2
1092 : "0"(nn),
1093 "1"(src0),
1094 "2"(dst0)
1095 : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
1096 }
1097 #endif // __aarch64__
1098
1099 dst0 += 7 * 3;
1100 #else
1101 int remain = srcw;
1102 #endif // __ARM_NEON
1103
1104 for (; remain > 0; remain--)
1105 {
1106 dst0[0] = src0[0];
1107 dst0[1] = src0[1];
1108 dst0[2] = src0[2];
1109
1110 src0 += 3;
1111 dst0 -= 3;
1112 }
1113
1114 src0 += srcwgap;
1115 dst0 -= wgap;
1116 }
1117 }
1118
static void kanna_rotate_3_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
1120 {
1121 const int srcwgap = srcstride - srcw * 4;
1122 const int wgap = stride - w * 4;
1123
1124 // point to the last dst pixel
1125 unsigned char* dstend = dst + stride * h - wgap;
1126
1127 const unsigned char* src0 = src;
1128 unsigned char* dst0 = dstend - 4;
1129
1130 int y = 0;
1131 for (; y < srch; y++)
1132 {
1133 #if __ARM_NEON
1134 dst0 -= 7 * 4;
1135
1136 int nn = srcw >> 4;
1137 int remain = srcw - (nn << 4);
1138
1139 #if __aarch64__
1140 for (; nn > 0; nn--)
1141 {
1142 uint8x8x4_t _src = vld4_u8(src0);
1143 uint8x8x4_t _src2 = vld4_u8(src0 + 8 * 4);
1144
1145 _src.val[0] = vrev64_u8(_src.val[0]);
1146 _src.val[1] = vrev64_u8(_src.val[1]);
1147 _src.val[2] = vrev64_u8(_src.val[2]);
1148 _src.val[3] = vrev64_u8(_src.val[3]);
1149
1150 _src2.val[0] = vrev64_u8(_src2.val[0]);
1151 _src2.val[1] = vrev64_u8(_src2.val[1]);
1152 _src2.val[2] = vrev64_u8(_src2.val[2]);
1153 _src2.val[3] = vrev64_u8(_src2.val[3]);
1154
1155 vst4_u8(dst0, _src);
1156 vst4_u8(dst0 - 8 * 4, _src2);
1157
1158 src0 += 16 * 4;
1159 dst0 -= 16 * 4;
1160 }
1161 #else
1162 if (nn > 0)
1163 {
1164 asm volatile(
1165 "mov r4, #-32 \n"
1166 "0: \n"
1167 "pld [%1, #256] \n"
1168 "vld4.u8 {d0-d3}, [%1]! \n"
1169 "vrev64.u8 d0, d0 \n"
1170 "vrev64.u8 d1, d1 \n"
1171 "vrev64.u8 d2, d2 \n"
1172 "pld [%1, #256] \n"
1173 "vld4.u8 {d4-d7}, [%1]! \n"
1174 "vrev64.u8 d3, d3 \n"
1175 "vrev64.u8 d4, d4 \n"
1176 "vrev64.u8 d5, d5 \n"
1177 "vst4.u8 {d0-d3}, [%2], r4 \n"
1178 "vrev64.u8 d6, d6 \n"
1179 "vrev64.u8 d7, d7 \n"
1180 "subs %0, #1 \n"
1181 "vst4.u8 {d4-d7}, [%2], r4 \n"
1182 "bne 0b \n"
1183 : "=r"(nn), // %0
1184 "=r"(src0), // %1
1185 "=r"(dst0) // %2
1186 : "0"(nn),
1187 "1"(src0),
1188 "2"(dst0)
1189 : "cc", "memory", "q0", "q1", "q2", "q3", "r4");
1190 }
1191 #endif // __aarch64__
1192
1193 dst0 += 7 * 4;
1194 #else
1195 int remain = srcw;
1196 #endif // __ARM_NEON
1197
1198 for (; remain > 0; remain--)
1199 {
1200 dst0[0] = src0[0];
1201 dst0[1] = src0[1];
1202 dst0[2] = src0[2];
1203 dst0[3] = src0[3];
1204
1205 src0 += 4;
1206 dst0 -= 4;
1207 }
1208
1209 src0 += srcwgap;
1210 dst0 -= wgap;
1211 }
1212 }
1213
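// kanna_rotate_4_cN: vertical flip. Rows are copied unchanged with the same bulk
// 32-byte NEON copy as the identity case, but they are written from the last
// destination row upwards (dst0 -= wgap + stride after each pair of rows), so only
// the row order is reversed.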
static void kanna_rotate_4_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
1215 {
1216 const int srcwgap = srcstride - srcw;
1217 const int wgap = stride + w;
1218
1219 // point to the last dst pixel row
1220 unsigned char* dstend = dst + stride * (h - 1);
1221
1222 const unsigned char* src0 = src;
1223 const unsigned char* src1 = src + srcstride;
1224 unsigned char* dst0 = dstend;
1225 unsigned char* dst1 = dstend - stride;
1226
1227 int y = 0;
1228 for (; y + 1 < srch; y += 2)
1229 {
1230 #if __ARM_NEON
1231 int nn = srcw >> 5;
1232 int remain = srcw - (nn << 5);
1233 #if __aarch64__
1234 for (; nn > 0; nn--)
1235 {
1236 uint8x16_t _src0 = vld1q_u8(src0);
1237 uint8x16_t _src0n = vld1q_u8(src0 + 16);
1238 vst1q_u8(dst0, _src0);
1239 vst1q_u8(dst0 + 16, _src0n);
1240
1241 uint8x16_t _src1 = vld1q_u8(src1);
1242 uint8x16_t _src1n = vld1q_u8(src1 + 16);
1243 vst1q_u8(dst1, _src1);
1244 vst1q_u8(dst1 + 16, _src1n);
1245
1246 src0 += 32;
1247 src1 += 32;
1248 dst0 += 32;
1249 dst1 += 32;
1250 }
1251 #else
1252 if (nn > 0)
1253 {
1254 asm volatile(
1255 "0: \n"
1256 "pld [%1, #256] \n"
1257 "vld1.u8 {d0-d3}, [%1]! \n"
1258 "pld [%2, #256] \n"
1259 "vld1.u8 {d4-d7}, [%2]! \n"
1260 "subs %0, #1 \n"
1261 "vst1.u8 {d0-d3}, [%3]! \n"
1262 "vst1.u8 {d4-d7}, [%4]! \n"
1263 "bne 0b \n"
1264 : "=r"(nn), // %0
1265 "=r"(src0), // %1
1266 "=r"(src1), // %2
1267 "=r"(dst0), // %3
1268 "=r"(dst1) // %4
1269 : "0"(nn),
1270 "1"(src0),
1271 "2"(src1),
1272 "3"(dst0),
1273 "4"(dst1)
1274 : "cc", "memory", "q0", "q1", "q2", "q3");
1275 }
1276 #endif // __aarch64__
1277 #else
1278 int remain = srcw;
1279 #endif // __ARM_NEON
1280
1281 for (; remain > 0; remain--)
1282 {
1283 *dst0++ = *src0++;
1284 *dst1++ = *src1++;
1285 }
1286
1287 src0 += srcwgap + srcstride;
1288 src1 += srcwgap + srcstride;
1289 dst0 -= wgap + stride;
1290 dst1 -= wgap + stride;
1291 }
1292
1293 for (; y < srch; y++)
1294 {
1295 #if __ARM_NEON
1296 int nn = srcw >> 5;
1297 int remain = srcw - (nn << 5);
1298 #if __aarch64__
1299 for (; nn > 0; nn--)
1300 {
1301 uint8x16_t _src = vld1q_u8(src0);
1302 uint8x16_t _src2 = vld1q_u8(src0 + 16);
1303 vst1q_u8(dst0, _src);
1304 vst1q_u8(dst0 + 16, _src2);
1305
1306 src0 += 32;
1307 dst0 += 32;
1308 }
1309 #else
1310 if (nn > 0)
1311 {
1312 asm volatile(
1313 "0: \n"
1314 "pld [%1, #256] \n"
1315 "vld1.u8 {d0-d3}, [%1]! \n"
1316 "subs %0, #1 \n"
1317 "vst1.u8 {d0-d3}, [%2]! \n"
1318 "bne 0b \n"
1319 : "=r"(nn), // %0
1320 "=r"(src0), // %1
1321 "=r"(dst0) // %2
1322 : "0"(nn),
1323 "1"(src0),
1324 "2"(dst0)
1325 : "cc", "memory", "q0", "q1");
1326 }
1327 #endif // __aarch64__
1328 #else
1329 int remain = srcw;
1330 #endif // __ARM_NEON
1331
1332 for (; remain > 0; remain--)
1333 {
1334 *dst0++ = *src0++;
1335 }
1336
1337 src0 += srcwgap;
1338 dst0 -= wgap;
1339 }
1340 }
1341
static void kanna_rotate_4_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
1343 {
1344 const int srcwgap = srcstride - srcw * 2;
1345 const int wgap = stride + w * 2;
1346
1347 // point to the last dst pixel row
1348 unsigned char* dstend = dst + stride * (h - 1);
1349
1350 int size = srcw * 2;
1351
1352 const unsigned char* src0 = src;
1353 const unsigned char* src1 = src + srcstride;
1354 unsigned char* dst0 = dstend;
1355 unsigned char* dst1 = dstend - stride;
1356
1357 int y = 0;
1358 for (; y + 1 < srch; y += 2)
1359 {
1360 #if __ARM_NEON
1361 int nn = size >> 5;
1362 int remain = size - (nn << 5);
1363 #if __aarch64__
1364 for (; nn > 0; nn--)
1365 {
1366 uint8x16_t _src0 = vld1q_u8(src0);
1367 uint8x16_t _src0n = vld1q_u8(src0 + 16);
1368 vst1q_u8(dst0, _src0);
1369 vst1q_u8(dst0 + 16, _src0n);
1370
1371 uint8x16_t _src1 = vld1q_u8(src1);
1372 uint8x16_t _src1n = vld1q_u8(src1 + 16);
1373 vst1q_u8(dst1, _src1);
1374 vst1q_u8(dst1 + 16, _src1n);
1375
1376 src0 += 32;
1377 src1 += 32;
1378 dst0 += 32;
1379 dst1 += 32;
1380 }
1381 #else
1382 if (nn > 0)
1383 {
1384 asm volatile(
1385 "0: \n"
1386 "pld [%1, #256] \n"
1387 "vld1.u8 {d0-d3}, [%1]! \n"
1388 "pld [%2, #256] \n"
1389 "vld1.u8 {d4-d7}, [%2]! \n"
1390 "subs %0, #1 \n"
1391 "vst1.u8 {d0-d3}, [%3]! \n"
1392 "vst1.u8 {d4-d7}, [%4]! \n"
1393 "bne 0b \n"
1394 : "=r"(nn), // %0
1395 "=r"(src0), // %1
1396 "=r"(src1), // %2
1397 "=r"(dst0), // %3
1398 "=r"(dst1) // %4
1399 : "0"(nn),
1400 "1"(src0),
1401 "2"(src1),
1402 "3"(dst0),
1403 "4"(dst1)
1404 : "cc", "memory", "q0", "q1", "q2", "q3");
1405 }
1406 #endif // __aarch64__
1407 #else
1408 int remain = size;
1409 #endif // __ARM_NEON
1410
1411 for (; remain > 0; remain--)
1412 {
1413 *dst0++ = *src0++;
1414 *dst1++ = *src1++;
1415 }
1416
1417 src0 += srcwgap + srcstride;
1418 src1 += srcwgap + srcstride;
1419 dst0 -= wgap + stride;
1420 dst1 -= wgap + stride;
1421 }
1422
1423 for (; y < srch; y++)
1424 {
1425 #if __ARM_NEON
1426 int nn = size >> 5;
1427 int remain = size - (nn << 5);
1428 #if __aarch64__
1429 for (; nn > 0; nn--)
1430 {
1431 uint8x16_t _src = vld1q_u8(src0);
1432 uint8x16_t _src2 = vld1q_u8(src0 + 16);
1433 vst1q_u8(dst0, _src);
1434 vst1q_u8(dst0 + 16, _src2);
1435
1436 src0 += 32;
1437 dst0 += 32;
1438 }
1439 #else
1440 if (nn > 0)
1441 {
1442 asm volatile(
1443 "0: \n"
1444 "pld [%1, #256] \n"
1445 "vld1.u8 {d0-d3}, [%1]! \n"
1446 "subs %0, #1 \n"
1447 "vst1.u8 {d0-d3}, [%2]! \n"
1448 "bne 0b \n"
1449 : "=r"(nn), // %0
1450 "=r"(src0), // %1
1451 "=r"(dst0) // %2
1452 : "0"(nn),
1453 "1"(src0),
1454 "2"(dst0)
1455 : "cc", "memory", "q0", "q1");
1456 }
1457 #endif // __aarch64__
1458 #else
1459 int remain = size;
1460 #endif // __ARM_NEON
1461
1462 for (; remain > 0; remain--)
1463 {
1464 *dst0++ = *src0++;
1465 }
1466
1467 src0 += srcwgap;
1468 dst0 -= wgap;
1469 }
1470 }
1471
static void kanna_rotate_4_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
1473 {
1474 const int srcwgap = srcstride - srcw * 3;
1475 const int wgap = stride + w * 3;
1476
1477 // point to the last dst pixel row
1478 unsigned char* dstend = dst + stride * (h - 1);
1479
1480 int size = srcw * 3;
1481
1482 const unsigned char* src0 = src;
1483 const unsigned char* src1 = src + srcstride;
1484 unsigned char* dst0 = dstend;
1485 unsigned char* dst1 = dstend - stride;
1486
1487 int y = 0;
1488 for (; y + 1 < srch; y += 2)
1489 {
1490 #if __ARM_NEON
1491 int nn = size >> 5;
1492 int remain = size - (nn << 5);
1493 #if __aarch64__
1494 for (; nn > 0; nn--)
1495 {
1496 uint8x16_t _src0 = vld1q_u8(src0);
1497 uint8x16_t _src0n = vld1q_u8(src0 + 16);
1498 vst1q_u8(dst0, _src0);
1499 vst1q_u8(dst0 + 16, _src0n);
1500
1501 uint8x16_t _src1 = vld1q_u8(src1);
1502 uint8x16_t _src1n = vld1q_u8(src1 + 16);
1503 vst1q_u8(dst1, _src1);
1504 vst1q_u8(dst1 + 16, _src1n);
1505
1506 src0 += 32;
1507 src1 += 32;
1508 dst0 += 32;
1509 dst1 += 32;
1510 }
1511 #else
1512 if (nn > 0)
1513 {
1514 asm volatile(
1515 "0: \n"
1516 "pld [%1, #256] \n"
1517 "vld1.u8 {d0-d3}, [%1]! \n"
1518 "pld [%2, #256] \n"
1519 "vld1.u8 {d4-d7}, [%2]! \n"
1520 "subs %0, #1 \n"
1521 "vst1.u8 {d0-d3}, [%3]! \n"
1522 "vst1.u8 {d4-d7}, [%4]! \n"
1523 "bne 0b \n"
1524 : "=r"(nn), // %0
1525 "=r"(src0), // %1
1526 "=r"(src1), // %2
1527 "=r"(dst0), // %3
1528 "=r"(dst1) // %4
1529 : "0"(nn),
1530 "1"(src0),
1531 "2"(src1),
1532 "3"(dst0),
1533 "4"(dst1)
1534 : "cc", "memory", "q0", "q1", "q2", "q3");
1535 }
1536 #endif // __aarch64__
1537 #else
1538 int remain = size;
1539 #endif // __ARM_NEON
1540
1541 for (; remain > 0; remain--)
1542 {
1543 *dst0++ = *src0++;
1544 *dst1++ = *src1++;
1545 }
1546
1547 src0 += srcwgap + srcstride;
1548 src1 += srcwgap + srcstride;
1549 dst0 -= wgap + stride;
1550 dst1 -= wgap + stride;
1551 }
1552
1553 for (; y < srch; y++)
1554 {
1555 #if __ARM_NEON
1556 int nn = size >> 5;
1557 int remain = size - (nn << 5);
1558 #if __aarch64__
1559 for (; nn > 0; nn--)
1560 {
1561 uint8x16_t _src = vld1q_u8(src0);
1562 uint8x16_t _src2 = vld1q_u8(src0 + 16);
1563 vst1q_u8(dst0, _src);
1564 vst1q_u8(dst0 + 16, _src2);
1565
1566 src0 += 32;
1567 dst0 += 32;
1568 }
1569 #else
1570 if (nn > 0)
1571 {
1572 asm volatile(
1573 "0: \n"
1574 "pld [%1, #256] \n"
1575 "vld1.u8 {d0-d3}, [%1]! \n"
1576 "subs %0, #1 \n"
1577 "vst1.u8 {d0-d3}, [%2]! \n"
1578 "bne 0b \n"
1579 : "=r"(nn), // %0
1580 "=r"(src0), // %1
1581 "=r"(dst0) // %2
1582 : "0"(nn),
1583 "1"(src0),
1584 "2"(dst0)
1585 : "cc", "memory", "q0", "q1");
1586 }
1587 #endif // __aarch64__
1588 #else
1589 int remain = size;
1590 #endif // __ARM_NEON
1591
1592 for (; remain > 0; remain--)
1593 {
1594 *dst0++ = *src0++;
1595 }
1596
1597 src0 += srcwgap;
1598 dst0 -= wgap;
1599 }
1600 }
1601
static void kanna_rotate_4_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
1603 {
1604 const int srcwgap = srcstride - srcw * 4;
1605 const int wgap = stride + w * 4;
1606
1607 // point to the last dst pixel row
1608 unsigned char* dstend = dst + stride * (h - 1);
1609
1610 int size = srcw * 4;
1611
1612 const unsigned char* src0 = src;
1613 const unsigned char* src1 = src + srcstride;
1614 unsigned char* dst0 = dstend;
1615 unsigned char* dst1 = dstend - stride;
1616
1617 int y = 0;
1618 for (; y + 1 < srch; y += 2)
1619 {
1620 #if __ARM_NEON
1621 int nn = size >> 5;
1622 int remain = size - (nn << 5);
1623 #if __aarch64__
1624 for (; nn > 0; nn--)
1625 {
1626 uint8x16_t _src0 = vld1q_u8(src0);
1627 uint8x16_t _src0n = vld1q_u8(src0 + 16);
1628 vst1q_u8(dst0, _src0);
1629 vst1q_u8(dst0 + 16, _src0n);
1630
1631 uint8x16_t _src1 = vld1q_u8(src1);
1632 uint8x16_t _src1n = vld1q_u8(src1 + 16);
1633 vst1q_u8(dst1, _src1);
1634 vst1q_u8(dst1 + 16, _src1n);
1635
1636 src0 += 32;
1637 src1 += 32;
1638 dst0 += 32;
1639 dst1 += 32;
1640 }
1641 #else
1642 if (nn > 0)
1643 {
1644 asm volatile(
1645 "0: \n"
1646 "pld [%1, #256] \n"
1647 "vld1.u8 {d0-d3}, [%1]! \n"
1648 "pld [%2, #256] \n"
1649 "vld1.u8 {d4-d7}, [%2]! \n"
1650 "subs %0, #1 \n"
1651 "vst1.u8 {d0-d3}, [%3]! \n"
1652 "vst1.u8 {d4-d7}, [%4]! \n"
1653 "bne 0b \n"
1654 : "=r"(nn), // %0
1655 "=r"(src0), // %1
1656 "=r"(src1), // %2
1657 "=r"(dst0), // %3
1658 "=r"(dst1) // %4
1659 : "0"(nn),
1660 "1"(src0),
1661 "2"(src1),
1662 "3"(dst0),
1663 "4"(dst1)
1664 : "cc", "memory", "q0", "q1", "q2", "q3");
1665 }
1666 #endif // __aarch64__
1667 #else
1668 int remain = size;
1669 #endif // __ARM_NEON
1670
1671 for (; remain > 0; remain--)
1672 {
1673 *dst0++ = *src0++;
1674 *dst1++ = *src1++;
1675 }
1676
1677 src0 += srcwgap + srcstride;
1678 src1 += srcwgap + srcstride;
1679 dst0 -= wgap + stride;
1680 dst1 -= wgap + stride;
1681 }
1682
1683 for (; y < srch; y++)
1684 {
1685 #if __ARM_NEON
1686 int nn = size >> 5;
1687 int remain = size - (nn << 5);
1688 #if __aarch64__
1689 for (; nn > 0; nn--)
1690 {
1691 uint8x16_t _src = vld1q_u8(src0);
1692 uint8x16_t _src2 = vld1q_u8(src0 + 16);
1693 vst1q_u8(dst0, _src);
1694 vst1q_u8(dst0 + 16, _src2);
1695
1696 src0 += 32;
1697 dst0 += 32;
1698 }
1699 #else
1700 if (nn > 0)
1701 {
1702 asm volatile(
1703 "0: \n"
1704 "pld [%1, #256] \n"
1705 "vld1.u8 {d0-d3}, [%1]! \n"
1706 "subs %0, #1 \n"
1707 "vst1.u8 {d0-d3}, [%2]! \n"
1708 "bne 0b \n"
1709 : "=r"(nn), // %0
1710 "=r"(src0), // %1
1711 "=r"(dst0) // %2
1712 : "0"(nn),
1713 "1"(src0),
1714 "2"(dst0)
1715 : "cc", "memory", "q0", "q1");
1716 }
1717 #endif // __aarch64__
1718 #else
1719 int remain = size;
1720 #endif // __ARM_NEON
1721
1722 for (; remain > 0; remain--)
1723 {
1724 *dst0++ = *src0++;
1725 }
1726
1727 src0 += srcwgap;
1728 dst0 -= wgap;
1729 }
1730 }
1731
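// kanna_rotate_5_cN: transpose (source row y becomes destination column y). The NEON
// path gathers 8 pixels from each of 8 consecutive source rows and transposes the
// 8x8 block entirely in registers via the vtrn.u8 -> vtrn.u16 -> vtrn.u32 ladder,
// then stores the eight transposed runs into eight consecutive destination rows.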
static void kanna_rotate_5_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
1733 {
1734 const int srcwgap = srcstride - srcw;
1735
1736 const unsigned char* src0 = src;
1737
1738 int y = 0;
1739 #if __ARM_NEON
1740 for (; y + 7 < srch; y += 8)
1741 {
1742 const unsigned char* src1 = src0 + srcstride;
1743
1744 unsigned char* dst0 = dst + y;
1745 unsigned char* dst1 = dst + y + stride;
1746
1747 int src_step = 2 * srcstride;
1748 int dst_step = 2 * stride;
1749
1750 int nn = srcw >> 3;
1751 int remain = srcw - (nn << 3);
1752
1753 #if __aarch64__
1754 for (; nn > 0; nn--)
1755 {
1756 uint8x8_t _src0 = vld1_u8(src0);
1757 uint8x8_t _src1 = vld1_u8(src1);
1758
1759 uint8x8_t _src2 = vld1_u8(src0 + src_step);
1760 uint8x8_t _src3 = vld1_u8(src1 + src_step);
1761
1762 uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
1763 uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
1764
1765 uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
1766 uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
1767
1768 uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
1769 uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
1770 uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
1771 uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);
1772
1773 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
1774 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
1775 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
1776 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
1777
1778 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
1779 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
1780 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
1781 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
1782
1783 uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
1784 uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
1785 uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
1786 uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
1787 uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
1788 uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
1789 uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
1790 uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
1791
1792 vst1_u8(dst0, _dst0);
1793 vst1_u8(dst1, _dst1);
1794 vst1_u8(dst0 + dst_step, _dst2);
1795 vst1_u8(dst1 + dst_step, _dst3);
1796 vst1_u8(dst0 + 2 * dst_step, _dst4);
1797 vst1_u8(dst1 + 2 * dst_step, _dst5);
1798 vst1_u8(dst0 + 3 * dst_step, _dst6);
1799 vst1_u8(dst1 + 3 * dst_step, _dst7);
1800
1801 src0 += 8;
1802 src1 += 8;
1803
1804 dst0 += 4 * dst_step;
1805 dst1 += 4 * dst_step;
1806 }
1807 #else
1808 if (nn > 0)
1809 {
1810 asm volatile(
1811 "0: \n"
1812 "pld [%1, #64] \n"
1813 "vld1.u8 {d0}, [%1], %10 \n"
1814
1815 "pld [%2, #64] \n"
1816 "vld1.u8 {d1}, [%2], %10 \n"
1817
1818 "pld [%1, #64] \n"
1819 "vld1.u8 {d2}, [%1], %10 \n"
1820
1821 "vtrn.u8 d0, d1 \n" // _src01t_r
1822
1823 "pld [%2, #64] \n"
1824 "vld1.u8 {d3}, [%2], %10 \n"
1825
1826 "pld [%1, #64] \n"
1827 "vld1.u8 {d4}, [%1], %10 \n"
1828
1829 "vtrn.u8 d2, d3 \n" // _src23t_r
1830
1831 "pld [%2, #64] \n"
1832 "vld1.u8 {d5}, [%2], %10 \n"
1833
1834 "pld [%1, #64] \n"
1835 "vld1.u8 {d6}, [%1], %10 \n"
1836
1837 "vtrn.u8 d4, d5 \n" // _src45t_r
1838
1839 "pld [%2, #64] \n"
1840 "vld1.u8 {d7}, [%2], %10 \n"
1841
1842 "vtrn.u8 d6, d7 \n" // _src67t_r
1843
1844 "sub %1, %1, %10, lsl #2 \n" // restore src0
1845
1846 "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r
1847
1848 "sub %2, %2, %10, lsl #2 \n" // restore src1
1849
"vtrn.u16 q2, q3 \n" // _src46tt_r _src57tt_r
1851
1852 "add %1, #8 \n" // src0 += 8
1853
1854 "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r
1855
1856 "add %2, #8 \n" // src1 += 8
1857
1858 "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
1859 "vst1.u8 {d0}, [%3], %11 \n"
1860 "vst1.u8 {d1}, [%4], %11 \n"
1861
1862 "subs %0, #1 \n"
1863
1864 "vst1.u8 {d2}, [%3], %11 \n"
1865 "vst1.u8 {d3}, [%4], %11 \n"
1866 "vst1.u8 {d4}, [%3], %11 \n"
1867 "vst1.u8 {d5}, [%4], %11 \n"
1868 "vst1.u8 {d6}, [%3], %11 \n"
1869 "vst1.u8 {d7}, [%4], %11 \n"
1870
1871 "bne 0b \n"
1872 : "=r"(nn), // %0
1873 "=r"(src0), // %1
1874 "=r"(src1), // %2
1875 "=r"(dst0), // %3
1876 "=r"(dst1) // %4
1877 : "0"(nn),
1878 "1"(src0),
1879 "2"(src1),
1880 "3"(dst0),
1881 "4"(dst1),
1882 "r"(src_step), // %10
1883 "r"(dst_step) // %11
1884 : "cc", "memory", "q0", "q1", "q2", "q3");
1885 }
1886 #endif // __aarch64__
1887 for (; remain > 0; remain--)
1888 {
1889 dst0[0] = src0[0];
1890 dst0[1] = src1[0];
1891 dst0[2] = src0[0 + src_step];
1892 dst0[3] = src1[0 + src_step];
1893 dst0[4] = src0[0 + 2 * src_step];
1894 dst0[5] = src1[0 + 2 * src_step];
1895 dst0[6] = src0[0 + 3 * src_step];
1896 dst0[7] = src1[0 + 3 * src_step];
1897
1898 src0 += 1;
1899 src1 += 1;
1900
1901 dst0 += stride;
1902 }
1903
1904 src0 += srcwgap + 7 * srcstride;
1905 }
1906 #endif // __ARM_NEON
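// leftover source rows (and the whole image when NEON is unavailable):
// each remaining source row is written out as one destination column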
1907 for (; y < srch; y++)
1908 {
1909 unsigned char* dst0 = dst + y;
1910
1911 int x = 0;
1912 for (; x < srcw; x++)
1913 {
1914 *dst0 = *src0;
1915
1916 src0 += 1;
1917 dst0 += stride;
1918 }
1919
1920 src0 += srcwgap;
1921 }
1922 }
1923
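// The c2/c3 transpose variants apply the same 8x8 in-register transpose to every
// channel plane, using de-interleaving vld2/vld3 loads and vst2/vst3 stores.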
static void kanna_rotate_5_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
1925 {
1926 const int srcwgap = srcstride - srcw * 2;
1927
1928 const unsigned char* src0 = src;
1929
1930 int y = 0;
1931 #if __ARM_NEON
1932 for (; y + 7 < srch; y += 8)
1933 {
1934 const unsigned char* src1 = src0 + srcstride;
1935
1936 unsigned char* dst0 = dst + y * 2;
1937 unsigned char* dst1 = dst + y * 2 + stride;
1938
1939 int src_step = 2 * srcstride;
1940 int dst_step = 2 * stride;
1941
1942 int nn = srcw >> 3;
1943 int remain = srcw - (nn << 3);
1944
1945 #if __aarch64__
1946 for (; nn > 0; nn--)
1947 {
1948 uint8x8x2_t _src0 = vld2_u8(src0);
1949 uint8x8x2_t _src1 = vld2_u8(src1);
1950
1951 uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
1952 uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
1953
1954 uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
1955 uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
1956
1957 uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
1958 uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
1959
1960 uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
1961 uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
1962 uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
1963 uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
1964
1965 uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
1966 uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
1967 uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
1968 uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
1969
1970 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
1971 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
1972 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
1973 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
1974
1975 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
1976 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
1977 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
1978 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
1979
1980 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
1981 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
1982 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
1983 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
1984
1985 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
1986 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
1987 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
1988 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
1989
1990 uint8x8x2_t _dst0;
1991 uint8x8x2_t _dst1;
1992 uint8x8x2_t _dst2;
1993 uint8x8x2_t _dst3;
1994 uint8x8x2_t _dst4;
1995 uint8x8x2_t _dst5;
1996 uint8x8x2_t _dst6;
1997 uint8x8x2_t _dst7;
1998
1999 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
2000 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
2001 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
2002 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
2003 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
2004 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
2005 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
2006 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
2007
2008 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
2009 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
2010 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
2011 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
2012 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
2013 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
2014 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
2015 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
2016
2017 vst2_u8(dst0, _dst0);
2018 vst2_u8(dst1, _dst1);
2019 vst2_u8(dst0 + dst_step, _dst2);
2020 vst2_u8(dst1 + dst_step, _dst3);
2021 vst2_u8(dst0 + 2 * dst_step, _dst4);
2022 vst2_u8(dst1 + 2 * dst_step, _dst5);
2023 vst2_u8(dst0 + 3 * dst_step, _dst6);
2024 vst2_u8(dst1 + 3 * dst_step, _dst7);
2025
2026 src0 += 2 * 8;
2027 src1 += 2 * 8;
2028
2029 dst0 += 4 * dst_step;
2030 dst1 += 4 * dst_step;
2031 }
2032 #else
2033 if (nn > 0)
2034 {
2035 asm volatile(
2036 "0: \n"
2037 "pld [%1, #128] \n"
2038 "vld2.u8 {d0-d1}, [%1], %10 \n"
2039
2040 "pld [%2, #128] \n"
2041 "vld2.u8 {d2-d3}, [%2], %10 \n"
2042
2043 "pld [%1, #128] \n"
2044 "vld2.u8 {d4-d5}, [%1], %10 \n"
2045
2046 "vtrn.u8 q0, q1 \n" // _src01t_r
2047
2048 "pld [%2, #128] \n"
2049 "vld2.u8 {d6-d7}, [%2], %10 \n"
2050
2051 "pld [%1, #128] \n"
2052 "vld2.u8 {d16-d17}, [%1], %10\n"
2053
2054 "vtrn.u8 q2, q3 \n" // _src23t_r
2055
2056 "pld [%2, #128] \n"
2057 "vld2.u8 {d18-d19}, [%2], %10\n"
2058
2059 "pld [%1, #128] \n"
2060 "vld2.u8 {d20-d21}, [%1], %10\n"
2061
2062 "vtrn.u8 q8, q9 \n" // _src45t_r
2063
2064 "pld [%2, #128] \n"
2065 "vld2.u8 {d22-d23}, [%2], %10\n"
2066
2067 "vtrn.u8 q10, q11 \n" // _src67t_r
2068
2069 "sub %1, %1, %10, lsl #2 \n" // restore src0
2070
2071 "vtrn.u16 q0, q2 \n" // _src02tt_r
2072
2073 "sub %2, %2, %10, lsl #2 \n" // restore src1
2074
2075 "vtrn.u16 q1, q3 \n" // _src13tt_r
2076
2077 "add %1, #16 \n" // src0 += 16
2078
2079 "vtrn.u16 q8, q10 \n" // _src46tt_r
2080
2081 "add %2, #16 \n" // src1 += 16
2082
2083 "vtrn.u16 q9, q11 \n" // _src57tt_r
2084
2085 "vtrn.u32 q0, q8 \n" // _src04ttt_r
2086
2087 "vtrn.u32 q1, q9 \n" // _src15ttt_r
2088 "vst2.u8 {d0-d1}, [%3], %11 \n"
2089
2090 "vtrn.u32 q2, q10 \n" // _src26ttt_r
2091 "vst2.u8 {d2-d3}, [%4], %11 \n"
2092
2093 "vtrn.u32 q3, q11 \n" // _src37ttt_r
2094 "vst2.u8 {d4-d5}, [%3], %11 \n"
2095
2096 "subs %0, #1 \n"
2097
2098 "vst2.u8 {d6-d7}, [%4], %11 \n"
2099 "vst2.u8 {d16-d17}, [%3], %11\n"
2100 "vst2.u8 {d18-d19}, [%4], %11\n"
2101 "vst2.u8 {d20-d21}, [%3], %11\n"
2102 "vst2.u8 {d22-d23}, [%4], %11\n"
2103
2104 "bne 0b \n"
2105 : "=r"(nn), // %0
2106 "=r"(src0), // %1
2107 "=r"(src1), // %2
2108 "=r"(dst0), // %3
2109 "=r"(dst1) // %4
2110 : "0"(nn),
2111 "1"(src0),
2112 "2"(src1),
2113 "3"(dst0),
2114 "4"(dst1),
2115 "r"(src_step), // %10
2116 "r"(dst_step) // %11
2117 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
2118 }
2119 #endif // __aarch64__
2120 for (; remain > 0; remain--)
2121 {
2122 dst0[0] = src0[0];
2123 dst0[1] = src0[1];
2124 dst0[2] = src1[0];
2125 dst0[3] = src1[1];
2126 dst0[4] = src0[0 + src_step];
2127 dst0[5] = src0[1 + src_step];
2128 dst0[6] = src1[0 + src_step];
2129 dst0[7] = src1[1 + src_step];
2130 dst0[8] = src0[0 + 2 * src_step];
2131 dst0[9] = src0[1 + 2 * src_step];
2132 dst0[10] = src1[0 + 2 * src_step];
2133 dst0[11] = src1[1 + 2 * src_step];
2134 dst0[12] = src0[0 + 3 * src_step];
2135 dst0[13] = src0[1 + 3 * src_step];
2136 dst0[14] = src1[0 + 3 * src_step];
2137 dst0[15] = src1[1 + 3 * src_step];
2138
2139 src0 += 2;
2140 src1 += 2;
2141
2142 dst0 += stride;
2143 }
2144
2145 src0 += srcwgap + 7 * srcstride;
2146 }
2147 #endif // __ARM_NEON
2148 for (; y < srch; y++)
2149 {
2150 unsigned char* dst0 = dst + y * 2;
2151
2152 int x = 0;
2153 for (; x < srcw; x++)
2154 {
2155 dst0[0] = src0[0];
2156 dst0[1] = src0[1];
2157
2158 src0 += 2;
2159 dst0 += stride;
2160 }
2161
2162 src0 += srcwgap;
2163 }
2164 }
2165
static void kanna_rotate_5_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
2167 {
2168 const int srcwgap = srcstride - srcw * 3;
2169
2170 const unsigned char* src0 = src;
2171
2172 int y = 0;
2173 #if __ARM_NEON
2174 for (; y + 7 < srch; y += 8)
2175 {
2176 const unsigned char* src1 = src0 + srcstride;
2177
2178 unsigned char* dst0 = dst + y * 3;
2179 unsigned char* dst1 = dst + y * 3 + stride;
2180
2181 int src_step = 2 * srcstride;
2182 int dst_step = 2 * stride;
2183
2184 int nn = srcw >> 3;
2185 int remain = srcw - (nn << 3);
2186
2187 #if __aarch64__
2188 for (; nn > 0; nn--)
2189 {
2190 uint8x8x3_t _src0 = vld3_u8(src0);
2191 uint8x8x3_t _src1 = vld3_u8(src1);
2192
2193 uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
2194 uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
2195
2196 uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
2197 uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
2198
2199 uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
2200 uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
2201
2202 uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
2203 uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
2204 uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
2205 uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
2206
2207 uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
2208 uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
2209 uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
2210 uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
2211
2212 uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
2213 uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
2214 uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
2215 uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
2216
2217 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
2218 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
2219 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
2220 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
2221
2222 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
2223 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
2224 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
2225 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
2226
2227 uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
2228 uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
2229 uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
2230 uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
2231
2232 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
2233 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
2234 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
2235 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
2236
2237 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
2238 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
2239 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
2240 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
2241
2242 uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
2243 uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
2244 uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
2245 uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
2246
2247 uint8x8x3_t _dst0;
2248 uint8x8x3_t _dst1;
2249 uint8x8x3_t _dst2;
2250 uint8x8x3_t _dst3;
2251 uint8x8x3_t _dst4;
2252 uint8x8x3_t _dst5;
2253 uint8x8x3_t _dst6;
2254 uint8x8x3_t _dst7;
2255
2256 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
2257 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
2258 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
2259 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
2260 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
2261 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
2262 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
2263 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
2264
2265 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
2266 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
2267 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
2268 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
2269 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
2270 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
2271 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
2272 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
2273
2274 _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
2275 _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
2276 _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
2277 _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
2278 _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
2279 _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
2280 _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
2281 _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
2282
2283 vst3_u8(dst0, _dst0);
2284 vst3_u8(dst1, _dst1);
2285 vst3_u8(dst0 + dst_step, _dst2);
2286 vst3_u8(dst1 + dst_step, _dst3);
2287 vst3_u8(dst0 + 2 * dst_step, _dst4);
2288 vst3_u8(dst1 + 2 * dst_step, _dst5);
2289 vst3_u8(dst0 + 3 * dst_step, _dst6);
2290 vst3_u8(dst1 + 3 * dst_step, _dst7);
2291
2292 src0 += 3 * 8;
2293 src1 += 3 * 8;
2294
2295 dst0 += 4 * dst_step;
2296 dst1 += 4 * dst_step;
2297 }
2298 #else
2299 if (nn > 0)
2300 {
2301 asm volatile(
2302 "0: \n"
2303 "pld [%1, #192] \n"
2304 "vld3.u8 {d0-d2}, [%1], %10 \n"
2305
2306 "pld [%2, #192] \n"
2307 "vld3.u8 {d4-d6}, [%2], %10 \n"
2308
2309 "pld [%1, #192] \n"
2310 "vld3.u8 {d8-d10}, [%1], %10 \n"
2311
2312 "vtrn.u8 q0, q2 \n" // _src01t_r
2313 "vtrn.u8 d2, d6 \n"
2314
2315 "pld [%2, #192] \n"
2316 "vld3.u8 {d12-d14}, [%2], %10\n"
2317
2318 "pld [%1, #192] \n"
2319 "vld3.u8 {d16-d18}, [%1], %10\n"
2320
2321 "vtrn.u8 q4, q6 \n" // _src23t_r
2322 "vtrn.u8 d10, d14 \n"
2323
2324 "pld [%2, #192] \n"
2325 "vld3.u8 {d20-d22}, [%2], %10\n"
2326
2327 "pld [%1, #192] \n"
2328 "vld3.u8 {d24-d26}, [%1], %10\n"
2329
2330 "vtrn.u8 q8, q10 \n" // _src45t_r
2331 "vtrn.u8 d18, d22 \n"
2332
2333 "pld [%2, #192] \n"
2334 "vld3.u8 {d28-d30}, [%2], %10\n"
2335
2336 "vtrn.u8 q12, q14 \n" // _src67t_r
2337 "vtrn.u8 d26, d30 \n"
2338
2339 "sub %1, %1, %10, lsl #2 \n" // restore src0
2340
2341 "vtrn.u16 q0, q4 \n" // _src02tt_r
2342 "vtrn.u16 d2, d10 \n"
2343
2344 "sub %2, %2, %10, lsl #2 \n" // restore src1
2345
2346 "vtrn.u16 q2, q6 \n" // _src13tt_r
2347 "vtrn.u16 d6, d14 \n"
2348
2349 "add %1, #24 \n" // src0 += 24
2350
2351 "vtrn.u16 q8, q12 \n" // _src46tt_r
2352 "vtrn.u16 d18, d26 \n"
2353
2354 "add %2, #24 \n" // src1 += 24
2355
2356 "vtrn.u16 q10, q14 \n" // _src57tt_r
2357 "vtrn.u16 d22, d30 \n"
2358
2359 "vtrn.u32 q0, q8 \n" // _src04ttt_r
2360 "vtrn.u32 d2, d18 \n"
2361
2362 "vtrn.u32 q2, q10 \n" // _src15ttt_r
2363 "vst3.u8 {d0-d2}, [%3], %11 \n"
2364 "vtrn.u32 d6, d22 \n"
2365
2366 "vtrn.u32 q4, q12 \n" // _src26ttt_r
2367 "vst3.u8 {d4-d6}, [%4], %11 \n"
2368 "vtrn.u32 d10, d26 \n"
2369
2370 "vtrn.u32 q6, q14 \n" // _src37ttt_r
2371 "vst3.u8 {d8-d10}, [%3], %11 \n"
2372 "vtrn.u32 d14, d30 \n"
2373
2374 "subs %0, #1 \n"
2375
2376 "vst3.u8 {d16-d18}, [%3], %11\n"
2377 "vst3.u8 {d12-d14}, [%4], %11\n"
2378 "vst3.u8 {d20-d22}, [%4], %11\n"
2379 "vst3.u8 {d24-d26}, [%3], %11\n"
2380 "vst3.u8 {d28-d30}, [%4], %11\n"
2381
2382 "bne 0b \n"
2383 : "=r"(nn), // %0
2384 "=r"(src0), // %1
2385 "=r"(src1), // %2
2386 "=r"(dst0), // %3
2387 "=r"(dst1) // %4
2388 : "0"(nn),
2389 "1"(src0),
2390 "2"(src1),
2391 "3"(dst0),
2392 "4"(dst1),
2393 "r"(src_step), // %10
2394 "r"(dst_step) // %11
2395 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
2396 }
2397 #endif // __aarch64__
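        // scalar tail for the columns left over after the 8-wide blocks: each iteration
        // copies one source column (the same 8 rows) into 8 consecutive dst pixels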
2398 for (; remain > 0; remain--)
2399 {
2400 dst0[0] = src0[0];
2401 dst0[1] = src0[1];
2402 dst0[2] = src0[2];
2403 dst0[3] = src1[0];
2404 dst0[4] = src1[1];
2405 dst0[5] = src1[2];
2406 dst0[6] = src0[0 + src_step];
2407 dst0[7] = src0[1 + src_step];
2408 dst0[8] = src0[2 + src_step];
2409 dst0[9] = src1[0 + src_step];
2410 dst0[10] = src1[1 + src_step];
2411 dst0[11] = src1[2 + src_step];
2412 dst0[12] = src0[0 + 2 * src_step];
2413 dst0[13] = src0[1 + 2 * src_step];
2414 dst0[14] = src0[2 + 2 * src_step];
2415 dst0[15] = src1[0 + 2 * src_step];
2416 dst0[16] = src1[1 + 2 * src_step];
2417 dst0[17] = src1[2 + 2 * src_step];
2418 dst0[18] = src0[0 + 3 * src_step];
2419 dst0[19] = src0[1 + 3 * src_step];
2420 dst0[20] = src0[2 + 3 * src_step];
2421 dst0[21] = src1[0 + 3 * src_step];
2422 dst0[22] = src1[1 + 3 * src_step];
2423 dst0[23] = src1[2 + 3 * src_step];
2424
2425 src0 += 3;
2426 src1 += 3;
2427
2428 dst0 += stride;
2429 }
2430
2431 src0 += srcwgap + 7 * srcstride;
2432 }
2433 #endif // __ARM_NEON
2434 for (; y < srch; y++)
2435 {
2436 unsigned char* dst0 = dst + y * 3;
2437
2438 int x = 0;
2439 for (; x < srcw; x++)
2440 {
2441 dst0[0] = src0[0];
2442 dst0[1] = src0[1];
2443 dst0[2] = src0[2];
2444
2445 src0 += 3;
2446 dst0 += stride;
2447 }
2448
2449 src0 += srcwgap;
2450 }
2451 }
2452
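// kanna_rotate type 5 for 4-channel (e.g. RGBA) pixels: the same 8x8 transpose as the
// 3-channel version, with the alpha plane carried through the vtrn chain as a fourth plane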
2453 static void kanna_rotate_5_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int /*h*/, int stride)
2454 {
2455 const int srcwgap = srcstride - srcw * 4;
2456
2457 const unsigned char* src0 = src;
2458
2459 int y = 0;
2460 #if __ARM_NEON
2461 for (; y + 7 < srch; y += 8)
2462 {
2463 const unsigned char* src1 = src0 + srcstride;
2464
2465 unsigned char* dst0 = dst + y * 4;
2466 unsigned char* dst1 = dst + y * 4 + stride;
2467
2468 int src_step = 2 * srcstride;
2469 int dst_step = 2 * stride;
2470
2471 int nn = srcw >> 3;
2472 int remain = srcw - (nn << 3);
2473
2474 #if __aarch64__
2475 for (; nn > 0; nn--)
2476 {
2477 uint8x8x4_t _src0 = vld4_u8(src0);
2478 uint8x8x4_t _src1 = vld4_u8(src1);
2479
2480 uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
2481 uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
2482
2483 uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
2484 uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
2485
2486 uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
2487 uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
2488
2489 uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
2490 uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
2491 uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
2492 uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
2493
2494 uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
2495 uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
2496 uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
2497 uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
2498
2499 uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
2500 uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
2501 uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
2502 uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
2503
2504 uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
2505 uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
2506 uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
2507 uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);
2508
2509 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
2510 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
2511 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
2512 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
2513
2514 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
2515 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
2516 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
2517 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
2518
2519 uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
2520 uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
2521 uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
2522 uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
2523
2524 uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
2525 uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
2526 uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
2527 uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));
2528
2529 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
2530 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
2531 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
2532 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
2533
2534 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
2535 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
2536 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
2537 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
2538
2539 uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
2540 uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
2541 uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
2542 uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
2543
2544 uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
2545 uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
2546 uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
2547 uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));
2548
2549 uint8x8x4_t _dst0;
2550 uint8x8x4_t _dst1;
2551 uint8x8x4_t _dst2;
2552 uint8x8x4_t _dst3;
2553 uint8x8x4_t _dst4;
2554 uint8x8x4_t _dst5;
2555 uint8x8x4_t _dst6;
2556 uint8x8x4_t _dst7;
2557
2558 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
2559 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
2560 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
2561 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
2562 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
2563 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
2564 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
2565 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
2566
2567 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
2568 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
2569 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
2570 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
2571 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
2572 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
2573 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
2574 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
2575
2576 _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
2577 _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
2578 _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
2579 _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
2580 _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
2581 _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
2582 _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
2583 _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
2584
2585 _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
2586 _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
2587 _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
2588 _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
2589 _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
2590 _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
2591 _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
2592 _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
2593
2594 vst4_u8(dst0, _dst0);
2595 vst4_u8(dst1, _dst1);
2596 vst4_u8(dst0 + dst_step, _dst2);
2597 vst4_u8(dst1 + dst_step, _dst3);
2598 vst4_u8(dst0 + 2 * dst_step, _dst4);
2599 vst4_u8(dst1 + 2 * dst_step, _dst5);
2600 vst4_u8(dst0 + 3 * dst_step, _dst6);
2601 vst4_u8(dst1 + 3 * dst_step, _dst7);
2602
2603 src0 += 4 * 8;
2604 src1 += 4 * 8;
2605
2606 dst0 += 4 * dst_step;
2607 dst1 += 4 * dst_step;
2608 }
2609 #else
2610 if (nn > 0)
2611 {
2612 asm volatile(
2613 "0: \n"
2614 "pld [%1, #256] \n"
2615 "vld4.u8 {d0-d3}, [%1], %10 \n"
2616
2617 "pld [%2, #256] \n"
2618 "vld4.u8 {d4-d7}, [%2], %10 \n"
2619
2620 "pld [%1, #256] \n"
2621 "vld4.u8 {d8-d11}, [%1], %10 \n"
2622
2623 "vtrn.u8 q0, q2 \n" // _src01t_r
2624 "vtrn.u8 q1, q3 \n"
2625
2626 "pld [%2, #256] \n"
2627 "vld4.u8 {d12-d15}, [%2], %10\n"
2628
2629 "pld [%1, #256] \n"
2630 "vld4.u8 {d16-d19}, [%1], %10\n"
2631
2632 "vtrn.u8 q4, q6 \n" // _src23t_r
2633 "vtrn.u8 q5, q7 \n"
2634
2635 "pld [%2, #256] \n"
2636 "vld4.u8 {d20-d23}, [%2], %10\n"
2637
2638 "pld [%1, #256] \n"
2639 "vld4.u8 {d24-d27}, [%1], %10\n"
2640
2641 "vtrn.u8 q8, q10 \n" // _src45t_r
2642 "vtrn.u8 q9, q11 \n"
2643
2644 "pld [%2, #256] \n"
2645 "vld4.u8 {d28-d31}, [%2], %10\n"
2646
2647 "vtrn.u8 q12, q14 \n" // _src67t_r
2648 "vtrn.u8 q13, q15 \n"
2649
2650 "sub %1, %1, %10, lsl #2 \n" // restore src0
2651
2652 "vtrn.u16 q0, q4 \n" // _src02tt_r
2653 "vtrn.u16 q1, q5 \n"
2654
2655 "sub %2, %2, %10, lsl #2 \n" // restore src1
2656
2657 "vtrn.u16 q2, q6 \n" // _src13tt_r
2658 "vtrn.u16 q3, q7 \n"
2659
2660 "add %1, #32 \n" // src0 += 32
2661
2662 "vtrn.u16 q8, q12 \n" // _src46tt_r
2663 "vtrn.u16 q9, q13 \n"
2664
2665 "add %2, #32 \n" // src1 += 32
2666
2667 "vtrn.u16 q10, q14 \n" // _src57tt_r
2668 "vtrn.u16 q11, q15 \n"
2669
2670 "vtrn.u32 q0, q8 \n" // _src04ttt_r
2671 "vtrn.u32 q1, q9 \n"
2672
2673 "vtrn.u32 q2, q10 \n" // _src15ttt_r
2674 "vst4.u8 {d0-d3}, [%3], %11 \n"
2675 "vtrn.u32 q3, q11 \n"
2676
2677 "vtrn.u32 q4, q12 \n" // _src26ttt_r
2678 "vst4.u8 {d4-d7}, [%4], %11 \n"
2679 "vtrn.u32 q5, q13 \n"
2680
2681 "vtrn.u32 q6, q14 \n" // _src37ttt_r
2682 "vst4.u8 {d8-d11}, [%3], %11 \n"
2683 "vtrn.u32 q7, q15 \n"
2684
2685 "subs %0, #1 \n"
2686
2687 "vst4.u8 {d16-d19}, [%3], %11\n"
2688 "vst4.u8 {d12-d15}, [%4], %11\n"
2689 "vst4.u8 {d20-d23}, [%4], %11\n"
2690 "vst4.u8 {d24-d27}, [%3], %11\n"
2691 "vst4.u8 {d28-d31}, [%4], %11\n"
2692
2693 "bne 0b \n"
2694 : "=r"(nn), // %0
2695 "=r"(src0), // %1
2696 "=r"(src1), // %2
2697 "=r"(dst0), // %3
2698 "=r"(dst1) // %4
2699 : "0"(nn),
2700 "1"(src0),
2701 "2"(src1),
2702 "3"(dst0),
2703 "4"(dst1),
2704 "r"(src_step), // %10
2705 "r"(dst_step) // %11
2706 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
2707 }
2708 #endif // __aarch64__
2709 for (; remain > 0; remain--)
2710 {
2711 dst0[0] = src0[0];
2712 dst0[1] = src0[1];
2713 dst0[2] = src0[2];
2714 dst0[3] = src0[3];
2715 dst0[4] = src1[0];
2716 dst0[5] = src1[1];
2717 dst0[6] = src1[2];
2718 dst0[7] = src1[3];
2719 dst0[8] = src0[0 + src_step];
2720 dst0[9] = src0[1 + src_step];
2721 dst0[10] = src0[2 + src_step];
2722 dst0[11] = src0[3 + src_step];
2723 dst0[12] = src1[0 + src_step];
2724 dst0[13] = src1[1 + src_step];
2725 dst0[14] = src1[2 + src_step];
2726 dst0[15] = src1[3 + src_step];
2727 dst0[16] = src0[0 + 2 * src_step];
2728 dst0[17] = src0[1 + 2 * src_step];
2729 dst0[18] = src0[2 + 2 * src_step];
2730 dst0[19] = src0[3 + 2 * src_step];
2731 dst0[20] = src1[0 + 2 * src_step];
2732 dst0[21] = src1[1 + 2 * src_step];
2733 dst0[22] = src1[2 + 2 * src_step];
2734 dst0[23] = src1[3 + 2 * src_step];
2735 dst0[24] = src0[0 + 3 * src_step];
2736 dst0[25] = src0[1 + 3 * src_step];
2737 dst0[26] = src0[2 + 3 * src_step];
2738 dst0[27] = src0[3 + 3 * src_step];
2739 dst0[28] = src1[0 + 3 * src_step];
2740 dst0[29] = src1[1 + 3 * src_step];
2741 dst0[30] = src1[2 + 3 * src_step];
2742 dst0[31] = src1[3 + 3 * src_step];
2743
2744 src0 += 4;
2745 src1 += 4;
2746
2747 dst0 += stride;
2748 }
2749
2750 src0 += srcwgap + 7 * srcstride;
2751 }
2752 #endif // __ARM_NEON
2753 for (; y < srch; y++)
2754 {
2755 unsigned char* dst0 = dst + y * 4;
2756
2757 int x = 0;
2758 for (; x < srcw; x++)
2759 {
2760 dst0[0] = src0[0];
2761 dst0[1] = src0[1];
2762 dst0[2] = src0[2];
2763 dst0[3] = src0[3];
2764
2765 src0 += 4;
2766 dst0 += stride;
2767 }
2768
2769 src0 += srcwgap;
2770 }
2771 }
2772
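// kanna_rotate type 6: rotate 90 degrees clockwise, dst(x, w-1-y) = src(y, x),
// for 1-channel pixels; dst columns are filled from right to left as y advances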
2773 static void kanna_rotate_6_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
2774 {
2775 const int srcwgap = srcstride - srcw;
2776
2777     // dstend points one past the end of the first dst row
2778 unsigned char* dstend = dst + w;
2779
2780 const unsigned char* src0 = src;
2781
2782 int y = 0;
2783 #if __ARM_NEON
2784 for (; y + 7 < srch; y += 8)
2785 {
2786 const unsigned char* src1 = src0 + srcstride;
2787
2788 unsigned char* dst0 = dstend - y - 8;
2789 unsigned char* dst1 = dstend - y - 8 + stride;
2790
2791 int src_step = 2 * srcstride;
2792 int dst_step = 2 * stride;
2793
2794 int nn = srcw >> 3;
2795 int remain = srcw - (nn << 3);
2796
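        // the vtrn operands are swapped relative to the type 5 transpose (src1 before src0,
        // and the stores run _dst7.._dst0) so the 8x8 block comes out with its rows reversed,
        // which is the column order a clockwise rotation needs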
2797 #if __aarch64__
2798 for (; nn > 0; nn--)
2799 {
2800 uint8x8_t _src0 = vld1_u8(src0);
2801 uint8x8_t _src1 = vld1_u8(src1);
2802
2803 uint8x8_t _src2 = vld1_u8(src0 + src_step);
2804 uint8x8_t _src3 = vld1_u8(src1 + src_step);
2805
2806 uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
2807 uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
2808
2809 uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
2810 uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
2811
2812 uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
2813 uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
2814 uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
2815 uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);
2816
2817 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
2818 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
2819 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
2820 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
2821
2822 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
2823 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
2824 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
2825 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
2826
2827 uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
2828 uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
2829 uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
2830 uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
2831 uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
2832 uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
2833 uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
2834 uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
2835
2836 vst1_u8(dst0, _dst7);
2837 vst1_u8(dst1, _dst6);
2838 vst1_u8(dst0 + dst_step, _dst5);
2839 vst1_u8(dst1 + dst_step, _dst4);
2840 vst1_u8(dst0 + 2 * dst_step, _dst3);
2841 vst1_u8(dst1 + 2 * dst_step, _dst2);
2842 vst1_u8(dst0 + 3 * dst_step, _dst1);
2843 vst1_u8(dst1 + 3 * dst_step, _dst0);
2844
2845 src0 += 8;
2846 src1 += 8;
2847
2848 dst0 += 4 * dst_step;
2849 dst1 += 4 * dst_step;
2850 }
2851 #else
2852 if (nn > 0)
2853 {
2854 asm volatile(
2855 "0: \n"
2856 "pld [%1, #64] \n"
2857 "vld1.u8 {d0}, [%1], %10 \n"
2858
2859 "pld [%2, #64] \n"
2860 "vld1.u8 {d1}, [%2], %10 \n"
2861
2862 "pld [%1, #64] \n"
2863 "vld1.u8 {d2}, [%1], %10 \n"
2864
2865 "vtrn.u8 d1, d0 \n" // _src01t_r
2866
2867 "pld [%2, #64] \n"
2868 "vld1.u8 {d3}, [%2], %10 \n"
2869
2870 "pld [%1, #64] \n"
2871 "vld1.u8 {d4}, [%1], %10 \n"
2872
2873 "vtrn.u8 d3, d2 \n" // _src23t_r
2874
2875 "pld [%2, #64] \n"
2876 "vld1.u8 {d5}, [%2], %10 \n"
2877
2878 "pld [%1, #64] \n"
2879 "vld1.u8 {d6}, [%1], %10 \n"
2880
2881 "vtrn.u8 d5, d4 \n" // _src45t_r
2882
2883 "pld [%2, #64] \n"
2884 "vld1.u8 {d7}, [%2], %10 \n"
2885
2886 "vtrn.u8 d7, d6 \n" // _src67t_r
2887
2888 "sub %1, %1, %10, lsl #2 \n" // restore src0
2889
2890 "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
2891
2892 "sub %2, %2, %10, lsl #2 \n" // restore src1
2893
2894 "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
2895
2896 "add %1, #8 \n" // src0 += 8
2897
2898 "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
2899
2900 "add %2, #8 \n" // src1 += 8
2901
2902 "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
2903 "vst1.u8 {d6}, [%4], %11 \n"
2904 "vst1.u8 {d7}, [%3], %11 \n"
2905
2906 "subs %0, #1 \n"
2907
2908 "vst1.u8 {d4}, [%4], %11 \n"
2909 "vst1.u8 {d5}, [%3], %11 \n"
2910 "vst1.u8 {d2}, [%4], %11 \n"
2911 "vst1.u8 {d3}, [%3], %11 \n"
2912 "vst1.u8 {d0}, [%4], %11 \n"
2913 "vst1.u8 {d1}, [%3], %11 \n"
2914
2915 "bne 0b \n"
2916 : "=r"(nn), // %0
2917 "=r"(src0), // %1
2918 "=r"(src1), // %2
2919 "=r"(dst0), // %3
2920 "=r"(dst1) // %4
2921 : "0"(nn),
2922 "1"(src0),
2923 "2"(src1),
2924 "3"(dst0),
2925 "4"(dst1),
2926 "r"(src_step), // %10
2927 "r"(dst_step) // %11
2928 : "cc", "memory", "q0", "q1", "q2", "q3");
2929 }
2930 #endif // __aarch64__
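        // scalar tail: each leftover column becomes one dst row of 8 pixels,
        // taking the 8 source rows in reverse order (bottom row first)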
2931 for (; remain > 0; remain--)
2932 {
2933 dst0[0] = src1[0 + 3 * src_step];
2934 dst0[1] = src0[0 + 3 * src_step];
2935 dst0[2] = src1[0 + 2 * src_step];
2936 dst0[3] = src0[0 + 2 * src_step];
2937 dst0[4] = src1[0 + src_step];
2938 dst0[5] = src0[0 + src_step];
2939 dst0[6] = src1[0];
2940 dst0[7] = src0[0];
2941
2942 src0 += 1;
2943 src1 += 1;
2944
2945 dst0 += stride;
2946 }
2947
2948 src0 += srcwgap + 7 * srcstride;
2949 }
2950 #endif // __ARM_NEON
2951 for (; y < srch; y++)
2952 {
2953 unsigned char* dst0 = dstend - y - 1;
2954
2955 int x = 0;
2956 for (; x < srcw; x++)
2957 {
2958 *dst0 = *src0;
2959
2960 src0 += 1;
2961 dst0 += stride;
2962 }
2963
2964 src0 += srcwgap;
2965 }
2966 }
2967
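// kanna_rotate type 6 for 2-channel pixels (e.g. gray + alpha); same layout as the
// 1-channel version, but each pixel copy moves 2 bytes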
2968 static void kanna_rotate_6_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
2969 {
2970 const int srcwgap = srcstride - srcw * 2;
2971
2972     // dstend points one past the end of the first dst row
2973 unsigned char* dstend = dst + w * 2;
2974
2975 const unsigned char* src0 = src;
2976
2977 int y = 0;
2978 #if __ARM_NEON
2979 for (; y + 7 < srch; y += 8)
2980 {
2981 const unsigned char* src1 = src0 + srcstride;
2982
2983 unsigned char* dst0 = dstend - y * 2 - 8 * 2;
2984 unsigned char* dst1 = dstend - y * 2 - 8 * 2 + stride;
2985
2986 int src_step = 2 * srcstride;
2987 int dst_step = 2 * stride;
2988
2989 int nn = srcw >> 3;
2990 int remain = srcw - (nn << 3);
2991
2992 #if __aarch64__
2993 for (; nn > 0; nn--)
2994 {
2995 uint8x8x2_t _src0 = vld2_u8(src0);
2996 uint8x8x2_t _src1 = vld2_u8(src1);
2997
2998 uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
2999 uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
3000
3001 uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
3002 uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
3003
3004 uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
3005 uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
3006
3007 uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
3008 uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
3009 uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
3010 uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
3011
3012 uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
3013 uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
3014 uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
3015 uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
3016
3017 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
3018 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
3019 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
3020 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
3021
3022 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
3023 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
3024 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
3025 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
3026
3027 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
3028 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
3029 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
3030 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
3031
3032 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
3033 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
3034 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
3035 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
3036
3037 uint8x8x2_t _dst0;
3038 uint8x8x2_t _dst1;
3039 uint8x8x2_t _dst2;
3040 uint8x8x2_t _dst3;
3041 uint8x8x2_t _dst4;
3042 uint8x8x2_t _dst5;
3043 uint8x8x2_t _dst6;
3044 uint8x8x2_t _dst7;
3045
3046 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
3047 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
3048 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
3049 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
3050 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
3051 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
3052 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
3053 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
3054
3055 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
3056 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
3057 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
3058 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
3059 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
3060 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
3061 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
3062 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
3063
3064 vst2_u8(dst0, _dst7);
3065 vst2_u8(dst1, _dst6);
3066 vst2_u8(dst0 + dst_step, _dst5);
3067 vst2_u8(dst1 + dst_step, _dst4);
3068 vst2_u8(dst0 + 2 * dst_step, _dst3);
3069 vst2_u8(dst1 + 2 * dst_step, _dst2);
3070 vst2_u8(dst0 + 3 * dst_step, _dst1);
3071 vst2_u8(dst1 + 3 * dst_step, _dst0);
3072
3073 src0 += 2 * 8;
3074 src1 += 2 * 8;
3075
3076 dst0 += 4 * dst_step;
3077 dst1 += 4 * dst_step;
3078 }
3079 #else
3080 if (nn > 0)
3081 {
3082 asm volatile(
3083 "0: \n"
3084 "pld [%1, #128] \n"
3085 "vld2.u8 {d0-d1}, [%1], %10 \n"
3086
3087 "pld [%2, #128] \n"
3088 "vld2.u8 {d2-d3}, [%2], %10 \n"
3089
3090 "pld [%1, #128] \n"
3091 "vld2.u8 {d4-d5}, [%1], %10 \n"
3092
3093 "vtrn.u8 q1, q0 \n" // _src01t_r
3094
3095 "pld [%2, #128] \n"
3096 "vld2.u8 {d6-d7}, [%2], %10 \n"
3097
3098 "pld [%1, #128] \n"
3099 "vld2.u8 {d16-d17}, [%1], %10\n"
3100
3101 "vtrn.u8 q3, q2 \n" // _src23t_r
3102
3103 "pld [%2, #128] \n"
3104 "vld2.u8 {d18-d19}, [%2], %10\n"
3105
3106 "pld [%1, #128] \n"
3107 "vld2.u8 {d20-d21}, [%1], %10\n"
3108
3109 "vtrn.u8 q9, q8 \n" // _src45t_r
3110
3111 "pld [%2, #128] \n"
3112 "vld2.u8 {d22-d23}, [%2], %10\n"
3113
3114 "vtrn.u8 q11, q10 \n" // _src67t_r
3115
3116 "sub %1, %1, %10, lsl #2 \n" // restore src0
3117
3118 "vtrn.u16 q2, q0 \n" // _src02tt_r
3119
3120 "sub %2, %2, %10, lsl #2 \n" // restore src1
3121
3122 "vtrn.u16 q3, q1 \n" // _src13tt_r
3123
3124 "add %1, #16 \n" // src0 += 16
3125
3126 "vtrn.u16 q10, q8 \n" // _src46tt_r
3127
3128 "add %2, #16 \n" // src1 += 16
3129
3130 "vtrn.u16 q11, q9 \n" // _src57tt_r
3131
3132 "vtrn.u32 q10, q2 \n" // _src26ttt_r
3133
3134 "vtrn.u32 q11, q3 \n" // _src37ttt_r
3135 "vst2.u8 {d20-d21}, [%4], %11\n"
3136
3137 "vtrn.u32 q8, q0 \n" // _src04ttt_r
3138 "vst2.u8 {d22-d23}, [%3], %11\n"
3139
3140 "vtrn.u32 q9, q1 \n" // _src15ttt_r
3141 "vst2.u8 {d16-d17}, [%4], %11\n"
3142
3143 "subs %0, #1 \n"
3144
3145 "vst2.u8 {d18-d19}, [%3], %11\n"
3146 "vst2.u8 {d4-d5}, [%4], %11 \n"
3147 "vst2.u8 {d6-d7}, [%3], %11 \n"
3148 "vst2.u8 {d0-d1}, [%4], %11 \n"
3149 "vst2.u8 {d2-d3}, [%3], %11 \n"
3150
3151 "bne 0b \n"
3152 : "=r"(nn), // %0
3153 "=r"(src0), // %1
3154 "=r"(src1), // %2
3155 "=r"(dst0), // %3
3156 "=r"(dst1) // %4
3157 : "0"(nn),
3158 "1"(src0),
3159 "2"(src1),
3160 "3"(dst0),
3161 "4"(dst1),
3162 "r"(src_step), // %10
3163 "r"(dst_step) // %11
3164 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
3165 }
3166 #endif // __aarch64__
3167 for (; remain > 0; remain--)
3168 {
3169 dst0[0] = src1[0 + 3 * src_step];
3170 dst0[1] = src1[1 + 3 * src_step];
3171 dst0[2] = src0[0 + 3 * src_step];
3172 dst0[3] = src0[1 + 3 * src_step];
3173 dst0[4] = src1[0 + 2 * src_step];
3174 dst0[5] = src1[1 + 2 * src_step];
3175 dst0[6] = src0[0 + 2 * src_step];
3176 dst0[7] = src0[1 + 2 * src_step];
3177 dst0[8] = src1[0 + src_step];
3178 dst0[9] = src1[1 + src_step];
3179 dst0[10] = src0[0 + src_step];
3180 dst0[11] = src0[1 + src_step];
3181 dst0[12] = src1[0];
3182 dst0[13] = src1[1];
3183 dst0[14] = src0[0];
3184 dst0[15] = src0[1];
3185
3186 src0 += 2;
3187 src1 += 2;
3188
3189 dst0 += stride;
3190 }
3191
3192 src0 += srcwgap + 7 * srcstride;
3193 }
3194 #endif // __ARM_NEON
3195 for (; y < srch; y++)
3196 {
3197 unsigned char* dst0 = dstend - y * 2 - 2;
3198
3199 int x = 0;
3200 for (; x < srcw; x++)
3201 {
3202 dst0[0] = src0[0];
3203 dst0[1] = src0[1];
3204
3205 src0 += 2;
3206 dst0 += stride;
3207 }
3208
3209 src0 += srcwgap;
3210 }
3211 }
3212
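// kanna_rotate type 6 (90 degrees clockwise) for 3-channel pixels;
// 8x8 tiles per channel plane, stored right to left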
3213 static void kanna_rotate_6_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
3214 {
3215 const int srcwgap = srcstride - srcw * 3;
3216
3217     // dstend points one past the end of the first dst row
3218 unsigned char* dstend = dst + w * 3;
3219
3220 const unsigned char* src0 = src;
3221
3222 int y = 0;
3223 #if __ARM_NEON
3224 for (; y + 7 < srch; y += 8)
3225 {
3226 const unsigned char* src1 = src0 + srcstride;
3227
3228 unsigned char* dst0 = dstend - y * 3 - 8 * 3;
3229 unsigned char* dst1 = dstend - y * 3 - 8 * 3 + stride;
3230
3231 int src_step = 2 * srcstride;
3232 int dst_step = 2 * stride;
3233
3234 int nn = srcw >> 3;
3235 int remain = srcw - (nn << 3);
3236
3237 #if __aarch64__
3238 for (; nn > 0; nn--)
3239 {
3240 uint8x8x3_t _src0 = vld3_u8(src0);
3241 uint8x8x3_t _src1 = vld3_u8(src1);
3242
3243 uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
3244 uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
3245
3246 uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
3247 uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
3248
3249 uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
3250 uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
3251
3252 uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
3253 uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
3254 uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
3255 uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
3256
3257 uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
3258 uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
3259 uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
3260 uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
3261
3262 uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
3263 uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
3264 uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
3265 uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
3266
3267 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
3268 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
3269 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
3270 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
3271
3272 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
3273 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
3274 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
3275 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
3276
3277 uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
3278 uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
3279 uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
3280 uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
3281
3282 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
3283 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
3284 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
3285 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
3286
3287 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
3288 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
3289 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
3290 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
3291
3292 uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
3293 uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
3294 uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
3295 uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
3296
3297 uint8x8x3_t _dst0;
3298 uint8x8x3_t _dst1;
3299 uint8x8x3_t _dst2;
3300 uint8x8x3_t _dst3;
3301 uint8x8x3_t _dst4;
3302 uint8x8x3_t _dst5;
3303 uint8x8x3_t _dst6;
3304 uint8x8x3_t _dst7;
3305
3306 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
3307 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
3308 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
3309 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
3310 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
3311 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
3312 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
3313 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
3314
3315 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
3316 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
3317 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
3318 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
3319 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
3320 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
3321 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
3322 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
3323
3324 _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
3325 _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
3326 _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
3327 _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
3328 _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
3329 _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
3330 _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
3331 _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
3332
3333 vst3_u8(dst0, _dst7);
3334 vst3_u8(dst1, _dst6);
3335 vst3_u8(dst0 + dst_step, _dst5);
3336 vst3_u8(dst1 + dst_step, _dst4);
3337 vst3_u8(dst0 + 2 * dst_step, _dst3);
3338 vst3_u8(dst1 + 2 * dst_step, _dst2);
3339 vst3_u8(dst0 + 3 * dst_step, _dst1);
3340 vst3_u8(dst1 + 3 * dst_step, _dst0);
3341
3342 src0 += 3 * 8;
3343 src1 += 3 * 8;
3344
3345 dst0 += 4 * dst_step;
3346 dst1 += 4 * dst_step;
3347 }
3348 #else
3349 if (nn > 0)
3350 {
3351 asm volatile(
3352 "0: \n"
3353 "pld [%1, #192] \n"
3354 "vld3.u8 {d0-d2}, [%1], %10 \n"
3355
3356 "pld [%2, #192] \n"
3357 "vld3.u8 {d4-d6}, [%2], %10 \n"
3358
3359 "pld [%1, #192] \n"
3360 "vld3.u8 {d8-d10}, [%1], %10 \n"
3361
3362 "vtrn.u8 q2, q0 \n" // _src01t_r
3363 "vtrn.u8 d6, d2 \n"
3364
3365 "pld [%2, #192] \n"
3366 "vld3.u8 {d12-d14}, [%2], %10\n"
3367
3368 "pld [%1, #192] \n"
3369 "vld3.u8 {d16-d18}, [%1], %10\n"
3370
3371 "vtrn.u8 q6, q4 \n" // _src23t_r
3372 "vtrn.u8 d14, d10 \n"
3373
3374 "pld [%2, #192] \n"
3375 "vld3.u8 {d20-d22}, [%2], %10\n"
3376
3377 "pld [%1, #192] \n"
3378 "vld3.u8 {d24-d26}, [%1], %10\n"
3379
3380 "vtrn.u8 q10, q8 \n" // _src45t_r
3381 "vtrn.u8 d22, d18 \n"
3382
3383 "pld [%2, #192] \n"
3384 "vld3.u8 {d28-d30}, [%2], %10\n"
3385
3386 "vtrn.u8 q14, q12 \n" // _src67t_r
3387 "vtrn.u8 d30, d26 \n"
3388
3389 "sub %1, %1, %10, lsl #2 \n" // restore src0
3390
3391 "vtrn.u16 q4, q0 \n" // _src02tt_r
3392 "vtrn.u16 d10, d2 \n"
3393
3394 "sub %2, %2, %10, lsl #2 \n" // restore src1
3395
3396 "vtrn.u16 q6, q2 \n" // _src13tt_r
3397 "vtrn.u16 d14, d6 \n"
3398
3399 "add %1, #24 \n" // src0 += 24
3400
3401 "vtrn.u16 q12, q8 \n" // _src46tt_r
3402 "vtrn.u16 d26, d18 \n"
3403
3404 "add %2, #24 \n" // src1 += 24
3405
3406 "vtrn.u16 q14, q10 \n" // _src57tt_r
3407 "vtrn.u16 d30, d22 \n"
3408
3409 "vtrn.u32 q12, q4 \n" // _src26ttt_r
3410 "vtrn.u32 d26, d10 \n"
3411
3412 "vtrn.u32 q14, q6 \n" // _src37ttt_r
3413 "vst3.u8 {d24-d26}, [%4], %11\n"
3414 "vtrn.u32 d30, d14 \n"
3415
3416 "vtrn.u32 q8, q0 \n" // _src04ttt_r
3417 "vst3.u8 {d28-d30}, [%3], %11\n"
3418 "vtrn.u32 d18, d2 \n"
3419
3420 "vtrn.u32 q10, q2 \n" // _src15ttt_r
3421 "vst3.u8 {d16-d18}, [%4], %11\n"
3422 "vtrn.u32 d22, d6 \n"
3423
3424 "subs %0, #1 \n"
3425
3426 "vst3.u8 {d20-d22}, [%3], %11\n"
3427 "vst3.u8 {d8-d10}, [%4], %11 \n"
3428 "vst3.u8 {d12-d14}, [%3], %11\n"
3429 "vst3.u8 {d0-d2}, [%4], %11 \n"
3430 "vst3.u8 {d4-d6}, [%3], %11 \n"
3431
3432 "bne 0b \n"
3433 : "=r"(nn), // %0
3434 "=r"(src0), // %1
3435 "=r"(src1), // %2
3436 "=r"(dst0), // %3
3437 "=r"(dst1) // %4
3438 : "0"(nn),
3439 "1"(src0),
3440 "2"(src1),
3441 "3"(dst0),
3442 "4"(dst1),
3443 "r"(src_step), // %10
3444 "r"(dst_step) // %11
3445 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
3446 }
3447 #endif // __aarch64__
3448 for (; remain > 0; remain--)
3449 {
3450 dst0[0] = src1[0 + 3 * src_step];
3451 dst0[1] = src1[1 + 3 * src_step];
3452 dst0[2] = src1[2 + 3 * src_step];
3453 dst0[3] = src0[0 + 3 * src_step];
3454 dst0[4] = src0[1 + 3 * src_step];
3455 dst0[5] = src0[2 + 3 * src_step];
3456 dst0[6] = src1[0 + 2 * src_step];
3457 dst0[7] = src1[1 + 2 * src_step];
3458 dst0[8] = src1[2 + 2 * src_step];
3459 dst0[9] = src0[0 + 2 * src_step];
3460 dst0[10] = src0[1 + 2 * src_step];
3461 dst0[11] = src0[2 + 2 * src_step];
3462 dst0[12] = src1[0 + src_step];
3463 dst0[13] = src1[1 + src_step];
3464 dst0[14] = src1[2 + src_step];
3465 dst0[15] = src0[0 + src_step];
3466 dst0[16] = src0[1 + src_step];
3467 dst0[17] = src0[2 + src_step];
3468 dst0[18] = src1[0];
3469 dst0[19] = src1[1];
3470 dst0[20] = src1[2];
3471 dst0[21] = src0[0];
3472 dst0[22] = src0[1];
3473 dst0[23] = src0[2];
3474
3475 src0 += 3;
3476 src1 += 3;
3477
3478 dst0 += stride;
3479 }
3480
3481 src0 += srcwgap + 7 * srcstride;
3482 }
3483 #endif // __ARM_NEON
3484 for (; y < srch; y++)
3485 {
3486 unsigned char* dst0 = dstend - y * 3 - 3;
3487
3488 int x = 0;
3489 for (; x < srcw; x++)
3490 {
3491 dst0[0] = src0[0];
3492 dst0[1] = src0[1];
3493 dst0[2] = src0[2];
3494
3495 src0 += 3;
3496 dst0 += stride;
3497 }
3498
3499 src0 += srcwgap;
3500 }
3501 }
3502
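// kanna_rotate type 6 (90 degrees clockwise) for 4-channel pixels;
// the alpha plane rides through the same per-plane transpose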
3503 static void kanna_rotate_6_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int /*h*/, int stride)
3504 {
3505 const int srcwgap = srcstride - srcw * 4;
3506
3507     // dstend points one past the end of the first dst row
3508 unsigned char* dstend = dst + w * 4;
3509
3510 const unsigned char* src0 = src;
3511
3512 int y = 0;
3513 #if __ARM_NEON
3514 for (; y + 7 < srch; y += 8)
3515 {
3516 const unsigned char* src1 = src0 + srcstride;
3517
3518 unsigned char* dst0 = dstend - y * 4 - 8 * 4;
3519 unsigned char* dst1 = dstend - y * 4 - 8 * 4 + stride;
3520
3521 int src_step = 2 * srcstride;
3522 int dst_step = 2 * stride;
3523
3524 int nn = srcw >> 3;
3525 int remain = srcw - (nn << 3);
3526
3527 #if __aarch64__
3528 for (; nn > 0; nn--)
3529 {
3530 uint8x8x4_t _src0 = vld4_u8(src0);
3531 uint8x8x4_t _src1 = vld4_u8(src1);
3532
3533 uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
3534 uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
3535
3536 uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
3537 uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
3538
3539 uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
3540 uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
3541
3542 uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
3543 uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
3544 uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
3545 uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
3546
3547 uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
3548 uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
3549 uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
3550 uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
3551
3552 uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
3553 uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
3554 uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
3555 uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
3556
3557 uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
3558 uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
3559 uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
3560 uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);
3561
3562 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
3563 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
3564 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
3565 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
3566
3567 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
3568 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
3569 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
3570 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
3571
3572 uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
3573 uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
3574 uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
3575 uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
3576
3577 uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
3578 uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
3579 uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
3580 uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));
3581
3582 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
3583 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
3584 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
3585 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
3586
3587 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
3588 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
3589 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
3590 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
3591
3592 uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
3593 uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
3594 uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
3595 uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
3596
3597 uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
3598 uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
3599 uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
3600 uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));
3601
3602 uint8x8x4_t _dst0;
3603 uint8x8x4_t _dst1;
3604 uint8x8x4_t _dst2;
3605 uint8x8x4_t _dst3;
3606 uint8x8x4_t _dst4;
3607 uint8x8x4_t _dst5;
3608 uint8x8x4_t _dst6;
3609 uint8x8x4_t _dst7;
3610
3611 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
3612 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
3613 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
3614 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
3615 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
3616 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
3617 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
3618 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
3619
3620 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
3621 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
3622 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
3623 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
3624 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
3625 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
3626 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
3627 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
3628
3629 _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
3630 _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
3631 _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
3632 _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
3633 _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
3634 _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
3635 _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
3636 _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
3637
3638 _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
3639 _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
3640 _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
3641 _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
3642 _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
3643 _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
3644 _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
3645 _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
3646
3647 vst4_u8(dst0, _dst7);
3648 vst4_u8(dst1, _dst6);
3649 vst4_u8(dst0 + dst_step, _dst5);
3650 vst4_u8(dst1 + dst_step, _dst4);
3651 vst4_u8(dst0 + 2 * dst_step, _dst3);
3652 vst4_u8(dst1 + 2 * dst_step, _dst2);
3653 vst4_u8(dst0 + 3 * dst_step, _dst1);
3654 vst4_u8(dst1 + 3 * dst_step, _dst0);
3655
3656 src0 += 4 * 8;
3657 src1 += 4 * 8;
3658
3659 dst0 += 4 * dst_step;
3660 dst1 += 4 * dst_step;
3661 }
3662 #else
3663 if (nn > 0)
3664 {
3665 asm volatile(
3666 "0: \n"
3667 "pld [%1, #256] \n"
3668 "vld4.u8 {d0-d3}, [%1], %10 \n"
3669
3670 "pld [%2, #256] \n"
3671 "vld4.u8 {d4-d7}, [%2], %10 \n"
3672
3673 "pld [%1, #256] \n"
3674 "vld4.u8 {d8-d11}, [%1], %10 \n"
3675
3676 "vtrn.u8 q2, q0 \n" // _src01t_r
3677 "vtrn.u8 q3, q1 \n"
3678
3679 "pld [%2, #256] \n"
3680 "vld4.u8 {d12-d15}, [%2], %10\n"
3681
3682 "pld [%1, #256] \n"
3683 "vld4.u8 {d16-d19}, [%1], %10\n"
3684
3685 "vtrn.u8 q6, q4 \n" // _src23t_r
3686 "vtrn.u8 q7, q5 \n"
3687
3688 "pld [%2, #256] \n"
3689 "vld4.u8 {d20-d23}, [%2], %10\n"
3690
3691 "pld [%1, #256] \n"
3692 "vld4.u8 {d24-d27}, [%1], %10\n"
3693
3694 "vtrn.u8 q10, q8 \n" // _src45t_r
3695 "vtrn.u8 q11, q9 \n"
3696
3697 "pld [%2, #256] \n"
3698 "vld4.u8 {d28-d31}, [%2], %10\n"
3699
3700 "vtrn.u8 q14, q12 \n" // _src67t_r
3701 "vtrn.u8 q15, q13 \n"
3702
3703 "sub %1, %1, %10, lsl #2 \n" // restore src0
3704
3705 "vtrn.u16 q4, q0 \n" // _src02tt_r
3706 "vtrn.u16 q5, q1 \n"
3707
3708 "sub %2, %2, %10, lsl #2 \n" // restore src1
3709
3710 "vtrn.u16 q6, q2 \n" // _src13tt_r
3711 "vtrn.u16 q7, q3 \n"
3712
3713 "add %1, #32 \n" // src0 += 32
3714
3715 "vtrn.u16 q12, q8 \n" // _src46tt_r
3716 "vtrn.u16 q13, q9 \n"
3717
3718 "add %2, #32 \n" // src1 += 32
3719
3720 "vtrn.u16 q14, q10 \n" // _src57tt_r
3721 "vtrn.u16 q15, q11 \n"
3722
3723 "vtrn.u32 q12, q4 \n" // _src26ttt_r
3724 "vtrn.u32 q13, q5 \n"
3725
3726 "vtrn.u32 q14, q6 \n" // _src37ttt_r
3727 "vst4.u8 {d24-d27}, [%4], %11\n"
3728 "vtrn.u32 q15, q7 \n"
3729
3730 "vtrn.u32 q8, q0 \n" // _src04ttt_r
3731 "vst4.u8 {d28-d31}, [%3], %11\n"
3732 "vtrn.u32 q9, q1 \n"
3733
3734 "vtrn.u32 q10, q2 \n" // _src15ttt_r
3735 "vst4.u8 {d16-d19}, [%4], %11\n"
3736 "vtrn.u32 q11, q3 \n"
3737
3738 "subs %0, #1 \n"
3739
3740 "vst4.u8 {d8-d11}, [%4], %11 \n"
3741 "vst4.u8 {d20-d23}, [%3], %11\n"
3742 "vst4.u8 {d12-d15}, [%3], %11\n"
3743 "vst4.u8 {d0-d3}, [%4], %11 \n"
3744 "vst4.u8 {d4-d7}, [%3], %11 \n"
3745
3746 "bne 0b \n"
3747 : "=r"(nn), // %0
3748 "=r"(src0), // %1
3749 "=r"(src1), // %2
3750 "=r"(dst0), // %3
3751 "=r"(dst1) // %4
3752 : "0"(nn),
3753 "1"(src0),
3754 "2"(src1),
3755 "3"(dst0),
3756 "4"(dst1),
3757 "r"(src_step), // %10
3758 "r"(dst_step) // %11
3759 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
3760 }
3761 #endif // __aarch64__
3762 for (; remain > 0; remain--)
3763 {
3764 dst0[0] = src1[0 + 3 * src_step];
3765 dst0[1] = src1[1 + 3 * src_step];
3766 dst0[2] = src1[2 + 3 * src_step];
3767 dst0[3] = src1[3 + 3 * src_step];
3768 dst0[4] = src0[0 + 3 * src_step];
3769 dst0[5] = src0[1 + 3 * src_step];
3770 dst0[6] = src0[2 + 3 * src_step];
3771 dst0[7] = src0[3 + 3 * src_step];
3772 dst0[8] = src1[0 + 2 * src_step];
3773 dst0[9] = src1[1 + 2 * src_step];
3774 dst0[10] = src1[2 + 2 * src_step];
3775 dst0[11] = src1[3 + 2 * src_step];
3776 dst0[12] = src0[0 + 2 * src_step];
3777 dst0[13] = src0[1 + 2 * src_step];
3778 dst0[14] = src0[2 + 2 * src_step];
3779 dst0[15] = src0[3 + 2 * src_step];
3780 dst0[16] = src1[0 + src_step];
3781 dst0[17] = src1[1 + src_step];
3782 dst0[18] = src1[2 + src_step];
3783 dst0[19] = src1[3 + src_step];
3784 dst0[20] = src0[0 + src_step];
3785 dst0[21] = src0[1 + src_step];
3786 dst0[22] = src0[2 + src_step];
3787 dst0[23] = src0[3 + src_step];
3788 dst0[24] = src1[0];
3789 dst0[25] = src1[1];
3790 dst0[26] = src1[2];
3791 dst0[27] = src1[3];
3792 dst0[28] = src0[0];
3793 dst0[29] = src0[1];
3794 dst0[30] = src0[2];
3795 dst0[31] = src0[3];
3796
3797 src0 += 4;
3798 src1 += 4;
3799
3800 dst0 += stride;
3801 }
3802
3803 src0 += srcwgap + 7 * srcstride;
3804 }
3805 #endif // __ARM_NEON
3806 for (; y < srch; y++)
3807 {
3808 unsigned char* dst0 = dstend - y * 4 - 4;
3809
3810 int x = 0;
3811 for (; x < srcw; x++)
3812 {
3813 dst0[0] = src0[0];
3814 dst0[1] = src0[1];
3815 dst0[2] = src0[2];
3816 dst0[3] = src0[3];
3817
3818 src0 += 4;
3819 dst0 += stride;
3820 }
3821
3822 src0 += srcwgap;
3823 }
3824 }
3825
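// kanna_rotate_7_*: transverse flip, i.e. a transpose combined with a 180-degree
// rotation, so dst(h-1-x, w-1-y) = src(y, x); this presumably corresponds to EXIF
// orientation 7. The NEON paths below transpose 8x8 pixel tiles and write them
// bottom-up into the destination.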
3826 static void kanna_rotate_7_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
3827 {
3828 const int srcwgap = srcstride - srcw;
3829
3830 // point to the last dst pixel
3831 unsigned char* dstend = dst + stride * (h - 1) + w;
3832
3833 const unsigned char* src0 = src;
3834
3835 int y = 0;
3836 #if __ARM_NEON
3837 for (; y + 7 < srch; y += 8)
3838 {
3839 const unsigned char* src1 = src0 + srcstride;
3840
3841 unsigned char* dst6 = dstend - y - 8 - stride;
3842 unsigned char* dst7 = dstend - y - 8;
3843
3844 int src_step = 2 * srcstride;
3845 int dst_step = -2 * stride;
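        // dst7 targets the bottom destination row of this 8-column tile and dst6 the row
        // above it; dst_step is negative, so successive stores climb upward two rows at a time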
3846
3847 int nn = srcw >> 3;
3848 int remain = srcw - (nn << 3);
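        // the row pair is processed in 8-column tiles (nn); leftover columns (remain) are copied scalar-wise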
3849
3850 #if __aarch64__
3851 for (; nn > 0; nn--)
3852 {
3853 uint8x8_t _src0 = vld1_u8(src0);
3854 uint8x8_t _src1 = vld1_u8(src1);
3855
3856 uint8x8_t _src2 = vld1_u8(src0 + src_step);
3857 uint8x8_t _src3 = vld1_u8(src1 + src_step);
3858
3859 uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
3860 uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
3861
3862 uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
3863 uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
3864
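            // 8x8 byte transpose built from a vtrn cascade at u8, u16 and u32 granularity;
            // the operands are swapped relative to a plain transpose so the rows come out
            // in the reversed order this rotation needs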
3865 uint8x8x2_t _src01t_r = vtrn_u8(_src1, _src0);
3866 uint8x8x2_t _src23t_r = vtrn_u8(_src3, _src2);
3867 uint8x8x2_t _src45t_r = vtrn_u8(_src5, _src4);
3868 uint8x8x2_t _src67t_r = vtrn_u8(_src7, _src6);
3869
3870 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
3871 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
3872 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
3873 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
3874
3875 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
3876 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
3877 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
3878 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
3879
3880 uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
3881 uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
3882 uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
3883 uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
3884 uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
3885 uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
3886 uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
3887 uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
3888
3889 vst1_u8(dst7, _dst7);
3890 vst1_u8(dst6, _dst6);
3891 vst1_u8(dst7 + dst_step, _dst5);
3892 vst1_u8(dst6 + dst_step, _dst4);
3893 vst1_u8(dst7 + 2 * dst_step, _dst3);
3894 vst1_u8(dst6 + 2 * dst_step, _dst2);
3895 vst1_u8(dst7 + 3 * dst_step, _dst1);
3896 vst1_u8(dst6 + 3 * dst_step, _dst0);
3897
3898 src0 += 8;
3899 src1 += 8;
3900
3901 dst7 += 4 * dst_step;
3902 dst6 += 4 * dst_step;
3903 }
3904 #else
3905 if (nn > 0)
3906 {
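            // armv7 twin of the intrinsic loop above: %10 is src_step, %11 is dst_step,
            // and the vtrn.u8/u16/u32 chain performs the same 8x8 transpose in d0-d7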
3907 asm volatile(
3908 "0: \n"
3909 "pld [%1, #64] \n"
3910 "vld1.u8 {d0}, [%1], %10 \n"
3911
3912 "pld [%2, #64] \n"
3913 "vld1.u8 {d1}, [%2], %10 \n"
3914
3915 "pld [%1, #64] \n"
3916 "vld1.u8 {d2}, [%1], %10 \n"
3917
3918 "vtrn.u8 d1, d0 \n" // _src01t_r
3919
3920 "pld [%2, #64] \n"
3921 "vld1.u8 {d3}, [%2], %10 \n"
3922
3923 "pld [%1, #64] \n"
3924 "vld1.u8 {d4}, [%1], %10 \n"
3925
3926 "vtrn.u8 d3, d2 \n" // _src23t_r
3927
3928 "pld [%2, #64] \n"
3929 "vld1.u8 {d5}, [%2], %10 \n"
3930
3931 "pld [%1, #64] \n"
3932 "vld1.u8 {d6}, [%1], %10 \n"
3933
3934 "vtrn.u8 d5, d4 \n" // _src45t_r
3935
3936 "pld [%2, #64] \n"
3937 "vld1.u8 {d7}, [%2], %10 \n"
3938
3939 "vtrn.u8 d7, d6 \n" // _src67t_r
3940
3941 "sub %1, %1, %10, lsl #2 \n" // restore src0
3942
3943 "vtrn.u16 q1, q0 \n" // _src02tt_r _src13tt_r
3944
3945 "sub %2, %2, %10, lsl #2 \n" // restore src1
3946
3947 "vtrn.u16 q3, q2 \n" // _src46tt_r _src57tt_r
3948
3949 "add %1, #8 \n" // src0 += 8
3950
3951 "vtrn.u32 q3, q1 \n" // _src26ttt_r _src37ttt_r
3952
3953 "add %2, #8 \n" // src1 += 8
3954
3955 "vtrn.u32 q2, q0 \n" // _src04ttt_r _src15ttt_r
3956 "vst1.u8 {d6}, [%4], %11 \n"
3957 "vst1.u8 {d7}, [%3], %11 \n"
3958
3959 "subs %0, #1 \n"
3960
3961 "vst1.u8 {d4}, [%4], %11 \n"
3962 "vst1.u8 {d5}, [%3], %11 \n"
3963 "vst1.u8 {d2}, [%4], %11 \n"
3964 "vst1.u8 {d3}, [%3], %11 \n"
3965 "vst1.u8 {d0}, [%4], %11 \n"
3966 "vst1.u8 {d1}, [%3], %11 \n"
3967
3968 "bne 0b \n"
3969 : "=r"(nn), // %0
3970 "=r"(src0), // %1
3971 "=r"(src1), // %2
3972 "=r"(dst7), // %3
3973 "=r"(dst6) // %4
3974 : "0"(nn),
3975 "1"(src0),
3976 "2"(src1),
3977 "3"(dst7),
3978 "4"(dst6),
3979 "r"(src_step), // %10
3980 "r"(dst_step) // %11
3981 : "cc", "memory", "q0", "q1", "q2", "q3");
3982 }
3983 #endif // __aarch64__
3984 for (; remain > 0; remain--)
3985 {
3986 dst7[0] = src1[0 + 3 * src_step];
3987 dst7[1] = src0[0 + 3 * src_step];
3988 dst7[2] = src1[0 + 2 * src_step];
3989 dst7[3] = src0[0 + 2 * src_step];
3990 dst7[4] = src1[0 + src_step];
3991 dst7[5] = src0[0 + src_step];
3992 dst7[6] = src1[0];
3993 dst7[7] = src0[0];
3994
3995 src0 += 1;
3996 src1 += 1;
3997
3998 dst7 -= stride;
3999 }
4000
4001 src0 += srcwgap + 7 * srcstride;
4002 }
4003 #endif // __ARM_NEON
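    // leftover rows (and the non-NEON build) fall back to a plain per-pixel copy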
4004 for (; y < srch; y++)
4005 {
4006 unsigned char* dst0 = dstend - y - 1;
4007
4008 int x = 0;
4009 for (; x < srcw; x++)
4010 {
4011 *dst0 = *src0;
4012
4013 src0 += 1;
4014 dst0 -= stride;
4015 }
4016
4017 src0 += srcwgap;
4018 }
4019 }
4020
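// 2-channel variant of the transverse rotation: vld2/vst2 de-interleave the two
// channels so each plane runs through the same 8x8 tile transpose independently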
4021 static void kanna_rotate_7_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
4022 {
4023 const int srcwgap = srcstride - srcw * 2;
4024
4025 // point to the last dst pixel
4026 unsigned char* dstend = dst + stride * (h - 1) + w * 2;
4027
4028 const unsigned char* src0 = src;
4029
4030 int y = 0;
4031 #if __ARM_NEON
4032 for (; y + 7 < srch; y += 8)
4033 {
4034 const unsigned char* src1 = src0 + srcstride;
4035
4036 unsigned char* dst6 = dstend - y * 2 - 8 * 2 - stride;
4037 unsigned char* dst7 = dstend - y * 2 - 8 * 2;
4038
4039 int src_step = 2 * srcstride;
4040 int dst_step = -2 * stride;
4041
4042 int nn = srcw >> 3;
4043 int remain = srcw - (nn << 3);
4044
4045 #if __aarch64__
4046 for (; nn > 0; nn--)
4047 {
4048 uint8x8x2_t _src0 = vld2_u8(src0);
4049 uint8x8x2_t _src1 = vld2_u8(src1);
4050
4051 uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
4052 uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
4053
4054 uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
4055 uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
4056
4057 uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
4058 uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
4059
4060 uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
4061 uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
4062 uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
4063 uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
4064
4065 uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
4066 uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
4067 uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
4068 uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
4069
4070 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
4071 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
4072 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
4073 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
4074
4075 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
4076 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
4077 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
4078 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
4079
4080 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
4081 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
4082 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
4083 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
4084
4085 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
4086 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
4087 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
4088 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
4089
4090 uint8x8x2_t _dst0;
4091 uint8x8x2_t _dst1;
4092 uint8x8x2_t _dst2;
4093 uint8x8x2_t _dst3;
4094 uint8x8x2_t _dst4;
4095 uint8x8x2_t _dst5;
4096 uint8x8x2_t _dst6;
4097 uint8x8x2_t _dst7;
4098
4099 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
4100 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
4101 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
4102 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
4103 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
4104 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
4105 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
4106 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
4107
4108 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
4109 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
4110 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
4111 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
4112 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
4113 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
4114 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
4115 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
4116
4117 vst2_u8(dst7, _dst7);
4118 vst2_u8(dst6, _dst6);
4119 vst2_u8(dst7 + dst_step, _dst5);
4120 vst2_u8(dst6 + dst_step, _dst4);
4121 vst2_u8(dst7 + 2 * dst_step, _dst3);
4122 vst2_u8(dst6 + 2 * dst_step, _dst2);
4123 vst2_u8(dst7 + 3 * dst_step, _dst1);
4124 vst2_u8(dst6 + 3 * dst_step, _dst0);
4125
4126 src0 += 2 * 8;
4127 src1 += 2 * 8;
4128
4129 dst7 += 4 * dst_step;
4130 dst6 += 4 * dst_step;
4131 }
4132 #else
4133 if (nn > 0)
4134 {
4135 asm volatile(
4136 "0: \n"
4137 "pld [%1, #128] \n"
4138 "vld2.u8 {d0-d1}, [%1], %10 \n"
4139
4140 "pld [%2, #128] \n"
4141 "vld2.u8 {d2-d3}, [%2], %10 \n"
4142
4143 "pld [%1, #128] \n"
4144 "vld2.u8 {d4-d5}, [%1], %10 \n"
4145
4146 "vtrn.u8 q1, q0 \n" // _src01t_r
4147
4148 "pld [%2, #128] \n"
4149 "vld2.u8 {d6-d7}, [%2], %10 \n"
4150
4151 "pld [%1, #128] \n"
4152 "vld2.u8 {d16-d17}, [%1], %10\n"
4153
4154 "vtrn.u8 q3, q2 \n" // _src23t_r
4155
4156 "pld [%2, #128] \n"
4157 "vld2.u8 {d18-d19}, [%2], %10\n"
4158
4159 "pld [%1, #128] \n"
4160 "vld2.u8 {d20-d21}, [%1], %10\n"
4161
4162 "vtrn.u8 q9, q8 \n" // _src45t_r
4163
4164 "pld [%2, #128] \n"
4165 "vld2.u8 {d22-d23}, [%2], %10\n"
4166
4167 "vtrn.u8 q11, q10 \n" // _src67t_r
4168
4169 "sub %1, %1, %10, lsl #2 \n" // restore src0
4170
4171 "vtrn.u16 q2, q0 \n" // _src02tt_r
4172
4173 "sub %2, %2, %10, lsl #2 \n" // restore src1
4174
4175 "vtrn.u16 q3, q1 \n" // _src13tt_r
4176
4177 "add %1, #16 \n" // src0 += 16
4178
4179 "vtrn.u16 q10, q8 \n" // _src46tt_r
4180
4181 "add %2, #16 \n" // src1 += 16
4182
4183 "vtrn.u16 q11, q9 \n" // _src57tt_r
4184
4185 "vtrn.u32 q10, q2 \n" // _src26ttt_r
4186
4187 "vtrn.u32 q11, q3 \n" // _src37ttt_r
4188 "vst2.u8 {d20-d21}, [%4], %11\n"
4189
4190 "vtrn.u32 q8, q0 \n" // _src04ttt_r
4191 "vst2.u8 {d22-d23}, [%3], %11\n"
4192
4193 "vtrn.u32 q9, q1 \n" // _src15ttt_r
4194 "vst2.u8 {d16-d17}, [%4], %11\n"
4195
4196 "subs %0, #1 \n"
4197
4198 "vst2.u8 {d4-d5}, [%4], %11 \n"
4199 "vst2.u8 {d18-d19}, [%3], %11\n"
4200 "vst2.u8 {d6-d7}, [%3], %11 \n"
4201 "vst2.u8 {d0-d1}, [%4], %11 \n"
4202 "vst2.u8 {d2-d3}, [%3], %11 \n"
4203
4204 "bne 0b \n"
4205 : "=r"(nn), // %0
4206 "=r"(src0), // %1
4207 "=r"(src1), // %2
4208 "=r"(dst7), // %3
4209 "=r"(dst6) // %4
4210 : "0"(nn),
4211 "1"(src0),
4212 "2"(src1),
4213 "3"(dst7),
4214 "4"(dst6),
4215 "r"(src_step), // %10
4216 "r"(dst_step) // %11
4217 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
4218 }
4219 #endif // __aarch64__
4220 for (; remain > 0; remain--)
4221 {
4222 dst7[0] = src1[0 + 3 * src_step];
4223 dst7[1] = src1[1 + 3 * src_step];
4224 dst7[2] = src0[0 + 3 * src_step];
4225 dst7[3] = src0[1 + 3 * src_step];
4226 dst7[4] = src1[0 + 2 * src_step];
4227 dst7[5] = src1[1 + 2 * src_step];
4228 dst7[6] = src0[0 + 2 * src_step];
4229 dst7[7] = src0[1 + 2 * src_step];
4230 dst7[8] = src1[0 + src_step];
4231 dst7[9] = src1[1 + src_step];
4232 dst7[10] = src0[0 + src_step];
4233 dst7[11] = src0[1 + src_step];
4234 dst7[12] = src1[0];
4235 dst7[13] = src1[1];
4236 dst7[14] = src0[0];
4237 dst7[15] = src0[1];
4238
4239 src0 += 2;
4240 src1 += 2;
4241
4242 dst7 -= stride;
4243 }
4244
4245 src0 += srcwgap + 7 * srcstride;
4246 }
4247 #endif // __ARM_NEON
4248 for (; y < srch; y++)
4249 {
4250 unsigned char* dst0 = dstend - y * 2 - 2;
4251
4252 int x = 0;
4253 for (; x < srcw; x++)
4254 {
4255 dst0[0] = src0[0];
4256 dst0[1] = src0[1];
4257
4258 src0 += 2;
4259 dst0 -= stride;
4260 }
4261
4262 src0 += srcwgap;
4263 }
4264 }
4265
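// 3-channel variant (e.g. RGB): vld3/vst3 split the channels and each plane is
// transposed separately before being re-interleaved on store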
4266 static void kanna_rotate_7_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
4267 {
4268 const int srcwgap = srcstride - srcw * 3;
4269
4270 // point to the last dst pixel
4271 unsigned char* dstend = dst + stride * (h - 1) + w * 3;
4272
4273 const unsigned char* src0 = src;
4274
4275 int y = 0;
4276 #if __ARM_NEON
4277 for (; y + 7 < srch; y += 8)
4278 {
4279 const unsigned char* src1 = src0 + srcstride;
4280
4281 unsigned char* dst6 = dstend - y * 3 - 8 * 3 - stride;
4282 unsigned char* dst7 = dstend - y * 3 - 8 * 3;
4283
4284 int src_step = 2 * srcstride;
4285 int dst_step = -2 * stride;
4286
4287 int nn = srcw >> 3;
4288 int remain = srcw - (nn << 3);
4289
4290 #if __aarch64__
4291 for (; nn > 0; nn--)
4292 {
4293 uint8x8x3_t _src0 = vld3_u8(src0);
4294 uint8x8x3_t _src1 = vld3_u8(src1);
4295
4296 uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
4297 uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
4298
4299 uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
4300 uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
4301
4302 uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
4303 uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
4304
4305 uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
4306 uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
4307 uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
4308 uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
4309
4310 uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
4311 uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
4312 uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
4313 uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
4314
4315 uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
4316 uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
4317 uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
4318 uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
4319
4320 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
4321 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
4322 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
4323 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
4324
4325 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
4326 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
4327 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
4328 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
4329
4330 uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
4331 uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
4332 uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
4333 uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
4334
4335 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
4336 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
4337 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
4338 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
4339
4340 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
4341 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
4342 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
4343 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
4344
4345 uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
4346 uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
4347 uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
4348 uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
4349
4350 uint8x8x3_t _dst0;
4351 uint8x8x3_t _dst1;
4352 uint8x8x3_t _dst2;
4353 uint8x8x3_t _dst3;
4354 uint8x8x3_t _dst4;
4355 uint8x8x3_t _dst5;
4356 uint8x8x3_t _dst6;
4357 uint8x8x3_t _dst7;
4358
4359 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
4360 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
4361 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
4362 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
4363 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
4364 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
4365 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
4366 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
4367
4368 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
4369 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
4370 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
4371 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
4372 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
4373 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
4374 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
4375 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
4376
4377 _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
4378 _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
4379 _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
4380 _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
4381 _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
4382 _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
4383 _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
4384 _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
4385
4386 vst3_u8(dst7, _dst7);
4387 vst3_u8(dst6, _dst6);
4388 vst3_u8(dst7 + dst_step, _dst5);
4389 vst3_u8(dst6 + dst_step, _dst4);
4390 vst3_u8(dst7 + 2 * dst_step, _dst3);
4391 vst3_u8(dst6 + 2 * dst_step, _dst2);
4392 vst3_u8(dst7 + 3 * dst_step, _dst1);
4393 vst3_u8(dst6 + 3 * dst_step, _dst0);
4394
4395 src0 += 3 * 8;
4396 src1 += 3 * 8;
4397
4398 dst7 += 4 * dst_step;
4399 dst6 += 4 * dst_step;
4400 }
4401 #else
4402 if (nn > 0)
4403 {
4404 asm volatile(
4405 "0: \n"
4406 "pld [%1, #192] \n"
4407 "vld3.u8 {d0-d2}, [%1], %10 \n"
4408
4409 "pld [%2, #192] \n"
4410 "vld3.u8 {d4-d6}, [%2], %10 \n"
4411
4412 "pld [%1, #192] \n"
4413 "vld3.u8 {d8-d10}, [%1], %10 \n"
4414
4415 "vtrn.u8 q2, q0 \n" // _src01t_r
4416 "vtrn.u8 d6, d2 \n"
4417
4418 "pld [%2, #192] \n"
4419 "vld3.u8 {d12-d14}, [%2], %10\n"
4420
4421 "pld [%1, #192] \n"
4422 "vld3.u8 {d16-d18}, [%1], %10\n"
4423
4424 "vtrn.u8 q6, q4 \n" // _src23t_r
4425 "vtrn.u8 d14, d10 \n"
4426
4427 "pld [%2, #192] \n"
4428 "vld3.u8 {d20-d22}, [%2], %10\n"
4429
4430 "pld [%1, #192] \n"
4431 "vld3.u8 {d24-d26}, [%1], %10\n"
4432
4433 "vtrn.u8 q10, q8 \n" // _src45t_r
4434 "vtrn.u8 d22, d18 \n"
4435
4436 "pld [%2, #192] \n"
4437 "vld3.u8 {d28-d30}, [%2], %10\n"
4438
4439 "vtrn.u8 q14, q12 \n" // _src67t_r
4440 "vtrn.u8 d30, d26 \n"
4441
4442 "sub %1, %1, %10, lsl #2 \n" // restore src0
4443
4444 "vtrn.u16 q4, q0 \n" // _src02tt_r
4445 "vtrn.u16 d10, d2 \n"
4446
4447 "sub %2, %2, %10, lsl #2 \n" // restore src1
4448
4449 "vtrn.u16 q6, q2 \n" // _src13tt_r
4450 "vtrn.u16 d14, d6 \n"
4451
4452 "add %1, #24 \n" // src0 += 24
4453
4454 "vtrn.u16 q12, q8 \n" // _src46tt_r
4455 "vtrn.u16 d26, d18 \n"
4456
4457 "add %2, #24 \n" // src1 += 24
4458
4459 "vtrn.u16 q14, q10 \n" // _src57tt_r
4460 "vtrn.u16 d30, d22 \n"
4461
4462 "vtrn.u32 q12, q4 \n" // _src26ttt_r
4463 "vtrn.u32 d26, d10 \n"
4464
4465 "vtrn.u32 q14, q6 \n" // _src37ttt_r
4466 "vst3.u8 {d24-d26}, [%4], %11\n"
4467 "vtrn.u32 d30, d14 \n"
4468
4469 "vtrn.u32 q8, q0 \n" // _src04ttt_r
4470 "vst3.u8 {d28-d30}, [%3], %11\n"
4471 "vtrn.u32 d18, d2 \n"
4472
4473 "vtrn.u32 q10, q2 \n" // _src15ttt_r
4474 "vst3.u8 {d16-d18}, [%4], %11\n"
4475 "vtrn.u32 d22, d6 \n"
4476
4477 "subs %0, #1 \n"
4478
4479 "vst3.u8 {d8-d10}, [%4], %11 \n"
4480 "vst3.u8 {d20-d22}, [%3], %11\n"
4481 "vst3.u8 {d12-d14}, [%3], %11\n"
4482 "vst3.u8 {d0-d2}, [%4], %11 \n"
4483 "vst3.u8 {d4-d6}, [%3], %11 \n"
4484
4485 "bne 0b \n"
4486 : "=r"(nn), // %0
4487 "=r"(src0), // %1
4488 "=r"(src1), // %2
4489 "=r"(dst7), // %3
4490 "=r"(dst6) // %4
4491 : "0"(nn),
4492 "1"(src0),
4493 "2"(src1),
4494 "3"(dst7),
4495 "4"(dst6),
4496 "r"(src_step), // %10
4497 "r"(dst_step) // %11
4498 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4499 }
4500 #endif // __aarch64__
4501 for (; remain > 0; remain--)
4502 {
4503 dst7[0] = src1[0 + 3 * src_step];
4504 dst7[1] = src1[1 + 3 * src_step];
4505 dst7[2] = src1[2 + 3 * src_step];
4506 dst7[3] = src0[0 + 3 * src_step];
4507 dst7[4] = src0[1 + 3 * src_step];
4508 dst7[5] = src0[2 + 3 * src_step];
4509 dst7[6] = src1[0 + 2 * src_step];
4510 dst7[7] = src1[1 + 2 * src_step];
4511 dst7[8] = src1[2 + 2 * src_step];
4512 dst7[9] = src0[0 + 2 * src_step];
4513 dst7[10] = src0[1 + 2 * src_step];
4514 dst7[11] = src0[2 + 2 * src_step];
4515 dst7[12] = src1[0 + src_step];
4516 dst7[13] = src1[1 + src_step];
4517 dst7[14] = src1[2 + src_step];
4518 dst7[15] = src0[0 + src_step];
4519 dst7[16] = src0[1 + src_step];
4520 dst7[17] = src0[2 + src_step];
4521 dst7[18] = src1[0];
4522 dst7[19] = src1[1];
4523 dst7[20] = src1[2];
4524 dst7[21] = src0[0];
4525 dst7[22] = src0[1];
4526 dst7[23] = src0[2];
4527
4528 src0 += 3;
4529 src1 += 3;
4530
4531 dst7 -= stride;
4532 }
4533
4534 src0 += srcwgap + 7 * srcstride;
4535 }
4536 #endif // __ARM_NEON
4537 for (; y < srch; y++)
4538 {
4539 unsigned char* dst0 = dstend - y * 3 - 3;
4540
4541 int x = 0;
4542 for (; x < srcw; x++)
4543 {
4544 dst0[0] = src0[0];
4545 dst0[1] = src0[1];
4546 dst0[2] = src0[2];
4547
4548 src0 += 3;
4549 dst0 -= stride;
4550 }
4551
4552 src0 += srcwgap;
4553 }
4554 }
4555
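// 4-channel variant (e.g. RGBA): the same scheme with vld4/vst4 and an additional
// alpha plane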
4556 static void kanna_rotate_7_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride)
4557 {
4558 const int srcwgap = srcstride - srcw * 4;
4559
4560 // point to the last dst pixel
4561 unsigned char* dstend = dst + stride * (h - 1) + w * 4;
4562
4563 const unsigned char* src0 = src;
4564
4565 int y = 0;
4566 #if __ARM_NEON
4567 for (; y + 7 < srch; y += 8)
4568 {
4569 const unsigned char* src1 = src0 + srcstride;
4570
4571 unsigned char* dst6 = dstend - y * 4 - 8 * 4 - stride;
4572 unsigned char* dst7 = dstend - y * 4 - 8 * 4;
4573
4574 int src_step = 2 * srcstride;
4575 int dst_step = -2 * stride;
4576
4577 int nn = srcw >> 3;
4578 int remain = srcw - (nn << 3);
4579
4580 #if __aarch64__
4581 for (; nn > 0; nn--)
4582 {
4583 uint8x8x4_t _src0 = vld4_u8(src0);
4584 uint8x8x4_t _src1 = vld4_u8(src1);
4585
4586 uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
4587 uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
4588
4589 uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
4590 uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
4591
4592 uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
4593 uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
4594
4595 uint8x8x2_t _src01t_r = vtrn_u8(_src1.val[0], _src0.val[0]);
4596 uint8x8x2_t _src23t_r = vtrn_u8(_src3.val[0], _src2.val[0]);
4597 uint8x8x2_t _src45t_r = vtrn_u8(_src5.val[0], _src4.val[0]);
4598 uint8x8x2_t _src67t_r = vtrn_u8(_src7.val[0], _src6.val[0]);
4599
4600 uint8x8x2_t _src01t_g = vtrn_u8(_src1.val[1], _src0.val[1]);
4601 uint8x8x2_t _src23t_g = vtrn_u8(_src3.val[1], _src2.val[1]);
4602 uint8x8x2_t _src45t_g = vtrn_u8(_src5.val[1], _src4.val[1]);
4603 uint8x8x2_t _src67t_g = vtrn_u8(_src7.val[1], _src6.val[1]);
4604
4605 uint8x8x2_t _src01t_b = vtrn_u8(_src1.val[2], _src0.val[2]);
4606 uint8x8x2_t _src23t_b = vtrn_u8(_src3.val[2], _src2.val[2]);
4607 uint8x8x2_t _src45t_b = vtrn_u8(_src5.val[2], _src4.val[2]);
4608 uint8x8x2_t _src67t_b = vtrn_u8(_src7.val[2], _src6.val[2]);
4609
4610 uint8x8x2_t _src01t_a = vtrn_u8(_src1.val[3], _src0.val[3]);
4611 uint8x8x2_t _src23t_a = vtrn_u8(_src3.val[3], _src2.val[3]);
4612 uint8x8x2_t _src45t_a = vtrn_u8(_src5.val[3], _src4.val[3]);
4613 uint8x8x2_t _src67t_a = vtrn_u8(_src7.val[3], _src6.val[3]);
4614
4615 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[1]), vreinterpret_u16_u8(_src01t_r.val[1]));
4616 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src23t_r.val[0]), vreinterpret_u16_u8(_src01t_r.val[0]));
4617 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[1]), vreinterpret_u16_u8(_src45t_r.val[1]));
4618 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src67t_r.val[0]), vreinterpret_u16_u8(_src45t_r.val[0]));
4619
4620 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[1]), vreinterpret_u16_u8(_src01t_g.val[1]));
4621 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src23t_g.val[0]), vreinterpret_u16_u8(_src01t_g.val[0]));
4622 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[1]), vreinterpret_u16_u8(_src45t_g.val[1]));
4623 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src67t_g.val[0]), vreinterpret_u16_u8(_src45t_g.val[0]));
4624
4625 uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[1]), vreinterpret_u16_u8(_src01t_b.val[1]));
4626 uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src23t_b.val[0]), vreinterpret_u16_u8(_src01t_b.val[0]));
4627 uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[1]), vreinterpret_u16_u8(_src45t_b.val[1]));
4628 uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src67t_b.val[0]), vreinterpret_u16_u8(_src45t_b.val[0]));
4629
4630 uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[1]), vreinterpret_u16_u8(_src01t_a.val[1]));
4631 uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src23t_a.val[0]), vreinterpret_u16_u8(_src01t_a.val[0]));
4632 uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[1]), vreinterpret_u16_u8(_src45t_a.val[1]));
4633 uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src67t_a.val[0]), vreinterpret_u16_u8(_src45t_a.val[0]));
4634
4635 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[1]), vreinterpret_u32_u16(_src02tt_r.val[1]));
4636 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[1]), vreinterpret_u32_u16(_src13tt_r.val[1]));
4637 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src46tt_r.val[0]), vreinterpret_u32_u16(_src02tt_r.val[0]));
4638 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src57tt_r.val[0]), vreinterpret_u32_u16(_src13tt_r.val[0]));
4639
4640 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[1]), vreinterpret_u32_u16(_src02tt_g.val[1]));
4641 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[1]), vreinterpret_u32_u16(_src13tt_g.val[1]));
4642 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src46tt_g.val[0]), vreinterpret_u32_u16(_src02tt_g.val[0]));
4643 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src57tt_g.val[0]), vreinterpret_u32_u16(_src13tt_g.val[0]));
4644
4645 uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[1]), vreinterpret_u32_u16(_src02tt_b.val[1]));
4646 uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[1]), vreinterpret_u32_u16(_src13tt_b.val[1]));
4647 uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src46tt_b.val[0]), vreinterpret_u32_u16(_src02tt_b.val[0]));
4648 uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src57tt_b.val[0]), vreinterpret_u32_u16(_src13tt_b.val[0]));
4649
4650 uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[1]), vreinterpret_u32_u16(_src02tt_a.val[1]));
4651 uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[1]), vreinterpret_u32_u16(_src13tt_a.val[1]));
4652 uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src46tt_a.val[0]), vreinterpret_u32_u16(_src02tt_a.val[0]));
4653 uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src57tt_a.val[0]), vreinterpret_u32_u16(_src13tt_a.val[0]));
4654
4655 uint8x8x4_t _dst0;
4656 uint8x8x4_t _dst1;
4657 uint8x8x4_t _dst2;
4658 uint8x8x4_t _dst3;
4659 uint8x8x4_t _dst4;
4660 uint8x8x4_t _dst5;
4661 uint8x8x4_t _dst6;
4662 uint8x8x4_t _dst7;
4663
4664 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
4665 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
4666 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
4667 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
4668 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
4669 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
4670 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
4671 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
4672
4673 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
4674 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
4675 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
4676 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
4677 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
4678 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
4679 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
4680 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
4681
4682 _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
4683 _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
4684 _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
4685 _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
4686 _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
4687 _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
4688 _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
4689 _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
4690
4691 _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
4692 _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
4693 _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
4694 _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
4695 _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
4696 _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
4697 _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
4698 _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
4699
4700 vst4_u8(dst7, _dst7);
4701 vst4_u8(dst6, _dst6);
4702 vst4_u8(dst7 + dst_step, _dst5);
4703 vst4_u8(dst6 + dst_step, _dst4);
4704 vst4_u8(dst7 + 2 * dst_step, _dst3);
4705 vst4_u8(dst6 + 2 * dst_step, _dst2);
4706 vst4_u8(dst7 + 3 * dst_step, _dst1);
4707 vst4_u8(dst6 + 3 * dst_step, _dst0);
4708
4709 src0 += 4 * 8;
4710 src1 += 4 * 8;
4711
4712 dst7 += 4 * dst_step;
4713 dst6 += 4 * dst_step;
4714 }
4715 #else
4716 if (nn > 0)
4717 {
4718 asm volatile(
4719 "0: \n"
4720 "pld [%1, #256] \n"
4721 "vld4.u8 {d0-d3}, [%1], %10 \n"
4722
4723 "pld [%2, #256] \n"
4724 "vld4.u8 {d4-d7}, [%2], %10 \n"
4725
4726 "pld [%1, #256] \n"
4727 "vld4.u8 {d8-d11}, [%1], %10 \n"
4728
4729 "vtrn.u8 q2, q0 \n" // _src01t_r
4730 "vtrn.u8 q3, q1 \n"
4731
4732 "pld [%2, #256] \n"
4733 "vld4.u8 {d12-d15}, [%2], %10\n"
4734
4735 "pld [%1, #256] \n"
4736 "vld4.u8 {d16-d19}, [%1], %10\n"
4737
4738 "vtrn.u8 q6, q4 \n" // _src23t_r
4739 "vtrn.u8 q7, q5 \n"
4740
4741 "pld [%2, #256] \n"
4742 "vld4.u8 {d20-d23}, [%2], %10\n"
4743
4744 "pld [%1, #256] \n"
4745 "vld4.u8 {d24-d27}, [%1], %10\n"
4746
4747 "vtrn.u8 q10, q8 \n" // _src45t_r
4748 "vtrn.u8 q11, q9 \n"
4749
4750 "pld [%2, #256] \n"
4751 "vld4.u8 {d28-d31}, [%2], %10\n"
4752
4753 "vtrn.u8 q14, q12 \n" // _src67t_r
4754 "vtrn.u8 q15, q13 \n"
4755
4756 "sub %1, %1, %10, lsl #2 \n" // restore src0
4757
4758 "vtrn.u16 q4, q0 \n" // _src02tt_r
4759 "vtrn.u16 q5, q1 \n"
4760
4761 "sub %2, %2, %10, lsl #2 \n" // restore src1
4762
4763 "vtrn.u16 q6, q2 \n" // _src13tt_r
4764 "vtrn.u16 q7, q3 \n"
4765
4766 "add %1, #32 \n" // src0 += 32
4767
4768 "vtrn.u16 q12, q8 \n" // _src46tt_r
4769 "vtrn.u16 q13, q9 \n"
4770
4771 "add %2, #32 \n" // src1 += 32
4772
4773 "vtrn.u16 q14, q10 \n" // _src57tt_r
4774 "vtrn.u16 q15, q11 \n"
4775
4776 "vtrn.u32 q12, q4 \n" // _src26ttt_r
4777 "vtrn.u32 q13, q5 \n"
4778
4779 "vtrn.u32 q14, q6 \n" // _src37ttt_r
4780 "vst4.u8 {d24-d27}, [%4], %11\n"
4781 "vtrn.u32 q15, q7 \n"
4782
4783 "vtrn.u32 q8, q0 \n" // _src04ttt_r
4784 "vst4.u8 {d28-d31}, [%3], %11\n"
4785 "vtrn.u32 q9, q1 \n"
4786
4787 "vtrn.u32 q10, q2 \n" // _src15ttt_r
4788 "vst4.u8 {d16-d19}, [%4], %11\n"
4789 "vtrn.u32 q11, q3 \n"
4790
4791 "subs %0, #1 \n"
4792
4793 "vst4.u8 {d8-d11}, [%4], %11 \n"
4794 "vst4.u8 {d20-d23}, [%3], %11\n"
4795 "vst4.u8 {d12-d15}, [%3], %11\n"
4796 "vst4.u8 {d0-d3}, [%4], %11 \n"
4797 "vst4.u8 {d4-d7}, [%3], %11 \n"
4798
4799 "bne 0b \n"
4800 : "=r"(nn), // %0
4801 "=r"(src0), // %1
4802 "=r"(src1), // %2
4803 "=r"(dst7), // %3
4804 "=r"(dst6) // %4
4805 : "0"(nn),
4806 "1"(src0),
4807 "2"(src1),
4808 "3"(dst7),
4809 "4"(dst6),
4810 "r"(src_step), // %10
4811 "r"(dst_step) // %11
4812 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4813 }
4814 #endif // __aarch64__
4815 for (; remain > 0; remain--)
4816 {
4817 dst7[0] = src1[0 + 3 * src_step];
4818 dst7[1] = src1[1 + 3 * src_step];
4819 dst7[2] = src1[2 + 3 * src_step];
4820 dst7[3] = src1[3 + 3 * src_step];
4821 dst7[4] = src0[0 + 3 * src_step];
4822 dst7[5] = src0[1 + 3 * src_step];
4823 dst7[6] = src0[2 + 3 * src_step];
4824 dst7[7] = src0[3 + 3 * src_step];
4825 dst7[8] = src1[0 + 2 * src_step];
4826 dst7[9] = src1[1 + 2 * src_step];
4827 dst7[10] = src1[2 + 2 * src_step];
4828 dst7[11] = src1[3 + 2 * src_step];
4829 dst7[12] = src0[0 + 2 * src_step];
4830 dst7[13] = src0[1 + 2 * src_step];
4831 dst7[14] = src0[2 + 2 * src_step];
4832 dst7[15] = src0[3 + 2 * src_step];
4833 dst7[16] = src1[0 + src_step];
4834 dst7[17] = src1[1 + src_step];
4835 dst7[18] = src1[2 + src_step];
4836 dst7[19] = src1[3 + src_step];
4837 dst7[20] = src0[0 + src_step];
4838 dst7[21] = src0[1 + src_step];
4839 dst7[22] = src0[2 + src_step];
4840 dst7[23] = src0[3 + src_step];
4841 dst7[24] = src1[0];
4842 dst7[25] = src1[1];
4843 dst7[26] = src1[2];
4844 dst7[27] = src1[3];
4845 dst7[28] = src0[0];
4846 dst7[29] = src0[1];
4847 dst7[30] = src0[2];
4848 dst7[31] = src0[3];
4849
4850 src0 += 4;
4851 src1 += 4;
4852
4853 dst7 -= stride;
4854 }
4855
4856 src0 += srcwgap + 7 * srcstride;
4857 }
4858 #endif // __ARM_NEON
4859 for (; y < srch; y++)
4860 {
4861 unsigned char* dst0 = dstend - y * 4 - 4;
4862
4863 int x = 0;
4864 for (; x < srcw; x++)
4865 {
4866 dst0[0] = src0[0];
4867 dst0[1] = src0[1];
4868 dst0[2] = src0[2];
4869 dst0[3] = src0[3];
4870
4871 src0 += 4;
4872 dst0 -= stride;
4873 }
4874
4875 src0 += srcwgap;
4876 }
4877 }
4878
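// kanna_rotate_8_*: rotate 90 degrees counter-clockwise (i.e. 270 clockwise), so
// dst(h-1-x, y) = src(y, x); this presumably corresponds to EXIF orientation 8.
// Destination rows are filled bottom-up, one output row per source column.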
4879 static void kanna_rotate_8_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
4880 {
4881 const int srcwgap = srcstride - srcw;
4882
4883 // point to the last dst pixel row
4884 unsigned char* dstend = dst + stride * (h - 1);
4885
4886 const unsigned char* src0 = src;
4887
4888 int y = 0;
4889 #if __ARM_NEON
4890 for (; y + 7 < srch; y += 8)
4891 {
4892 const unsigned char* src1 = src0 + srcstride;
4893
4894 unsigned char* dst7 = dstend + y;
4895 unsigned char* dst6 = dstend + y - stride;
4896
4897 int src_step = 2 * srcstride;
4898 int dst_step = -2 * stride;
4899
4900 int nn = srcw >> 3;
4901 int remain = srcw - (nn << 3);
4902
4903 #if __aarch64__
4904 for (; nn > 0; nn--)
4905 {
4906 uint8x8_t _src0 = vld1_u8(src0);
4907 uint8x8_t _src1 = vld1_u8(src1);
4908
4909 uint8x8_t _src2 = vld1_u8(src0 + src_step);
4910 uint8x8_t _src3 = vld1_u8(src1 + src_step);
4911
4912 uint8x8_t _src4 = vld1_u8(src0 + 2 * src_step);
4913 uint8x8_t _src5 = vld1_u8(src1 + 2 * src_step);
4914
4915 uint8x8_t _src6 = vld1_u8(src0 + 3 * src_step);
4916 uint8x8_t _src7 = vld1_u8(src1 + 3 * src_step);
4917
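            // same vtrn-based 8x8 transpose as in the rotate-7 kernels but with the natural
            // operand order; the row reversal of this rotation comes from the bottom-up
            // stores (negative dst_step) instead of reordered registers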
4918 uint8x8x2_t _src01t_r = vtrn_u8(_src0, _src1);
4919 uint8x8x2_t _src23t_r = vtrn_u8(_src2, _src3);
4920 uint8x8x2_t _src45t_r = vtrn_u8(_src4, _src5);
4921 uint8x8x2_t _src67t_r = vtrn_u8(_src6, _src7);
4922
4923 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
4924 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
4925 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
4926 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
4927
4928 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
4929 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
4930 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
4931 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
4932
4933 uint8x8_t _dst0 = vreinterpret_u8_u32(_src04ttt_r.val[0]);
4934 uint8x8_t _dst1 = vreinterpret_u8_u32(_src15ttt_r.val[0]);
4935 uint8x8_t _dst2 = vreinterpret_u8_u32(_src26ttt_r.val[0]);
4936 uint8x8_t _dst3 = vreinterpret_u8_u32(_src37ttt_r.val[0]);
4937 uint8x8_t _dst4 = vreinterpret_u8_u32(_src04ttt_r.val[1]);
4938 uint8x8_t _dst5 = vreinterpret_u8_u32(_src15ttt_r.val[1]);
4939 uint8x8_t _dst6 = vreinterpret_u8_u32(_src26ttt_r.val[1]);
4940 uint8x8_t _dst7 = vreinterpret_u8_u32(_src37ttt_r.val[1]);
4941
4942 vst1_u8(dst7, _dst0);
4943 vst1_u8(dst6, _dst1);
4944 vst1_u8(dst7 + dst_step, _dst2);
4945 vst1_u8(dst6 + dst_step, _dst3);
4946 vst1_u8(dst7 + 2 * dst_step, _dst4);
4947 vst1_u8(dst6 + 2 * dst_step, _dst5);
4948 vst1_u8(dst7 + 3 * dst_step, _dst6);
4949 vst1_u8(dst6 + 3 * dst_step, _dst7);
4950
4951 src0 += 8;
4952 src1 += 8;
4953
4954 dst7 += 4 * dst_step;
4955 dst6 += 4 * dst_step;
4956 }
4957 #else
4958 if (nn > 0)
4959 {
4960 asm volatile(
4961 "0: \n"
4962 "pld [%1, #64] \n"
4963 "vld1.u8 {d0}, [%1], %10 \n"
4964
4965 "pld [%2, #64] \n"
4966 "vld1.u8 {d1}, [%2], %10 \n"
4967
4968 "pld [%1, #64] \n"
4969 "vld1.u8 {d2}, [%1], %10 \n"
4970
4971 "vtrn.u8 d0, d1 \n" // _src01t_r
4972
4973 "pld [%2, #64] \n"
4974 "vld1.u8 {d3}, [%2], %10 \n"
4975
4976 "pld [%1, #64] \n"
4977 "vld1.u8 {d4}, [%1], %10 \n"
4978
4979 "vtrn.u8 d2, d3 \n" // _src23t_r
4980
4981 "pld [%2, #64] \n"
4982 "vld1.u8 {d5}, [%2], %10 \n"
4983
4984 "pld [%1, #64] \n"
4985 "vld1.u8 {d6}, [%1], %10 \n"
4986
4987 "vtrn.u8 d4, d5 \n" // _src45t_r
4988
4989 "pld [%2, #64] \n"
4990 "vld1.u8 {d7}, [%2], %10 \n"
4991
4992 "vtrn.u8 d6, d7 \n" // _src67t_r
4993
4994 "sub %1, %1, %10, lsl #2 \n" // restore src0
4995
4996 "vtrn.u16 q0, q1 \n" // _src02tt_r _src13tt_r
4997
4998 "sub %2, %2, %10, lsl #2 \n" // restore src1
4999
5000 "vtrn.u16 q2, q3 \n" // _src46tt_r _src57tt_r
5001
5002 "add %1, #8 \n" // src0 += 8
5003
5004 "vtrn.u32 q0, q2 \n" // _src04ttt_r _src15ttt_r
5005
5006 "add %2, #8 \n" // src1 += 8
5007
5008 "vtrn.u32 q1, q3 \n" // _src26ttt_r _src37ttt_r
5009 "vst1.u8 {d0}, [%3], %11 \n"
5010 "vst1.u8 {d1}, [%4], %11 \n"
5011
5012 "subs %0, #1 \n"
5013
5014 "vst1.u8 {d2}, [%3], %11 \n"
5015 "vst1.u8 {d3}, [%4], %11 \n"
5016 "vst1.u8 {d4}, [%3], %11 \n"
5017 "vst1.u8 {d5}, [%4], %11 \n"
5018 "vst1.u8 {d6}, [%3], %11 \n"
5019 "vst1.u8 {d7}, [%4], %11 \n"
5020
5021 "bne 0b \n"
5022 : "=r"(nn), // %0
5023 "=r"(src0), // %1
5024 "=r"(src1), // %2
5025 "=r"(dst7), // %3
5026 "=r"(dst6) // %4
5027 : "0"(nn),
5028 "1"(src0),
5029 "2"(src1),
5030 "3"(dst7),
5031 "4"(dst6),
5032 "r"(src_step), // %10
5033 "r"(dst_step) // %11
5034 : "cc", "memory", "q0", "q1", "q2", "q3");
5035 }
5036 #endif // __aarch64__
5037 for (; remain > 0; remain--)
5038 {
5039 dst7[0] = src0[0];
5040 dst7[1] = src1[0];
5041 dst7[2] = src0[0 + src_step];
5042 dst7[3] = src1[0 + src_step];
5043 dst7[4] = src0[0 + 2 * src_step];
5044 dst7[5] = src1[0 + 2 * src_step];
5045 dst7[6] = src0[0 + 3 * src_step];
5046 dst7[7] = src1[0 + 3 * src_step];
5047
5048 src0 += 1;
5049 src1 += 1;
5050
5051 dst7 -= stride;
5052 }
5053
5054 src0 += srcwgap + 7 * srcstride;
5055 }
5056 #endif // __ARM_NEON
5057 for (; y < srch; y++)
5058 {
5059 unsigned char* dst0 = dstend + y;
5060
5061 int x = 0;
5062 for (; x < srcw; x++)
5063 {
5064 *dst0 = *src0;
5065
5066 src0 += 1;
5067 dst0 -= stride;
5068 }
5069
5070 src0 += srcwgap;
5071 }
5072 }
5073
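// 2-channel version of the 90-degree counter-clockwise rotation: vld2/vst2 keep the
// channel planes separate through the tile transpose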
5074 static void kanna_rotate_8_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
5075 {
5076 const int srcwgap = srcstride - srcw * 2;
5077
5078 // point to the last dst pixel row
5079 unsigned char* dstend = dst + stride * (h - 1);
5080
5081 const unsigned char* src0 = src;
5082
5083 int y = 0;
5084 #if __ARM_NEON
5085 for (; y + 7 < srch; y += 8)
5086 {
5087 const unsigned char* src1 = src0 + srcstride;
5088
5089 unsigned char* dst7 = dstend + y * 2;
5090 unsigned char* dst6 = dstend + y * 2 - stride;
5091
5092 int src_step = 2 * srcstride;
5093 int dst_step = -2 * stride;
5094
5095 int nn = srcw >> 3;
5096 int remain = srcw - (nn << 3);
5097
5098 #if __aarch64__
5099 for (; nn > 0; nn--)
5100 {
5101 uint8x8x2_t _src0 = vld2_u8(src0);
5102 uint8x8x2_t _src1 = vld2_u8(src1);
5103
5104 uint8x8x2_t _src2 = vld2_u8(src0 + src_step);
5105 uint8x8x2_t _src3 = vld2_u8(src1 + src_step);
5106
5107 uint8x8x2_t _src4 = vld2_u8(src0 + 2 * src_step);
5108 uint8x8x2_t _src5 = vld2_u8(src1 + 2 * src_step);
5109
5110 uint8x8x2_t _src6 = vld2_u8(src0 + 3 * src_step);
5111 uint8x8x2_t _src7 = vld2_u8(src1 + 3 * src_step);
5112
5113 uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
5114 uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
5115 uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
5116 uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
5117
5118 uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
5119 uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
5120 uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
5121 uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
5122
5123 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
5124 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
5125 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
5126 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
5127
5128 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
5129 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
5130 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
5131 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
5132
5133 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
5134 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
5135 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
5136 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
5137
5138 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
5139 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
5140 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
5141 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
5142
5143 uint8x8x2_t _dst0;
5144 uint8x8x2_t _dst1;
5145 uint8x8x2_t _dst2;
5146 uint8x8x2_t _dst3;
5147 uint8x8x2_t _dst4;
5148 uint8x8x2_t _dst5;
5149 uint8x8x2_t _dst6;
5150 uint8x8x2_t _dst7;
5151
5152 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
5153 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
5154 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
5155 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
5156 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
5157 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
5158 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
5159 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
5160
5161 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
5162 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
5163 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
5164 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
5165 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
5166 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
5167 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
5168 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
5169
5170 vst2_u8(dst7, _dst0);
5171 vst2_u8(dst6, _dst1);
5172 vst2_u8(dst7 + dst_step, _dst2);
5173 vst2_u8(dst6 + dst_step, _dst3);
5174 vst2_u8(dst7 + 2 * dst_step, _dst4);
5175 vst2_u8(dst6 + 2 * dst_step, _dst5);
5176 vst2_u8(dst7 + 3 * dst_step, _dst6);
5177 vst2_u8(dst6 + 3 * dst_step, _dst7);
5178
5179 src0 += 2 * 8;
5180 src1 += 2 * 8;
5181
5182 dst7 += 4 * dst_step;
5183 dst6 += 4 * dst_step;
5184 }
5185 #else
5186 if (nn > 0)
5187 {
5188 asm volatile(
5189 "0: \n"
5190 "pld [%1, #128] \n"
5191 "vld2.u8 {d0-d1}, [%1], %10 \n"
5192
5193 "pld [%2, #128] \n"
5194 "vld2.u8 {d2-d3}, [%2], %10 \n"
5195
5196 "pld [%1, #128] \n"
5197 "vld2.u8 {d4-d5}, [%1], %10 \n"
5198
5199 "vtrn.u8 q0, q1 \n" // _src01t_r
5200
5201 "pld [%2, #128] \n"
5202 "vld2.u8 {d6-d7}, [%2], %10 \n"
5203
5204 "pld [%1, #128] \n"
5205 "vld2.u8 {d16-d17}, [%1], %10\n"
5206
5207 "vtrn.u8 q2, q3 \n" // _src23t_r
5208
5209 "pld [%2, #128] \n"
5210 "vld2.u8 {d18-d19}, [%2], %10\n"
5211
5212 "pld [%1, #128] \n"
5213 "vld2.u8 {d20-d21}, [%1], %10\n"
5214
5215 "vtrn.u8 q8, q9 \n" // _src45t_r
5216
5217 "pld [%2, #128] \n"
5218 "vld2.u8 {d22-d23}, [%2], %10\n"
5219
5220 "vtrn.u8 q10, q11 \n" // _src67t_r
5221
5222 "sub %1, %1, %10, lsl #2 \n" // restore src0
5223
5224 "vtrn.u16 q0, q2 \n" // _src02tt_r
5225
5226 "sub %2, %2, %10, lsl #2 \n" // restore src1
5227
5228 "vtrn.u16 q1, q3 \n" // _src13tt_r
5229
5230 "add %1, #16 \n" // src0 += 16
5231
5232 "vtrn.u16 q8, q10 \n" // _src46tt_r
5233
5234 "add %2, #16 \n" // src1 += 16
5235
5236 "vtrn.u16 q9, q11 \n" // _src57tt_r
5237
5238 "vtrn.u32 q0, q8 \n" // _src04ttt_r
5239
5240 "vtrn.u32 q1, q9 \n" // _src15ttt_r
5241 "vst2.u8 {d0-d1}, [%3], %11 \n"
5242
5243 "vtrn.u32 q2, q10 \n" // _src26ttt_r
5244 "vst2.u8 {d2-d3}, [%4], %11 \n"
5245
5246 "vtrn.u32 q3, q11 \n" // _src37ttt_r
5247 "vst2.u8 {d4-d5}, [%3], %11 \n"
5248
5249 "subs %0, #1 \n"
5250
5251 "vst2.u8 {d16-d17}, [%3], %11\n"
5252 "vst2.u8 {d6-d7}, [%4], %11 \n"
5253 "vst2.u8 {d18-d19}, [%4], %11\n"
5254 "vst2.u8 {d20-d21}, [%3], %11\n"
5255 "vst2.u8 {d22-d23}, [%4], %11\n"
5256
5257 "bne 0b \n"
5258 : "=r"(nn), // %0
5259 "=r"(src0), // %1
5260 "=r"(src1), // %2
5261 "=r"(dst7), // %3
5262 "=r"(dst6) // %4
5263 : "0"(nn),
5264 "1"(src0),
5265 "2"(src1),
5266 "3"(dst7),
5267 "4"(dst6),
5268 "r"(src_step), // %10
5269 "r"(dst_step) // %11
5270 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
5271 }
5272 #endif // __aarch64__
5273 for (; remain > 0; remain--)
5274 {
5275 dst7[0] = src0[0];
5276 dst7[1] = src0[1];
5277 dst7[2] = src1[0];
5278 dst7[3] = src1[1];
5279 dst7[4] = src0[0 + src_step];
5280 dst7[5] = src0[1 + src_step];
5281 dst7[6] = src1[0 + src_step];
5282 dst7[7] = src1[1 + src_step];
5283 dst7[8] = src0[0 + 2 * src_step];
5284 dst7[9] = src0[1 + 2 * src_step];
5285 dst7[10] = src1[0 + 2 * src_step];
5286 dst7[11] = src1[1 + 2 * src_step];
5287 dst7[12] = src0[0 + 3 * src_step];
5288 dst7[13] = src0[1 + 3 * src_step];
5289 dst7[14] = src1[0 + 3 * src_step];
5290 dst7[15] = src1[1 + 3 * src_step];
5291
5292 src0 += 2;
5293 src1 += 2;
5294
5295 dst7 -= stride;
5296 }
5297
5298 src0 += srcwgap + 7 * srcstride;
5299 }
5300 #endif // __ARM_NEON
5301 for (; y < srch; y++)
5302 {
5303 unsigned char* dst0 = dstend + y * 2;
5304
5305 int x = 0;
5306 for (; x < srcw; x++)
5307 {
5308 dst0[0] = src0[0];
5309 dst0[1] = src0[1];
5310
5311 src0 += 2;
5312 dst0 -= stride;
5313 }
5314
5315 src0 += srcwgap;
5316 }
5317 }
5318
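// 3-channel version: the same per-channel tile transpose using vld3/vst3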
5319 static void kanna_rotate_8_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
5320 {
5321 const int srcwgap = srcstride - srcw * 3;
5322
5323 // point to the last dst pixel row
5324 unsigned char* dstend = dst + stride * (h - 1);
5325
5326 const unsigned char* src0 = src;
5327
5328 int y = 0;
5329 #if __ARM_NEON
5330 for (; y + 7 < srch; y += 8)
5331 {
5332 const unsigned char* src1 = src0 + srcstride;
5333
5334 unsigned char* dst7 = dstend + y * 3;
5335 unsigned char* dst6 = dstend + y * 3 - stride;
5336
5337 int src_step = 2 * srcstride;
5338 int dst_step = -2 * stride;
5339
5340 int nn = srcw >> 3;
5341 int remain = srcw - (nn << 3);
5342
5343 #if __aarch64__
5344 for (; nn > 0; nn--)
5345 {
5346 uint8x8x3_t _src0 = vld3_u8(src0);
5347 uint8x8x3_t _src1 = vld3_u8(src1);
5348
5349 uint8x8x3_t _src2 = vld3_u8(src0 + src_step);
5350 uint8x8x3_t _src3 = vld3_u8(src1 + src_step);
5351
5352 uint8x8x3_t _src4 = vld3_u8(src0 + 2 * src_step);
5353 uint8x8x3_t _src5 = vld3_u8(src1 + 2 * src_step);
5354
5355 uint8x8x3_t _src6 = vld3_u8(src0 + 3 * src_step);
5356 uint8x8x3_t _src7 = vld3_u8(src1 + 3 * src_step);
5357
5358 uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
5359 uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
5360 uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
5361 uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
5362
5363 uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
5364 uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
5365 uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
5366 uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
5367
5368 uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
5369 uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
5370 uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
5371 uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
5372
5373 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
5374 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
5375 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
5376 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
5377
5378 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
5379 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
5380 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
5381 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
5382
5383 uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
5384 uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
5385 uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
5386 uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
5387
5388 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
5389 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
5390 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
5391 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
5392
5393 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
5394 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
5395 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
5396 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
5397
5398 uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
5399 uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
5400 uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
5401 uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
5402
5403 uint8x8x3_t _dst0;
5404 uint8x8x3_t _dst1;
5405 uint8x8x3_t _dst2;
5406 uint8x8x3_t _dst3;
5407 uint8x8x3_t _dst4;
5408 uint8x8x3_t _dst5;
5409 uint8x8x3_t _dst6;
5410 uint8x8x3_t _dst7;
5411
5412 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
5413 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
5414 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
5415 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
5416 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
5417 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
5418 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
5419 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
5420
5421 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
5422 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
5423 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
5424 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
5425 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
5426 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
5427 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
5428 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
5429
5430 _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
5431 _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
5432 _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
5433 _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
5434 _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
5435 _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
5436 _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
5437 _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
5438
5439 vst3_u8(dst7, _dst0);
5440 vst3_u8(dst6, _dst1);
5441 vst3_u8(dst7 + dst_step, _dst2);
5442 vst3_u8(dst6 + dst_step, _dst3);
5443 vst3_u8(dst7 + 2 * dst_step, _dst4);
5444 vst3_u8(dst6 + 2 * dst_step, _dst5);
5445 vst3_u8(dst7 + 3 * dst_step, _dst6);
5446 vst3_u8(dst6 + 3 * dst_step, _dst7);
5447
5448 src0 += 3 * 8;
5449 src1 += 3 * 8;
5450
5451 dst7 += 4 * dst_step;
5452 dst6 += 4 * dst_step;
5453 }
5454 #else
5455 if (nn > 0)
5456 {
5457 asm volatile(
5458 "0: \n"
5459 "pld [%1, #192] \n"
5460 "vld3.u8 {d0-d2}, [%1], %10 \n"
5461
5462 "pld [%2, #192] \n"
5463 "vld3.u8 {d4-d6}, [%2], %10 \n"
5464
5465 "pld [%1, #192] \n"
5466 "vld3.u8 {d8-d10}, [%1], %10 \n"
5467
5468 "vtrn.u8 q0, q2 \n" // _src01t_r
5469 "vtrn.u8 d2, d6 \n"
5470
5471 "pld [%2, #192] \n"
5472 "vld3.u8 {d12-d14}, [%2], %10\n"
5473
5474 "pld [%1, #192] \n"
5475 "vld3.u8 {d16-d18}, [%1], %10\n"
5476
5477 "vtrn.u8 q4, q6 \n" // _src23t_r
5478 "vtrn.u8 d10, d14 \n"
5479
5480 "pld [%2, #192] \n"
5481 "vld3.u8 {d20-d22}, [%2], %10\n"
5482
5483 "pld [%1, #192] \n"
5484 "vld3.u8 {d24-d26}, [%1], %10\n"
5485
5486 "vtrn.u8 q8, q10 \n" // _src45t_r
5487 "vtrn.u8 d18, d22 \n"
5488
5489 "pld [%2, #192] \n"
5490 "vld3.u8 {d28-d30}, [%2], %10\n"
5491
5492 "vtrn.u8 q12, q14 \n" // _src67t_r
5493 "vtrn.u8 d26, d30 \n"
5494
5495 "sub %1, %1, %10, lsl #2 \n" // restore src0
5496
5497 "vtrn.u16 q0, q4 \n" // _src02tt_r
5498 "vtrn.u16 d2, d10 \n"
5499
5500 "sub %2, %2, %10, lsl #2 \n" // restore src1
5501
5502 "vtrn.u16 q2, q6 \n" // _src13tt_r
5503 "vtrn.u16 d6, d14 \n"
5504
5505 "add %1, #24 \n" // src0 += 24
5506
5507 "vtrn.u16 q8, q12 \n" // _src46tt_r
5508 "vtrn.u16 d18, d26 \n"
5509
5510 "add %2, #24 \n" // src1 += 24
5511
5512 "vtrn.u16 q10, q14 \n" // _src57tt_r
5513 "vtrn.u16 d22, d30 \n"
5514
5515 "vtrn.u32 q0, q8 \n" // _src04ttt_r
5516 "vtrn.u32 d2, d18 \n"
5517
5518 "vtrn.u32 q2, q10 \n" // _src15ttt_r
5519 "vst3.u8 {d0-d2}, [%3], %11 \n"
5520 "vtrn.u32 d6, d22 \n"
5521
5522 "vtrn.u32 q4, q12 \n" // _src26ttt_r
5523 "vst3.u8 {d4-d6}, [%4], %11 \n"
5524 "vtrn.u32 d10, d26 \n"
5525
5526 "vtrn.u32 q6, q14 \n" // _src37ttt_r
5527 "vst3.u8 {d8-d10}, [%3], %11 \n"
5528 "vtrn.u32 d14, d30 \n"
5529
5530 "subs %0, #1 \n"
5531
5532 "vst3.u8 {d16-d18}, [%3], %11\n"
5533 "vst3.u8 {d12-d14}, [%4], %11\n"
5534 "vst3.u8 {d20-d22}, [%4], %11\n"
5535 "vst3.u8 {d24-d26}, [%3], %11\n"
5536 "vst3.u8 {d28-d30}, [%4], %11\n"
5537
5538 "bne 0b \n"
5539 : "=r"(nn), // %0
5540 "=r"(src0), // %1
5541 "=r"(src1), // %2
5542 "=r"(dst7), // %3
5543 "=r"(dst6) // %4
5544 : "0"(nn),
5545 "1"(src0),
5546 "2"(src1),
5547 "3"(dst7),
5548 "4"(dst6),
5549 "r"(src_step), // %10
5550 "r"(dst_step) // %11
5551 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
5552 }
5553 #endif // __aarch64__
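        // scalar tail: each remaining source column becomes a run of eight 3-channel destination pixels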
5554 for (; remain > 0; remain--)
5555 {
5556 dst7[0] = src0[0];
5557 dst7[1] = src0[1];
5558 dst7[2] = src0[2];
5559 dst7[3] = src1[0];
5560 dst7[4] = src1[1];
5561 dst7[5] = src1[2];
5562 dst7[6] = src0[0 + src_step];
5563 dst7[7] = src0[1 + src_step];
5564 dst7[8] = src0[2 + src_step];
5565 dst7[9] = src1[0 + src_step];
5566 dst7[10] = src1[1 + src_step];
5567 dst7[11] = src1[2 + src_step];
5568 dst7[12] = src0[0 + 2 * src_step];
5569 dst7[13] = src0[1 + 2 * src_step];
5570 dst7[14] = src0[2 + 2 * src_step];
5571 dst7[15] = src1[0 + 2 * src_step];
5572 dst7[16] = src1[1 + 2 * src_step];
5573 dst7[17] = src1[2 + 2 * src_step];
5574 dst7[18] = src0[0 + 3 * src_step];
5575 dst7[19] = src0[1 + 3 * src_step];
5576 dst7[20] = src0[2 + 3 * src_step];
5577 dst7[21] = src1[0 + 3 * src_step];
5578 dst7[22] = src1[1 + 3 * src_step];
5579 dst7[23] = src1[2 + 3 * src_step];
5580
5581 src0 += 3;
5582 src1 += 3;
5583
5584 dst7 -= stride;
5585 }
5586
5587 src0 += srcwgap + 7 * srcstride;
5588 }
5589 #endif // __ARM_NEON
5590 for (; y < srch; y++)
5591 {
5592 unsigned char* dst0 = dstend + y * 3;
5593
5594 int x = 0;
5595 for (; x < srcw; x++)
5596 {
5597 dst0[0] = src0[0];
5598 dst0[1] = src0[1];
5599 dst0[2] = src0[2];
5600
5601 src0 += 3;
5602 dst0 -= stride;
5603 }
5604
5605 src0 += srcwgap;
5606 }
5607 }
5608
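// rotate type 8 for 4-channel (e.g. RGBA) pixels: same 8x8 transpose scheme as the
// 3-channel kernel above, with the alpha plane carried through the same vtrn ladder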
5609 static void kanna_rotate_8_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int /*w*/, int h, int stride)
5610 {
5611 const int srcwgap = srcstride - srcw * 4;
5612
5613 // point to the last dst pixel row
5614 unsigned char* dstend = dst + stride * (h - 1);
5615
5616 const unsigned char* src0 = src;
5617
5618 int y = 0;
5619 #if __ARM_NEON
5620 for (; y + 7 < srch; y += 8)
5621 {
5622 const unsigned char* src1 = src0 + srcstride;
5623
5624 unsigned char* dst7 = dstend + y * 4;
5625 unsigned char* dst6 = dstend + y * 4 - stride;
5626
5627 int src_step = 2 * srcstride;
5628 int dst_step = -2 * stride;
5629
5630 int nn = srcw >> 3;
5631 int remain = srcw - (nn << 3);
5632
5633 #if __aarch64__
5634 for (; nn > 0; nn--)
5635 {
5636 uint8x8x4_t _src0 = vld4_u8(src0);
5637 uint8x8x4_t _src1 = vld4_u8(src1);
5638
5639 uint8x8x4_t _src2 = vld4_u8(src0 + src_step);
5640 uint8x8x4_t _src3 = vld4_u8(src1 + src_step);
5641
5642 uint8x8x4_t _src4 = vld4_u8(src0 + 2 * src_step);
5643 uint8x8x4_t _src5 = vld4_u8(src1 + 2 * src_step);
5644
5645 uint8x8x4_t _src6 = vld4_u8(src0 + 3 * src_step);
5646 uint8x8x4_t _src7 = vld4_u8(src1 + 3 * src_step);
5647
5648 uint8x8x2_t _src01t_r = vtrn_u8(_src0.val[0], _src1.val[0]);
5649 uint8x8x2_t _src23t_r = vtrn_u8(_src2.val[0], _src3.val[0]);
5650 uint8x8x2_t _src45t_r = vtrn_u8(_src4.val[0], _src5.val[0]);
5651 uint8x8x2_t _src67t_r = vtrn_u8(_src6.val[0], _src7.val[0]);
5652
5653 uint8x8x2_t _src01t_g = vtrn_u8(_src0.val[1], _src1.val[1]);
5654 uint8x8x2_t _src23t_g = vtrn_u8(_src2.val[1], _src3.val[1]);
5655 uint8x8x2_t _src45t_g = vtrn_u8(_src4.val[1], _src5.val[1]);
5656 uint8x8x2_t _src67t_g = vtrn_u8(_src6.val[1], _src7.val[1]);
5657
5658 uint8x8x2_t _src01t_b = vtrn_u8(_src0.val[2], _src1.val[2]);
5659 uint8x8x2_t _src23t_b = vtrn_u8(_src2.val[2], _src3.val[2]);
5660 uint8x8x2_t _src45t_b = vtrn_u8(_src4.val[2], _src5.val[2]);
5661 uint8x8x2_t _src67t_b = vtrn_u8(_src6.val[2], _src7.val[2]);
5662
5663 uint8x8x2_t _src01t_a = vtrn_u8(_src0.val[3], _src1.val[3]);
5664 uint8x8x2_t _src23t_a = vtrn_u8(_src2.val[3], _src3.val[3]);
5665 uint8x8x2_t _src45t_a = vtrn_u8(_src4.val[3], _src5.val[3]);
5666 uint8x8x2_t _src67t_a = vtrn_u8(_src6.val[3], _src7.val[3]);
5667
5668 uint16x4x2_t _src02tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[0]), vreinterpret_u16_u8(_src23t_r.val[0]));
5669 uint16x4x2_t _src13tt_r = vtrn_u16(vreinterpret_u16_u8(_src01t_r.val[1]), vreinterpret_u16_u8(_src23t_r.val[1]));
5670 uint16x4x2_t _src46tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[0]), vreinterpret_u16_u8(_src67t_r.val[0]));
5671 uint16x4x2_t _src57tt_r = vtrn_u16(vreinterpret_u16_u8(_src45t_r.val[1]), vreinterpret_u16_u8(_src67t_r.val[1]));
5672
5673 uint16x4x2_t _src02tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[0]), vreinterpret_u16_u8(_src23t_g.val[0]));
5674 uint16x4x2_t _src13tt_g = vtrn_u16(vreinterpret_u16_u8(_src01t_g.val[1]), vreinterpret_u16_u8(_src23t_g.val[1]));
5675 uint16x4x2_t _src46tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[0]), vreinterpret_u16_u8(_src67t_g.val[0]));
5676 uint16x4x2_t _src57tt_g = vtrn_u16(vreinterpret_u16_u8(_src45t_g.val[1]), vreinterpret_u16_u8(_src67t_g.val[1]));
5677
5678 uint16x4x2_t _src02tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[0]), vreinterpret_u16_u8(_src23t_b.val[0]));
5679 uint16x4x2_t _src13tt_b = vtrn_u16(vreinterpret_u16_u8(_src01t_b.val[1]), vreinterpret_u16_u8(_src23t_b.val[1]));
5680 uint16x4x2_t _src46tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[0]), vreinterpret_u16_u8(_src67t_b.val[0]));
5681 uint16x4x2_t _src57tt_b = vtrn_u16(vreinterpret_u16_u8(_src45t_b.val[1]), vreinterpret_u16_u8(_src67t_b.val[1]));
5682
5683 uint16x4x2_t _src02tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[0]), vreinterpret_u16_u8(_src23t_a.val[0]));
5684 uint16x4x2_t _src13tt_a = vtrn_u16(vreinterpret_u16_u8(_src01t_a.val[1]), vreinterpret_u16_u8(_src23t_a.val[1]));
5685 uint16x4x2_t _src46tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[0]), vreinterpret_u16_u8(_src67t_a.val[0]));
5686 uint16x4x2_t _src57tt_a = vtrn_u16(vreinterpret_u16_u8(_src45t_a.val[1]), vreinterpret_u16_u8(_src67t_a.val[1]));
5687
5688 uint32x2x2_t _src04ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[0]), vreinterpret_u32_u16(_src46tt_r.val[0]));
5689 uint32x2x2_t _src15ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[0]), vreinterpret_u32_u16(_src57tt_r.val[0]));
5690 uint32x2x2_t _src26ttt_r = vtrn_u32(vreinterpret_u32_u16(_src02tt_r.val[1]), vreinterpret_u32_u16(_src46tt_r.val[1]));
5691 uint32x2x2_t _src37ttt_r = vtrn_u32(vreinterpret_u32_u16(_src13tt_r.val[1]), vreinterpret_u32_u16(_src57tt_r.val[1]));
5692
5693 uint32x2x2_t _src04ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[0]), vreinterpret_u32_u16(_src46tt_g.val[0]));
5694 uint32x2x2_t _src15ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[0]), vreinterpret_u32_u16(_src57tt_g.val[0]));
5695 uint32x2x2_t _src26ttt_g = vtrn_u32(vreinterpret_u32_u16(_src02tt_g.val[1]), vreinterpret_u32_u16(_src46tt_g.val[1]));
5696 uint32x2x2_t _src37ttt_g = vtrn_u32(vreinterpret_u32_u16(_src13tt_g.val[1]), vreinterpret_u32_u16(_src57tt_g.val[1]));
5697
5698 uint32x2x2_t _src04ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[0]), vreinterpret_u32_u16(_src46tt_b.val[0]));
5699 uint32x2x2_t _src15ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[0]), vreinterpret_u32_u16(_src57tt_b.val[0]));
5700 uint32x2x2_t _src26ttt_b = vtrn_u32(vreinterpret_u32_u16(_src02tt_b.val[1]), vreinterpret_u32_u16(_src46tt_b.val[1]));
5701 uint32x2x2_t _src37ttt_b = vtrn_u32(vreinterpret_u32_u16(_src13tt_b.val[1]), vreinterpret_u32_u16(_src57tt_b.val[1]));
5702
5703 uint32x2x2_t _src04ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[0]), vreinterpret_u32_u16(_src46tt_a.val[0]));
5704 uint32x2x2_t _src15ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[0]), vreinterpret_u32_u16(_src57tt_a.val[0]));
5705 uint32x2x2_t _src26ttt_a = vtrn_u32(vreinterpret_u32_u16(_src02tt_a.val[1]), vreinterpret_u32_u16(_src46tt_a.val[1]));
5706 uint32x2x2_t _src37ttt_a = vtrn_u32(vreinterpret_u32_u16(_src13tt_a.val[1]), vreinterpret_u32_u16(_src57tt_a.val[1]));
5707
5708 uint8x8x4_t _dst0;
5709 uint8x8x4_t _dst1;
5710 uint8x8x4_t _dst2;
5711 uint8x8x4_t _dst3;
5712 uint8x8x4_t _dst4;
5713 uint8x8x4_t _dst5;
5714 uint8x8x4_t _dst6;
5715 uint8x8x4_t _dst7;
5716
5717 _dst0.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[0]);
5718 _dst1.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[0]);
5719 _dst2.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[0]);
5720 _dst3.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[0]);
5721 _dst4.val[0] = vreinterpret_u8_u32(_src04ttt_r.val[1]);
5722 _dst5.val[0] = vreinterpret_u8_u32(_src15ttt_r.val[1]);
5723 _dst6.val[0] = vreinterpret_u8_u32(_src26ttt_r.val[1]);
5724 _dst7.val[0] = vreinterpret_u8_u32(_src37ttt_r.val[1]);
5725
5726 _dst0.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[0]);
5727 _dst1.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[0]);
5728 _dst2.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[0]);
5729 _dst3.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[0]);
5730 _dst4.val[1] = vreinterpret_u8_u32(_src04ttt_g.val[1]);
5731 _dst5.val[1] = vreinterpret_u8_u32(_src15ttt_g.val[1]);
5732 _dst6.val[1] = vreinterpret_u8_u32(_src26ttt_g.val[1]);
5733 _dst7.val[1] = vreinterpret_u8_u32(_src37ttt_g.val[1]);
5734
5735 _dst0.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[0]);
5736 _dst1.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[0]);
5737 _dst2.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[0]);
5738 _dst3.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[0]);
5739 _dst4.val[2] = vreinterpret_u8_u32(_src04ttt_b.val[1]);
5740 _dst5.val[2] = vreinterpret_u8_u32(_src15ttt_b.val[1]);
5741 _dst6.val[2] = vreinterpret_u8_u32(_src26ttt_b.val[1]);
5742 _dst7.val[2] = vreinterpret_u8_u32(_src37ttt_b.val[1]);
5743
5744 _dst0.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[0]);
5745 _dst1.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[0]);
5746 _dst2.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[0]);
5747 _dst3.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[0]);
5748 _dst4.val[3] = vreinterpret_u8_u32(_src04ttt_a.val[1]);
5749 _dst5.val[3] = vreinterpret_u8_u32(_src15ttt_a.val[1]);
5750 _dst6.val[3] = vreinterpret_u8_u32(_src26ttt_a.val[1]);
5751 _dst7.val[3] = vreinterpret_u8_u32(_src37ttt_a.val[1]);
5752
5753 vst4_u8(dst7, _dst0);
5754 vst4_u8(dst6, _dst1);
5755 vst4_u8(dst7 + dst_step, _dst2);
5756 vst4_u8(dst6 + dst_step, _dst3);
5757 vst4_u8(dst7 + 2 * dst_step, _dst4);
5758 vst4_u8(dst6 + 2 * dst_step, _dst5);
5759 vst4_u8(dst7 + 3 * dst_step, _dst6);
5760 vst4_u8(dst6 + 3 * dst_step, _dst7);
5761
5762 src0 += 4 * 8;
5763 src1 += 4 * 8;
5764
5765 dst7 += 4 * dst_step;
5766 dst6 += 4 * dst_step;
5767 }
5768 #else
5769 if (nn > 0)
5770 {
5771 asm volatile(
5772 "0: \n"
5773 "pld [%1, #256] \n"
5774 "vld4.u8 {d0-d3}, [%1], %10 \n"
5775
5776 "pld [%2, #256] \n"
5777 "vld4.u8 {d4-d7}, [%2], %10 \n"
5778
5779 "pld [%1, #256] \n"
5780 "vld4.u8 {d8-d11}, [%1], %10 \n"
5781
5782 "vtrn.u8 q0, q2 \n" // _src01t_r
5783 "vtrn.u8 q1, q3 \n"
5784
5785 "pld [%2, #256] \n"
5786 "vld4.u8 {d12-d15}, [%2], %10\n"
5787
5788 "pld [%1, #256] \n"
5789 "vld4.u8 {d16-d19}, [%1], %10\n"
5790
5791 "vtrn.u8 q4, q6 \n" // _src23t_r
5792 "vtrn.u8 q5, q7 \n"
5793
5794 "pld [%2, #256] \n"
5795 "vld4.u8 {d20-d23}, [%2], %10\n"
5796
5797 "pld [%1, #256] \n"
5798 "vld4.u8 {d24-d27}, [%1], %10\n"
5799
5800 "vtrn.u8 q8, q10 \n" // _src45t_r
5801 "vtrn.u8 q9, q11 \n"
5802
5803 "pld [%2, #256] \n"
5804 "vld4.u8 {d28-d31}, [%2], %10\n"
5805
5806 "vtrn.u8 q12, q14 \n" // _src67t_r
5807 "vtrn.u8 q13, q15 \n"
5808
5809 "sub %1, %1, %10, lsl #2 \n" // restore src0
5810
5811 "vtrn.u16 q0, q4 \n" // _src02tt_r
5812 "vtrn.u16 q1, q5 \n"
5813
5814 "sub %2, %2, %10, lsl #2 \n" // restore src1
5815
5816 "vtrn.u16 q2, q6 \n" // _src13tt_r
5817 "vtrn.u16 q3, q7 \n"
5818
5819 "add %1, #32 \n" // src0 += 32
5820
5821 "vtrn.u16 q8, q12 \n" // _src46tt_r
5822 "vtrn.u16 q9, q13 \n"
5823
5824 "add %2, #32 \n" // src1 += 32
5825
5826 "vtrn.u16 q10, q14 \n" // _src57tt_r
5827 "vtrn.u16 q11, q15 \n"
5828
5829 "vtrn.u32 q0, q8 \n" // _src04ttt_r
5830 "vtrn.u32 q1, q9 \n"
5831
5832 "vtrn.u32 q2, q10 \n" // _src15ttt_r
5833 "vst4.u8 {d0-d3}, [%3], %11 \n"
5834 "vtrn.u32 q3, q11 \n"
5835
5836 "vtrn.u32 q4, q12 \n" // _src26ttt_r
5837 "vst4.u8 {d4-d7}, [%4], %11 \n"
5838 "vtrn.u32 q5, q13 \n"
5839
5840 "vtrn.u32 q6, q14 \n" // _src37ttt_r
5841 "vst4.u8 {d8-d11}, [%3], %11 \n"
5842 "vtrn.u32 q7, q15 \n"
5843
5844 "subs %0, #1 \n"
5845
5846 "vst4.u8 {d16-d19}, [%3], %11\n"
5847 "vst4.u8 {d12-d15}, [%4], %11\n"
5848 "vst4.u8 {d20-d23}, [%4], %11\n"
5849 "vst4.u8 {d24-d27}, [%3], %11\n"
5850 "vst4.u8 {d28-d31}, [%4], %11\n"
5851
5852 "bne 0b \n"
5853 : "=r"(nn), // %0
5854 "=r"(src0), // %1
5855 "=r"(src1), // %2
5856 "=r"(dst7), // %3
5857 "=r"(dst6) // %4
5858 : "0"(nn),
5859 "1"(src0),
5860 "2"(src1),
5861 "3"(dst7),
5862 "4"(dst6),
5863 "r"(src_step), // %10
5864 "r"(dst_step) // %11
5865 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
5866 }
5867 #endif // __aarch64__
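        // scalar tail: each remaining source column becomes a run of eight 4-channel destination pixels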
5868 for (; remain > 0; remain--)
5869 {
5870 dst7[0] = src0[0];
5871 dst7[1] = src0[1];
5872 dst7[2] = src0[2];
5873 dst7[3] = src0[3];
5874 dst7[4] = src1[0];
5875 dst7[5] = src1[1];
5876 dst7[6] = src1[2];
5877 dst7[7] = src1[3];
5878 dst7[8] = src0[0 + src_step];
5879 dst7[9] = src0[1 + src_step];
5880 dst7[10] = src0[2 + src_step];
5881 dst7[11] = src0[3 + src_step];
5882 dst7[12] = src1[0 + src_step];
5883 dst7[13] = src1[1 + src_step];
5884 dst7[14] = src1[2 + src_step];
5885 dst7[15] = src1[3 + src_step];
5886 dst7[16] = src0[0 + 2 * src_step];
5887 dst7[17] = src0[1 + 2 * src_step];
5888 dst7[18] = src0[2 + 2 * src_step];
5889 dst7[19] = src0[3 + 2 * src_step];
5890 dst7[20] = src1[0 + 2 * src_step];
5891 dst7[21] = src1[1 + 2 * src_step];
5892 dst7[22] = src1[2 + 2 * src_step];
5893 dst7[23] = src1[3 + 2 * src_step];
5894 dst7[24] = src0[0 + 3 * src_step];
5895 dst7[25] = src0[1 + 3 * src_step];
5896 dst7[26] = src0[2 + 3 * src_step];
5897 dst7[27] = src0[3 + 3 * src_step];
5898 dst7[28] = src1[0 + 3 * src_step];
5899 dst7[29] = src1[1 + 3 * src_step];
5900 dst7[30] = src1[2 + 3 * src_step];
5901 dst7[31] = src1[3 + 3 * src_step];
5902
5903 src0 += 4;
5904 src1 += 4;
5905
5906 dst7 -= stride;
5907 }
5908
5909 src0 += srcwgap + 7 * srcstride;
5910 }
5911 #endif // __ARM_NEON
5912 for (; y < srch; y++)
5913 {
5914 unsigned char* dst0 = dstend + y * 4;
5915
5916 int x = 0;
5917 for (; x < srcw; x++)
5918 {
5919 dst0[0] = src0[0];
5920 dst0[1] = src0[1];
5921 dst0[2] = src0[2];
5922 dst0[3] = src0[3];
5923
5924 src0 += 4;
5925 dst0 -= stride;
5926 }
5927
5928 src0 += srcwgap;
5929 }
5930 }
5931
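// public entry points
// the rotate types follow the EXIF orientation convention:
//   1 = none        2 = flip horizontally    3 = rotate 180    4 = flip vertically
//   5 = transpose   6 = rotate 90 clockwise  7 = transverse    8 = rotate 90 counter-clockwise
// the stride-less overloads assume tightly packed rows (stride = width * channels)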
5932 void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
5933 {
5934 return kanna_rotate_c1(src, srcw, srch, srcw, dst, w, h, w, type);
5935 }
5936
5937 void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
5938 {
5939 return kanna_rotate_c2(src, srcw, srch, srcw * 2, dst, w, h, w * 2, type);
5940 }
5941
5942 void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
5943 {
5944 return kanna_rotate_c3(src, srcw, srch, srcw * 3, dst, w, h, w * 3, type);
5945 }
5946
5947 void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
5948 {
5949 return kanna_rotate_c4(src, srcw, srch, srcw * 4, dst, w, h, w * 4, type);
5950 }
5951
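// strided variants: dispatch to the per-type kernel; the commented asserts below
// document the expected source/destination geometry for each type group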
5952 void kanna_rotate_c1(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
5953 {
5954 // assert srcw == w && srch == h for type 1234
5955 // assert srcw == h && srch == w for type 5678
5956
5957 switch (type)
5958 {
5959 case 1:
5960 kanna_rotate_1_c1(src, srcw, srch, srcstride, dst, w, h, stride);
5961 break;
5962 case 2:
5963 kanna_rotate_2_c1(src, srcw, srch, srcstride, dst, w, h, stride);
5964 break;
5965 case 3:
5966 kanna_rotate_3_c1(src, srcw, srch, srcstride, dst, w, h, stride);
5967 break;
5968 case 4:
5969 kanna_rotate_4_c1(src, srcw, srch, srcstride, dst, w, h, stride);
5970 break;
5971 case 5:
5972 kanna_rotate_5_c1(src, srcw, srch, srcstride, dst, w, h, stride);
5973 break;
5974 case 6:
5975 kanna_rotate_6_c1(src, srcw, srch, srcstride, dst, w, h, stride);
5976 break;
5977 case 7:
5978 kanna_rotate_7_c1(src, srcw, srch, srcstride, dst, w, h, stride);
5979 break;
5980 case 8:
5981 kanna_rotate_8_c1(src, srcw, srch, srcstride, dst, w, h, stride);
5982 break;
5983 default:
5984 // unsupported rotate type
5985 break;
5986 }
5987 }
5988
5989 void kanna_rotate_c2(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
5990 {
5991 // assert srcw == w && srch == h for type 1234
5992 // assert srcw == h && srch == w for type 5678
5993
5994 switch (type)
5995 {
5996 case 1:
5997 kanna_rotate_1_c2(src, srcw, srch, srcstride, dst, w, h, stride);
5998 break;
5999 case 2:
6000 kanna_rotate_2_c2(src, srcw, srch, srcstride, dst, w, h, stride);
6001 break;
6002 case 3:
6003 kanna_rotate_3_c2(src, srcw, srch, srcstride, dst, w, h, stride);
6004 break;
6005 case 4:
6006 kanna_rotate_4_c2(src, srcw, srch, srcstride, dst, w, h, stride);
6007 break;
6008 case 5:
6009 kanna_rotate_5_c2(src, srcw, srch, srcstride, dst, w, h, stride);
6010 break;
6011 case 6:
6012 kanna_rotate_6_c2(src, srcw, srch, srcstride, dst, w, h, stride);
6013 break;
6014 case 7:
6015 kanna_rotate_7_c2(src, srcw, srch, srcstride, dst, w, h, stride);
6016 break;
6017 case 8:
6018 kanna_rotate_8_c2(src, srcw, srch, srcstride, dst, w, h, stride);
6019 break;
6020 default:
6021 // unsupported rotate type
6022 break;
6023 }
6024 }
6025
6026 void kanna_rotate_c3(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
6027 {
6028 // assert srcw == w && srch == h for type 1234
6029 // assert srcw == h && srch == w for type 5678
6030
6031 switch (type)
6032 {
6033 case 1:
6034 kanna_rotate_1_c3(src, srcw, srch, srcstride, dst, w, h, stride);
6035 break;
6036 case 2:
6037 kanna_rotate_2_c3(src, srcw, srch, srcstride, dst, w, h, stride);
6038 break;
6039 case 3:
6040 kanna_rotate_3_c3(src, srcw, srch, srcstride, dst, w, h, stride);
6041 break;
6042 case 4:
6043 kanna_rotate_4_c3(src, srcw, srch, srcstride, dst, w, h, stride);
6044 break;
6045 case 5:
6046 kanna_rotate_5_c3(src, srcw, srch, srcstride, dst, w, h, stride);
6047 break;
6048 case 6:
6049 kanna_rotate_6_c3(src, srcw, srch, srcstride, dst, w, h, stride);
6050 break;
6051 case 7:
6052 kanna_rotate_7_c3(src, srcw, srch, srcstride, dst, w, h, stride);
6053 break;
6054 case 8:
6055 kanna_rotate_8_c3(src, srcw, srch, srcstride, dst, w, h, stride);
6056 break;
6057 default:
6058 // unsupported rotate type
6059 break;
6060 }
6061 }
6062
6063 void kanna_rotate_c4(const unsigned char* src, int srcw, int srch, int srcstride, unsigned char* dst, int w, int h, int stride, int type)
6064 {
6065 // assert srcw == w && srch == h for type 1234
6066 // assert srcw == h && srch == w for type 5678
6067
6068 switch (type)
6069 {
6070 case 1:
6071 kanna_rotate_1_c4(src, srcw, srch, srcstride, dst, w, h, stride);
6072 break;
6073 case 2:
6074 kanna_rotate_2_c4(src, srcw, srch, srcstride, dst, w, h, stride);
6075 break;
6076 case 3:
6077 kanna_rotate_3_c4(src, srcw, srch, srcstride, dst, w, h, stride);
6078 break;
6079 case 4:
6080 kanna_rotate_4_c4(src, srcw, srch, srcstride, dst, w, h, stride);
6081 break;
6082 case 5:
6083 kanna_rotate_5_c4(src, srcw, srch, srcstride, dst, w, h, stride);
6084 break;
6085 case 6:
6086 kanna_rotate_6_c4(src, srcw, srch, srcstride, dst, w, h, stride);
6087 break;
6088 case 7:
6089 kanna_rotate_7_c4(src, srcw, srch, srcstride, dst, w, h, stride);
6090 break;
6091 case 8:
6092 kanna_rotate_8_c4(src, srcw, srch, srcstride, dst, w, h, stride);
6093 break;
6094 default:
6095 // unsupported rotate type
6096 break;
6097 }
6098 }
6099
6100 void kanna_rotate_yuv420sp(const unsigned char* src, int srcw, int srch, unsigned char* dst, int w, int h, int type)
6101 {
6102 // assert srcw % 2 == 0
6103 // assert srch % 2 == 0
6104 // assert w % 2 == 0
6105 // assert h % 2 == 0
6106
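    // yuv420sp (NV12/NV21) is a full-resolution Y plane followed by an interleaved
    // half-resolution UV plane; rotating UV as 2-channel pixels keeps each U/V pair together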
6107 const unsigned char* srcY = src;
6108 unsigned char* dstY = dst;
6109 kanna_rotate_c1(srcY, srcw, srch, dstY, w, h, type);
6110
6111 const unsigned char* srcUV = src + srcw * srch;
6112 unsigned char* dstUV = dst + w * h;
6113 kanna_rotate_c2(srcUV, srcw / 2, srch / 2, dstUV, w / 2, h / 2, type);
6114 }
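
// minimal usage sketch (illustrative only; the buffer names below are hypothetical):
//
//   // rotate a tightly packed RGB image 90 degrees clockwise (type 6); width and height swap
//   std::vector<unsigned char> rotated(srcw * srch * 3);
//   ncnn::kanna_rotate_c3(rgb, srcw, srch, rotated.data(), srch, srcw, 6);
//
//   // rotate an NV21 frame the same way; the Y plane and interleaved UV plane are handled internally
//   std::vector<unsigned char> nv21_rotated(srcw * srch * 3 / 2);
//   ncnn::kanna_rotate_yuv420sp(nv21, srcw, srch, nv21_rotated.data(), srch, srcw, 6);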
6115 #endif // NCNN_PIXEL_ROTATE
6116
6117 } // namespace ncnn
6118