1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "./aom_dsp_rtcd.h"
13 #include "aom_dsp/mips/macros_msa.h"
14
/* Compute diff = src - pred for a 4x4 block of 8-bit pixels, producing
 * signed 16-bit differences (dispatched from aom_subtract_block_msa for
 * rows == cols == 4). */
static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
                            const uint8_t *pred_ptr, int32_t pred_stride,
                            int16_t *diff_ptr, int32_t diff_stride) {
  uint32_t src0, src1, src2, src3;
  uint32_t pred0, pred1, pred2, pred3;
  v16i8 src = { 0 };
  v16i8 pred = { 0 };
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  /* Load four 4-byte rows of source and prediction as 32-bit words. */
  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
  /* Pack the four rows into a single 16-byte vector each. */
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
  /* Interleave src/pred bytes (low and high halves), then horizontally
   * subtract each interleaved pair to get 16-bit differences. */
  ILVRL_B2_UB(src, pred, src_l0, src_l1);
  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  /* Store the 4 rows of 4 halfwords. NOTE(review): stride is doubled here —
   * presumably converting the int16 element stride into the byte stride the
   * store macro expects; confirm against macros_msa.h. */
  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
}
33
/* Compute diff = src - pred for an 8x8 block of 8-bit pixels, producing
 * signed 16-bit differences. Processes two rows per iteration, four
 * iterations total. */
static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                            const uint8_t *pred_ptr, int32_t pred_stride,
                            int16_t *diff_ptr, int32_t diff_stride) {
  uint32_t loop_cnt;
  uint64_t src0, src1, pred0, pred1;
  v16i8 src = { 0 };
  v16i8 pred = { 0 };
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 4; loop_cnt--;) {
    /* Load two 8-byte rows of source and of prediction. */
    LD2(src_ptr, src_stride, src0, src1);
    src_ptr += (2 * src_stride);
    LD2(pred_ptr, pred_stride, pred0, pred1);
    pred_ptr += (2 * pred_stride);

    /* Pack the two rows into one 16-byte vector each. */
    INSERT_D2_SB(src0, src1, src);
    INSERT_D2_SB(pred0, pred1, pred);
    /* Interleave src/pred bytes and horizontally subtract to get two
     * vectors of eight 16-bit differences (one per row). */
    ILVRL_B2_UB(src, pred, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    /* Store the two difference rows, then advance past them. */
    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
    diff_ptr += (2 * diff_stride);
  }
}
58
/* Compute diff = src - pred for a 16x16 block of 8-bit pixels, producing
 * signed 16-bit differences. Processes 8 full rows per iteration, two
 * iterations total; each row's 16 differences are stored as two 8-halfword
 * vectors (second at offset 8 within the row). */
static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  int8_t count;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (count = 2; count--;) {
    /* Load 8 rows (16 bytes each) of source and of prediction. */
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
           pred7);
    pred += (8 * pred_stride);

    /* For each row: interleave src/pred bytes, horizontally subtract the
     * pairs, and store the resulting 16 halfword differences. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;
  }
}
117
/* Compute diff = src - pred for a 32x32 block of 8-bit pixels, producing
 * signed 16-bit differences. Processes 4 rows per iteration (each row is
 * two 16-byte vector loads), eight iterations total. Each row's 32
 * differences are stored as four 8-halfword vectors. */
static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 8; loop_cnt--;) {
    /* Load 4 rows of source, two 16-byte vectors per 32-pixel row. */
    LD_SB2(src, 16, src0, src1);
    src += src_stride;
    LD_SB2(src, 16, src2, src3);
    src += src_stride;
    LD_SB2(src, 16, src4, src5);
    src += src_stride;
    LD_SB2(src, 16, src6, src7);
    src += src_stride;

    /* Load the matching 4 rows of prediction. */
    LD_SB2(pred, 16, pred0, pred1);
    pred += pred_stride;
    LD_SB2(pred, 16, pred2, pred3);
    pred += pred_stride;
    LD_SB2(pred, 16, pred4, pred5);
    pred += pred_stride;
    LD_SB2(pred, 16, pred6, pred7);
    pred += pred_stride;

    /* Per row: interleave src/pred bytes, horizontally subtract, and store
     * the left 16 differences at diff and the right 16 at diff + 16. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;
  }
}
179
/* Compute diff = src - pred for a 64x64 block of 8-bit pixels, producing
 * signed 16-bit differences. Processes 2 rows per iteration (each row is
 * four 16-byte vector loads), 32 iterations total. Each row's 64
 * differences are stored as eight 8-halfword vectors at offsets
 * 0/16/32/48 within the row. */
static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 32; loop_cnt--;) {
    /* Load 2 rows of source, four 16-byte vectors per 64-pixel row. */
    LD_SB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_SB4(src, 16, src4, src5, src6, src7);
    src += src_stride;

    /* Load the matching 2 rows of prediction. */
    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
    pred += pred_stride;
    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
    pred += pred_stride;

    /* First row: interleave, horizontally subtract, and store the four
     * 16-difference segments. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;

    /* Second row, same pattern. */
    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;
  }
}
229
/* Compute the pixel-wise difference diff = src - pred for a rows x cols
 * block of 8-bit pixels, writing signed 16-bit results.
 *
 * Square blocks of size 4, 8, 16, 32, or 64 are dispatched to the
 * MSA-optimized kernels in this file; any other shape or size falls back
 * to the generic C implementation aom_subtract_block_c. */
void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
                            ptrdiff_t pred_stride) {
  if (rows == cols) {
    switch (rows) {
      case 4:
        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
        return;
      case 8:
        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
        return;
      case 16:
        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        return;
      case 32:
        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        return;
      case 64:
        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        return;
      default: break; /* unsupported square size — fall through to C */
    }
  }
  /* Non-square or unsupported block: scalar C fallback. */
  aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                       pred_ptr, pred_stride);
}
266