1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include "./aom_dsp_rtcd.h"
13 #include "aom_dsp/mips/macros_msa.h"
14 
sub_blk_4x4_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * pred_ptr,int32_t pred_stride,int16_t * diff_ptr,int32_t diff_stride)15 static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
16                             const uint8_t *pred_ptr, int32_t pred_stride,
17                             int16_t *diff_ptr, int32_t diff_stride) {
18   uint32_t src0, src1, src2, src3;
19   uint32_t pred0, pred1, pred2, pred3;
20   v16i8 src = { 0 };
21   v16i8 pred = { 0 };
22   v16u8 src_l0, src_l1;
23   v8i16 diff0, diff1;
24 
25   LW4(src_ptr, src_stride, src0, src1, src2, src3);
26   LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
27   INSERT_W4_SB(src0, src1, src2, src3, src);
28   INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
29   ILVRL_B2_UB(src, pred, src_l0, src_l1);
30   HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
31   ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
32 }
33 
sub_blk_8x8_msa(const uint8_t * src_ptr,int32_t src_stride,const uint8_t * pred_ptr,int32_t pred_stride,int16_t * diff_ptr,int32_t diff_stride)34 static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
35                             const uint8_t *pred_ptr, int32_t pred_stride,
36                             int16_t *diff_ptr, int32_t diff_stride) {
37   uint32_t loop_cnt;
38   uint64_t src0, src1, pred0, pred1;
39   v16i8 src = { 0 };
40   v16i8 pred = { 0 };
41   v16u8 src_l0, src_l1;
42   v8i16 diff0, diff1;
43 
44   for (loop_cnt = 4; loop_cnt--;) {
45     LD2(src_ptr, src_stride, src0, src1);
46     src_ptr += (2 * src_stride);
47     LD2(pred_ptr, pred_stride, pred0, pred1);
48     pred_ptr += (2 * pred_stride);
49 
50     INSERT_D2_SB(src0, src1, src);
51     INSERT_D2_SB(pred0, pred1, pred);
52     ILVRL_B2_UB(src, pred, src_l0, src_l1);
53     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
54     ST_SH2(diff0, diff1, diff_ptr, diff_stride);
55     diff_ptr += (2 * diff_stride);
56   }
57 }
58 
sub_blk_16x16_msa(const uint8_t * src,int32_t src_stride,const uint8_t * pred,int32_t pred_stride,int16_t * diff,int32_t diff_stride)59 static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
60                               const uint8_t *pred, int32_t pred_stride,
61                               int16_t *diff, int32_t diff_stride) {
62   int8_t count;
63   v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
64   v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
65   v16u8 src_l0, src_l1;
66   v8i16 diff0, diff1;
67 
68   for (count = 2; count--;) {
69     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
70     src += (8 * src_stride);
71 
72     LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
73            pred7);
74     pred += (8 * pred_stride);
75 
76     ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
77     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
78     ST_SH2(diff0, diff1, diff, 8);
79     diff += diff_stride;
80 
81     ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
82     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
83     ST_SH2(diff0, diff1, diff, 8);
84     diff += diff_stride;
85 
86     ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
87     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
88     ST_SH2(diff0, diff1, diff, 8);
89     diff += diff_stride;
90 
91     ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
92     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
93     ST_SH2(diff0, diff1, diff, 8);
94     diff += diff_stride;
95 
96     ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
97     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
98     ST_SH2(diff0, diff1, diff, 8);
99     diff += diff_stride;
100 
101     ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
102     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
103     ST_SH2(diff0, diff1, diff, 8);
104     diff += diff_stride;
105 
106     ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
107     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
108     ST_SH2(diff0, diff1, diff, 8);
109     diff += diff_stride;
110 
111     ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
112     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
113     ST_SH2(diff0, diff1, diff, 8);
114     diff += diff_stride;
115   }
116 }
117 
sub_blk_32x32_msa(const uint8_t * src,int32_t src_stride,const uint8_t * pred,int32_t pred_stride,int16_t * diff,int32_t diff_stride)118 static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
119                               const uint8_t *pred, int32_t pred_stride,
120                               int16_t *diff, int32_t diff_stride) {
121   uint32_t loop_cnt;
122   v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
123   v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
124   v16u8 src_l0, src_l1;
125   v8i16 diff0, diff1;
126 
127   for (loop_cnt = 8; loop_cnt--;) {
128     LD_SB2(src, 16, src0, src1);
129     src += src_stride;
130     LD_SB2(src, 16, src2, src3);
131     src += src_stride;
132     LD_SB2(src, 16, src4, src5);
133     src += src_stride;
134     LD_SB2(src, 16, src6, src7);
135     src += src_stride;
136 
137     LD_SB2(pred, 16, pred0, pred1);
138     pred += pred_stride;
139     LD_SB2(pred, 16, pred2, pred3);
140     pred += pred_stride;
141     LD_SB2(pred, 16, pred4, pred5);
142     pred += pred_stride;
143     LD_SB2(pred, 16, pred6, pred7);
144     pred += pred_stride;
145 
146     ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
147     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
148     ST_SH2(diff0, diff1, diff, 8);
149     ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
150     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
151     ST_SH2(diff0, diff1, diff + 16, 8);
152     diff += diff_stride;
153 
154     ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
155     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
156     ST_SH2(diff0, diff1, diff, 8);
157     ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
158     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
159     ST_SH2(diff0, diff1, diff + 16, 8);
160     diff += diff_stride;
161 
162     ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
163     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
164     ST_SH2(diff0, diff1, diff, 8);
165     ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
166     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
167     ST_SH2(diff0, diff1, diff + 16, 8);
168     diff += diff_stride;
169 
170     ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
171     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
172     ST_SH2(diff0, diff1, diff, 8);
173     ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
174     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
175     ST_SH2(diff0, diff1, diff + 16, 8);
176     diff += diff_stride;
177   }
178 }
179 
sub_blk_64x64_msa(const uint8_t * src,int32_t src_stride,const uint8_t * pred,int32_t pred_stride,int16_t * diff,int32_t diff_stride)180 static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
181                               const uint8_t *pred, int32_t pred_stride,
182                               int16_t *diff, int32_t diff_stride) {
183   uint32_t loop_cnt;
184   v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
185   v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
186   v16u8 src_l0, src_l1;
187   v8i16 diff0, diff1;
188 
189   for (loop_cnt = 32; loop_cnt--;) {
190     LD_SB4(src, 16, src0, src1, src2, src3);
191     src += src_stride;
192     LD_SB4(src, 16, src4, src5, src6, src7);
193     src += src_stride;
194 
195     LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
196     pred += pred_stride;
197     LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
198     pred += pred_stride;
199 
200     ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
201     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
202     ST_SH2(diff0, diff1, diff, 8);
203     ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
204     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
205     ST_SH2(diff0, diff1, diff + 16, 8);
206     ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
207     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
208     ST_SH2(diff0, diff1, diff + 32, 8);
209     ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
210     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
211     ST_SH2(diff0, diff1, diff + 48, 8);
212     diff += diff_stride;
213 
214     ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
215     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
216     ST_SH2(diff0, diff1, diff, 8);
217     ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
218     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
219     ST_SH2(diff0, diff1, diff + 16, 8);
220     ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
221     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
222     ST_SH2(diff0, diff1, diff + 32, 8);
223     ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
224     HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
225     ST_SH2(diff0, diff1, diff + 48, 8);
226     diff += diff_stride;
227   }
228 }
229 
/* Public entry point: compute diff = src - pred for a rows x cols block.
 * Square power-of-two sizes (4..64) dispatch to the MSA-optimized
 * kernels above; anything else falls back to the C reference. */
void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
                            ptrdiff_t pred_stride) {
  /* Non-square shapes have no MSA kernel: delegate immediately. */
  if (rows != cols) {
    aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                         pred_ptr, pred_stride);
    return;
  }

  switch (rows) {
    case 4:
      sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      break;
    case 8:
      sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                      diff_stride);
      break;
    case 16:
      sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      break;
    case 32:
      sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      break;
    case 64:
      sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
      break;
    default:
      /* Square but unsupported size: fall back to the C reference. */
      aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
                           src_stride, pred_ptr, pred_stride);
      break;
  }
}
266