/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

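/* diff = src - pred for a 4x4 block. */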
static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
                            const uint8_t *pred_ptr, int32_t pred_stride,
                            int16_t *diff_ptr, int32_t diff_stride) {
  uint32_t src0, src1, src2, src3;
  uint32_t pred0, pred1, pred2, pred3;
  v16i8 src = { 0 };
  v16i8 pred = { 0 };
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  /* Load four 32-bit rows each of source and prediction, then pack them
     into one 16-byte vector apiece. */
  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
  /* Interleave src with pred and horizontally subtract the byte pairs,
     yielding 16-bit differences. */
  ILVRL_B2_UB(src, pred, src_l0, src_l1);
  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
  /* Each 64-bit half of diff0/diff1 is one 4-sample row; the byte stride
     is 2 * diff_stride because diff is 16 bits per sample. */
  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
}

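/* diff = src - pred for an 8x8 block, two rows per iteration. */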
static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                            const uint8_t *pred_ptr, int32_t pred_stride,
                            int16_t *diff_ptr, int32_t diff_stride) {
  uint32_t loop_cnt;
  uint64_t src0, src1, pred0, pred1;
  v16i8 src = { 0 };
  v16i8 pred = { 0 };
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  /* Two 8-pixel rows are packed into one vector per iteration. */
  for (loop_cnt = 4; loop_cnt--;) {
    LD2(src_ptr, src_stride, src0, src1);
    src_ptr += (2 * src_stride);
    LD2(pred_ptr, pred_stride, pred0, pred1);
    pred_ptr += (2 * pred_stride);

    INSERT_D2_SB(src0, src1, src);
    INSERT_D2_SB(pred0, pred1, pred);
    ILVRL_B2_UB(src, pred, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
    diff_ptr += (2 * diff_stride);
  }
}

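/* diff = src - pred for a 16x16 block; one 16-byte vector covers a full
   row, and eight rows are loaded per pass. */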
static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  int8_t count;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  /* Two passes of eight rows each; the per-row work is fully unrolled. */
  for (count = 2; count--;) {
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
           pred7);
    pred += (8 * pred_stride);

    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    diff += diff_stride;
  }
}

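/* diff = src - pred for a 32x32 block; each row spans two 16-byte
   vectors, and four rows are handled per iteration. */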
static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 8; loop_cnt--;) {
    /* Load four rows of source and prediction, two vectors per row. */
    LD_SB2(src, 16, src0, src1);
    src += src_stride;
    LD_SB2(src, 16, src2, src3);
    src += src_stride;
    LD_SB2(src, 16, src4, src5);
    src += src_stride;
    LD_SB2(src, 16, src6, src7);
    src += src_stride;

    LD_SB2(pred, 16, pred0, pred1);
    pred += pred_stride;
    LD_SB2(pred, 16, pred2, pred3);
    pred += pred_stride;
    LD_SB2(pred, 16, pred4, pred5);
    pred += pred_stride;
    LD_SB2(pred, 16, pred6, pred7);
    pred += pred_stride;

    /* The second vector of each row stores at an offset of 16 samples. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    diff += diff_stride;
  }
}

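/* diff = src - pred for a 64x64 block; each row spans four 16-byte
   vectors, and two rows are handled per iteration. */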
static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *pred, int32_t pred_stride,
                              int16_t *diff, int32_t diff_stride) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 src_l0, src_l1;
  v8i16 diff0, diff1;

  for (loop_cnt = 32; loop_cnt--;) {
    /* Load two rows of source and prediction, four vectors per row. */
    LD_SB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_SB4(src, 16, src4, src5, src6, src7);
    src += src_stride;

    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
    pred += pred_stride;
    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
    pred += pred_stride;

    /* Successive vectors of a row store at offsets of 16 samples. */
    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;

    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff, 8);
    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 16, 8);
    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 32, 8);
    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
    ST_SH2(diff0, diff1, diff + 48, 8);
    diff += diff_stride;
  }
}

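/* Entry point: dispatches square block sizes to the MSA kernels above and
   falls back to the generic C implementation otherwise. */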
void vpx_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
                            ptrdiff_t pred_stride) {
  if (rows == cols) {
    switch (rows) {
      case 4:
        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
        break;
      case 8:
        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                        diff_stride);
        break;
      case 16:
        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        break;
      case 32:
        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        break;
      case 64:
        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
                          diff_stride);
        break;
      default:
        /* Square sizes without a dedicated MSA path use the C fallback. */
        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
                             src_stride, pred_ptr, pred_stride);
        break;
    }
  } else {
    /* Non-square blocks always use the C fallback. */
    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                         pred_ptr, pred_stride);
  }
}