1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h263dsp_mips.h"
23 
h263_dct_unquantize_msa(int16_t * block,int16_t qmul,int16_t qadd,int8_t n_coeffs,uint8_t loop_start)24 static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
25                                     int16_t qadd, int8_t n_coeffs,
26                                     uint8_t loop_start)
27 {
28     int16_t *block_dup = block;
29     int32_t level, cnt;
30     v8i16 block_vec, qmul_vec, qadd_vec, sub;
31     v8i16 add, mask, mul, zero_mask;
32 
33     qmul_vec = __msa_fill_h(qmul);
34     qadd_vec = __msa_fill_h(qadd);
35     for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
36         block_vec = LD_SH(block_dup + loop_start);
37         mask = __msa_clti_s_h(block_vec, 0);
38         zero_mask = __msa_ceqi_h(block_vec, 0);
39         mul = block_vec * qmul_vec;
40         sub = mul - qadd_vec;
41         add = mul + qadd_vec;
42         add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
43         block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
44                                          (v16u8) zero_mask);
45         ST_SH(block_vec, block_dup + loop_start);
46         block_dup += 8;
47     }
48 
49     cnt = ((n_coeffs >> 3) * 8) + loop_start;
50 
51     for (; cnt <= n_coeffs; cnt++) {
52         level = block[cnt];
53         if (level) {
54             if (level < 0) {
55                 level = level * qmul - qadd;
56             } else {
57                 level = level * qmul + qadd;
58             }
59             block[cnt] = level;
60         }
61     }
62 }
63 
mpeg2_dct_unquantize_inter_msa(int16_t * block,int32_t qscale,const int16_t * quant_matrix)64 static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
65                                               int32_t qscale,
66                                               const int16_t *quant_matrix)
67 {
68     int32_t cnt, sum_res = -1;
69     v8i16 block_vec, block_neg, qscale_vec, mask;
70     v8i16 block_org0, block_org1, block_org2, block_org3;
71     v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
72     v8i16 sum, mul, zero_mask;
73     v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
74     v4i32 block_l, block_r, sad;
75 
76     qscale_vec = __msa_fill_h(qscale);
77     for (cnt = 0; cnt < 2; cnt++) {
78         LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
79         LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
80         mask = __msa_clti_s_h(block_org0, 0);
81         zero_mask = __msa_ceqi_h(block_org0, 0);
82         block_neg = -block_org0;
83         block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
84                                          (v16u8) mask);
85         block_vec <<= 1;
86         block_vec += 1;
87         UNPCK_SH_SW(block_vec, block_r, block_l);
88         UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
89         UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
90         mul_vec = block_l * qscale_l;
91         mul_vec *= quant_m_l;
92         block_l = mul_vec >> 4;
93         mul_vec = block_r * qscale_r;
94         mul_vec *= quant_m_r;
95         block_r = mul_vec >> 4;
96         mul = (v8i16) __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
97         block_neg = - mul;
98         sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
99                                    (v16u8) mask);
100         sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
101                                    (v16u8) zero_mask);
102         ST_SH(sum, block);
103         block += 8;
104         quant_matrix += 8;
105         sad = __msa_hadd_s_w(sum, sum);
106         sum_res += HADD_SW_S32(sad);
107         mask = __msa_clti_s_h(block_org1, 0);
108         zero_mask = __msa_ceqi_h(block_org1, 0);
109         block_neg = - block_org1;
110         block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
111                                          (v16u8) mask);
112         block_vec <<= 1;
113         block_vec += 1;
114         UNPCK_SH_SW(block_vec, block_r, block_l);
115         UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
116         UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
117         mul_vec = block_l * qscale_l;
118         mul_vec *= quant_m_l;
119         block_l = mul_vec >> 4;
120         mul_vec = block_r * qscale_r;
121         mul_vec *= quant_m_r;
122         block_r = mul_vec >> 4;
123         mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
124         block_neg = - mul;
125         sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
126                                    (v16u8) mask);
127         sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
128                                    (v16u8) zero_mask);
129         ST_SH(sum, block);
130 
131         block += 8;
132         quant_matrix += 8;
133         sad = __msa_hadd_s_w(sum, sum);
134         sum_res += HADD_SW_S32(sad);
135         mask = __msa_clti_s_h(block_org2, 0);
136         zero_mask = __msa_ceqi_h(block_org2, 0);
137         block_neg = - block_org2;
138         block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
139                                          (v16u8) mask);
140         block_vec <<= 1;
141         block_vec += 1;
142         UNPCK_SH_SW(block_vec, block_r, block_l);
143         UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
144         UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
145         mul_vec = block_l * qscale_l;
146         mul_vec *= quant_m_l;
147         block_l = mul_vec >> 4;
148         mul_vec = block_r * qscale_r;
149         mul_vec *= quant_m_r;
150         block_r = mul_vec >> 4;
151         mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
152         block_neg = - mul;
153         sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
154                                    (v16u8) mask);
155         sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
156                                    (v16u8) zero_mask);
157         ST_SH(sum, block);
158 
159         block += 8;
160         quant_matrix += 8;
161         sad = __msa_hadd_s_w(sum, sum);
162         sum_res += HADD_SW_S32(sad);
163         mask = __msa_clti_s_h(block_org3, 0);
164         zero_mask = __msa_ceqi_h(block_org3, 0);
165         block_neg = - block_org3;
166         block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
167                                          (v16u8) mask);
168         block_vec <<= 1;
169         block_vec += 1;
170         UNPCK_SH_SW(block_vec, block_r, block_l);
171         UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
172         UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
173         mul_vec = block_l * qscale_l;
174         mul_vec *= quant_m_l;
175         block_l = mul_vec >> 4;
176         mul_vec = block_r * qscale_r;
177         mul_vec *= quant_m_r;
178         block_r = mul_vec >> 4;
179         mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
180         block_neg = - mul;
181         sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
182                                    (v16u8) mask);
183         sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
184                                    (v16u8) zero_mask);
185         ST_SH(sum, block);
186 
187         block += 8;
188         quant_matrix += 8;
189         sad = __msa_hadd_s_w(sum, sum);
190         sum_res += HADD_SW_S32(sad);
191     }
192 
193     return sum_res;
194 }
195 
/*
 * Inverse-quantize an H.263 intra block in place.
 *
 * Unless s->h263_aic is set, block[0] is multiplied by s->y_dc_scale
 * (index < 4) or s->c_dc_scale (otherwise) and the bias is
 * (qscale - 1) | 1; with s->h263_aic set, block[0] is left alone and the
 * bias is 0. The remaining coefficients, from index 1 up to the last one
 * (63 when s->ac_pred is set, otherwise looked up in
 * s->inter_scantable.raster_end), are dequantized with a multiplier of
 * 2 * qscale.
 *
 * s       codec context
 * block   coefficients, modified in place
 * index   block number within the macroblock
 * qscale  quantizer scale
 */
void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    const int32_t scale = qscale << 1;
    int32_t bias = 0;
    int32_t last_coeff = 63;

    av_assert2(s->block_last_index[index] >= 0 || s->h263_aic);

    if (!s->h263_aic) {
        block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
        bias = (qscale - 1) | 1;
    }

    if (!s->ac_pred)
        last_coeff = s->inter_scantable.raster_end[s->block_last_index[index]];

    /* Start at 1: block[0] is handled separately above. */
    h263_dct_unquantize_msa(block, scale, bias, last_coeff, 1);
}
220 
/*
 * Inverse-quantize an H.263 inter block in place.
 *
 * All coefficients from index 0 up to the last one (looked up in
 * s->inter_scantable.raster_end for this block's last index) are
 * dequantized with a multiplier of 2 * qscale and a bias of
 * (qscale - 1) | 1.
 *
 * s       codec context
 * block   coefficients, modified in place
 * index   block number within the macroblock
 * qscale  quantizer scale
 */
void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    const int32_t scale = qscale << 1;
    const int32_t bias  = (qscale - 1) | 1;
    int32_t last_coeff;

    av_assert2(s->block_last_index[index] >= 0);

    last_coeff = s->inter_scantable.raster_end[s->block_last_index[index]];

    h263_dct_unquantize_msa(block, scale, bias, last_coeff, 0);
}
237 
/*
 * Inverse-quantize an MPEG-2 inter block in place.
 *
 * All 64 coefficients are dequantized against s->inter_matrix, then the
 * least significant bit of block[63] is toggled when the value returned by
 * the kernel (-1 plus the sum of the dequantized coefficients) is odd.
 * NOTE(review): this looks like MPEG-2 mismatch control; confirm against
 * the scalar reference implementation.
 *
 * s       codec context; only inter_matrix is read here
 * block   64 coefficients, modified in place
 * index   block number within the macroblock (unused here)
 * qscale  quantizer scale
 */
void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
                                       int16_t *block, int32_t index,
                                       int32_t qscale)
{
    /*
     * The kernel takes a const int16_t * and reads the matrix as signed
     * 16-bit lanes; the explicit cast makes that pointer-signedness
     * conversion deliberate instead of implicit.
     */
    const int16_t *quant_matrix = (const int16_t *) s->inter_matrix;
    int32_t sum = mpeg2_dct_unquantize_inter_msa(block, qscale, quant_matrix);

    block[63] ^= sum & 1;
}
251