1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // MIPS version of dsp functions
11 //
12 // Author(s):  Djordje Pesut    (djordje.pesut@imgtec.com)
13 //             Jovan Zelincevic (jovan.zelincevic@imgtec.com)
14 
15 #include "src/dsp/dsp.h"
16 
17 #if defined(WEBP_USE_MIPS32)
18 
19 #include "src/dsp/mips_macro.h"
20 
// Multipliers used by TransformOne(). Every product with kC1 or kC2 in that
// routine is followed by an arithmetic shift right by 16, so they behave as
// 16.16 fixed-point factors: kC1 = 85627 (~1.30656), kC2 = 35468 (~0.54120).
// NOTE(review): these look like sqrt(2)*cos(pi/8) and sqrt(2)*sin(pi/8) --
// confirm against the VP8 specification.
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
23 
// Branch-free absolute value of 'x'.
// Relies on 'int' being 32 bits wide and on '>>' of a negative value being an
// arithmetic shift, so that 'mask' is 0 for x >= 0 and -1 (all ones) for x < 0.
// Adding -1 and then flipping every bit negates a negative input; a
// non-negative input passes through unchanged.
static WEBP_INLINE int abs_mips32(int x) {
  const int mask = x >> 31;
  return (x + mask) ^ mask;
}
28 
29 // 4 pixels in, 2 pixels out
do_filter2(uint8_t * p,int step)30 static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
31   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
32   const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
33   const int a1 = VP8ksclip2[(a + 4) >> 3];
34   const int a2 = VP8ksclip2[(a + 3) >> 3];
35   p[-step] = VP8kclip1[p0 + a2];
36   p[    0] = VP8kclip1[q0 - a1];
37 }
38 
39 // 4 pixels in, 4 pixels out
do_filter4(uint8_t * p,int step)40 static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
41   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
42   const int a = 3 * (q0 - p0);
43   const int a1 = VP8ksclip2[(a + 4) >> 3];
44   const int a2 = VP8ksclip2[(a + 3) >> 3];
45   const int a3 = (a1 + 1) >> 1;
46   p[-2 * step] = VP8kclip1[p1 + a3];
47   p[-    step] = VP8kclip1[p0 + a2];
48   p[        0] = VP8kclip1[q0 - a1];
49   p[     step] = VP8kclip1[q1 - a3];
50 }
51 
52 // 6 pixels in, 6 pixels out
do_filter6(uint8_t * p,int step)53 static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
54   const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
55   const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
56   const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
57   // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
58   const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
59   const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
60   const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
61   p[-3 * step] = VP8kclip1[p2 + a3];
62   p[-2 * step] = VP8kclip1[p1 + a2];
63   p[-    step] = VP8kclip1[p0 + a1];
64   p[        0] = VP8kclip1[q0 - a1];
65   p[     step] = VP8kclip1[q1 - a2];
66   p[ 2 * step] = VP8kclip1[q2 - a3];
67 }
68 
hev(const uint8_t * p,int step,int thresh)69 static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
70   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
71   return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
72 }
73 
needs_filter(const uint8_t * p,int step,int t)74 static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
75   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
76   return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
77 }
78 
needs_filter2(const uint8_t * p,int step,int t,int it)79 static WEBP_INLINE int needs_filter2(const uint8_t* p,
80                                      int step, int t, int it) {
81   const int p3 = p[-4 * step], p2 = p[-3 * step];
82   const int p1 = p[-2 * step], p0 = p[-step];
83   const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
84   if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
85     return 0;
86   }
87   return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
88          abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
89          abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
90 }
91 
FilterLoop26(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)92 static WEBP_INLINE void FilterLoop26(uint8_t* p,
93                                      int hstride, int vstride, int size,
94                                      int thresh, int ithresh, int hev_thresh) {
95   const int thresh2 = 2 * thresh + 1;
96   while (size-- > 0) {
97     if (needs_filter2(p, hstride, thresh2, ithresh)) {
98       if (hev(p, hstride, hev_thresh)) {
99         do_filter2(p, hstride);
100       } else {
101         do_filter6(p, hstride);
102       }
103     }
104     p += vstride;
105   }
106 }
107 
FilterLoop24(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)108 static WEBP_INLINE void FilterLoop24(uint8_t* p,
109                                      int hstride, int vstride, int size,
110                                      int thresh, int ithresh, int hev_thresh) {
111   const int thresh2 = 2 * thresh + 1;
112   while (size-- > 0) {
113     if (needs_filter2(p, hstride, thresh2, ithresh)) {
114       if (hev(p, hstride, hev_thresh)) {
115         do_filter2(p, hstride);
116       } else {
117         do_filter4(p, hstride);
118       }
119     }
120     p += vstride;
121   }
122 }
123 
124 // on macroblock edges
// Filters 16 consecutive positions starting at 'p' with the 2/6-tap loop.
// Samples across the edge are 'stride' bytes apart; successive positions are
// one byte apart.
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int across = stride;
  const int along = 1;
  FilterLoop26(p, across, along, 16, thresh, ithresh, hev_thresh);
}
129 
// Filters 16 positions starting at 'p' with the 2/6-tap loop. Samples across
// the edge are one byte apart; successive positions are 'stride' bytes apart.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int across = 1;
  const int along = stride;
  FilterLoop26(p, across, along, 16, thresh, ithresh, hev_thresh);
}
134 
135 // 8-pixels wide variant, for chroma filtering
// Runs the 2/6-tap loop over 8 consecutive positions of the 'u' plane and
// then of the 'v' plane. Samples across the edge are 'stride' bytes apart;
// successive positions are one byte apart.
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  const int across = stride;
  const int along = 1;
  FilterLoop26(u, across, along, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, across, along, 8, thresh, ithresh, hev_thresh);
}
141 
// Runs the 2/6-tap loop over 8 positions of the 'u' plane and then of the 'v'
// plane. Samples across the edge are one byte apart; successive positions are
// 'stride' bytes apart.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  const int across = 1;
  const int along = stride;
  FilterLoop26(u, across, along, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, across, along, 8, thresh, ithresh, hev_thresh);
}
147 
// Runs the 2/4-tap loop over 8 consecutive positions of the edge located
// four rows (4 * stride bytes) below the start of each of the 'u' and 'v'
// planes. Samples across the edge are 'stride' bytes apart.
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8_t* const u_edge = u + 4 * stride;
  uint8_t* const v_edge = v + 4 * stride;
  FilterLoop24(u_edge, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v_edge, stride, 1, 8, thresh, ithresh, hev_thresh);
}
153 
// Runs the 2/4-tap loop over 8 positions of the edge located four bytes to
// the right of the start of each of the 'u' and 'v' planes. Samples across
// the edge are one byte apart; successive positions are 'stride' bytes apart.
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8_t* const u_edge = u + 4;
  uint8_t* const v_edge = v + 4;
  FilterLoop24(u_edge, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v_edge, 1, stride, 8, thresh, ithresh, hev_thresh);
}
159 
160 // on three inner edges
// Applies the 2/4-tap loop to the three edges located 4, 8 and 12 rows below
// 'p', in that order, 16 positions each.
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  const int four_rows = 4 * stride;
  uint8_t* edge = p;
  int n;
  for (n = 0; n < 3; ++n) {
    edge += four_rows;
    FilterLoop24(edge, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
}
169 
// Applies the 2/4-tap loop to the three edges located 4, 8 and 12 bytes to
// the right of 'p', in that order, 16 positions each.
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint8_t* edge = p;
  int n;
  for (n = 0; n < 3; ++n) {
    edge += 4;
    FilterLoop24(edge, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
}
178 
179 //------------------------------------------------------------------------------
180 // Simple In-loop filtering (Paragraph 15.2)
181 
// For each of the 16 consecutive bytes starting at 'p', applies do_filter2()
// across the edge (samples 'stride' bytes apart) when needs_filter() accepts
// the position with limit 2 * thresh + 1.
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  const int edge_limit = 2 * thresh + 1;
  int col = 0;
  while (col < 16) {
    uint8_t* const px = p + col;
    if (needs_filter(px, stride, edge_limit)) {
      do_filter2(px, stride);
    }
    ++col;
  }
}
191 
// For each of 16 positions spaced 'stride' bytes apart starting at 'p',
// applies do_filter2() across the edge (samples one byte apart) when
// needs_filter() accepts the position with limit 2 * thresh + 1.
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  const int edge_limit = 2 * thresh + 1;
  int row = 0;
  while (row < 16) {
    uint8_t* const px = p + row * stride;
    if (needs_filter(px, 1, edge_limit)) {
      do_filter2(px, 1);
    }
    ++row;
  }
}
201 
// Runs SimpleVFilter16() on the three edges located 4, 8 and 12 rows below
// 'p', in that order.
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  const int four_rows = 4 * stride;
  uint8_t* edge = p;
  int n;
  for (n = 0; n < 3; ++n) {
    edge += four_rows;
    SimpleVFilter16(edge, stride, thresh);
  }
}
209 
// Runs SimpleHFilter16() on the three edges located 4, 8 and 12 bytes to the
// right of 'p', in that order.
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  uint8_t* edge = p;
  int n;
  for (n = 0; n < 3; ++n) {
    edge += 4;
    SimpleHFilter16(edge, stride, thresh);
  }
}
217 
// Transforms the 16 int16 coefficients at 'in' and adds the result to the
// 4x4 block of bytes at 'dst' (rows are BPS bytes apart), clamping every sum
// to [0, 255].
//
// The computation is two passes of the same 4-point butterfly. For inputs
// x0..x3:
//   a = x0 + x2                       b = x0 - x2
//   c = ((x1 * kC2) >> 16) - ((x3 * kC1) >> 16)
//   d = ((x1 * kC1) >> 16) + ((x3 * kC2) >> 16)
//   outputs: a + d, b + c, b - c, a - d
// Pass 1 runs on in[i], in[i + 4], in[i + 8], in[i + 12] for i = 0..3.
// Pass 2 combines, for each dst row r, output number r of the four pass-1
// groups; the four results are shifted right by 3 and added to dst[r][0..3].
// A bias of 4 is added to the pass-1 outputs of group 0 so that the final
// '>> 3' rounds.
//
// Everything is kept in the 19 scratch registers temp0..temp18; no temporary
// buffer is used. 'in' is only read; the const is cast away solely to build
// the asm input operand.
static void TransformOne(const int16_t* in, uint8_t* dst) {
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  int temp10, temp11, temp12, temp13, temp14;
  int temp15, temp16, temp17, temp18;
  int16_t* p_in = (int16_t*)in;

  // loops unrolled and merged to avoid usage of tmp buffer
  // and to reduce number of stalls. MUL macro is written
  // in assembler and inlined
  __asm__ volatile(
    // Pass 1. Group 0 uses in[0], in[4], in[8], in[12]; the loads for groups
    // 1..3 are interleaved with the arithmetic of the preceding group.
    "lh       %[temp0],  0(%[in])                      \n\t"
    "lh       %[temp8],  16(%[in])                     \n\t"
    "lh       %[temp4],  8(%[in])                      \n\t"
    "lh       %[temp12], 24(%[in])                     \n\t"
    "addu     %[temp16], %[temp0],  %[temp8]           \n\t"
    "subu     %[temp0],  %[temp0],  %[temp8]           \n\t"
    "mul      %[temp8],  %[temp4],  %[kC2]             \n\t"
    "mul      %[temp17], %[temp12], %[kC1]             \n\t"
    "mul      %[temp4],  %[temp4],  %[kC1]             \n\t"
    "mul      %[temp12], %[temp12], %[kC2]             \n\t"
    "lh       %[temp1],  2(%[in])                      \n\t"
    "lh       %[temp5],  10(%[in])                     \n\t"
    "lh       %[temp9],  18(%[in])                     \n\t"
    "lh       %[temp13], 26(%[in])                     \n\t"
    "sra      %[temp8],  %[temp8],  16                 \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "sra      %[temp4],  %[temp4],  16                 \n\t"
    "sra      %[temp12], %[temp12], 16                 \n\t"
    "lh       %[temp2],  4(%[in])                      \n\t"
    "lh       %[temp6],  12(%[in])                     \n\t"
    "lh       %[temp10], 20(%[in])                     \n\t"
    "lh       %[temp14], 28(%[in])                     \n\t"
    "subu     %[temp17], %[temp8],  %[temp17]          \n\t"
    "addu     %[temp4],  %[temp4],  %[temp12]          \n\t"
    "addu     %[temp8],  %[temp16], %[temp4]           \n\t"
    "subu     %[temp4],  %[temp16], %[temp4]           \n\t"
    "addu     %[temp16], %[temp1],  %[temp9]           \n\t"
    "subu     %[temp1],  %[temp1],  %[temp9]           \n\t"
    "lh       %[temp3],  6(%[in])                      \n\t"
    "lh       %[temp7],  14(%[in])                     \n\t"
    "lh       %[temp11], 22(%[in])                     \n\t"
    "lh       %[temp15], 30(%[in])                     \n\t"
    "addu     %[temp12], %[temp0],  %[temp17]          \n\t"
    "subu     %[temp0],  %[temp0],  %[temp17]          \n\t"
    // Group 0 is complete: temp8 = a+d, temp12 = b+c, temp0 = b-c,
    // temp4 = a-d. Group 1 (in[1], in[5], in[9], in[13]) follows.
    "mul      %[temp9],  %[temp5],  %[kC2]             \n\t"
    "mul      %[temp17], %[temp13], %[kC1]             \n\t"
    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
    "mul      %[temp13], %[temp13], %[kC2]             \n\t"
    "sra      %[temp9],  %[temp9],  16                 \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "subu     %[temp17], %[temp9],  %[temp17]          \n\t"
    "sra      %[temp5],  %[temp5],  16                 \n\t"
    "sra      %[temp13], %[temp13], 16                 \n\t"
    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
    "addu     %[temp13], %[temp1],  %[temp17]          \n\t"
    "subu     %[temp1],  %[temp1],  %[temp17]          \n\t"
    "mul      %[temp17], %[temp14], %[kC1]             \n\t"
    "mul      %[temp14], %[temp14], %[kC2]             \n\t"
    "addu     %[temp9],  %[temp16], %[temp5]           \n\t"
    "subu     %[temp5],  %[temp16], %[temp5]           \n\t"
    // Group 1 is complete: temp9 = a+d, temp13 = b+c, temp1 = b-c,
    // temp5 = a-d. Group 2 (in[2], in[6], in[10], in[14]) follows.
    "addu     %[temp16], %[temp2],  %[temp10]          \n\t"
    "subu     %[temp2],  %[temp2],  %[temp10]          \n\t"
    "mul      %[temp10], %[temp6],  %[kC2]             \n\t"
    "mul      %[temp6],  %[temp6],  %[kC1]             \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "sra      %[temp14], %[temp14], 16                 \n\t"
    "sra      %[temp10], %[temp10], 16                 \n\t"
    "sra      %[temp6],  %[temp6],  16                 \n\t"
    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
    "addu     %[temp6],  %[temp6],  %[temp14]          \n\t"
    "addu     %[temp10], %[temp16], %[temp6]           \n\t"
    "subu     %[temp6],  %[temp16], %[temp6]           \n\t"
    "addu     %[temp14], %[temp2],  %[temp17]          \n\t"
    "subu     %[temp2],  %[temp2],  %[temp17]          \n\t"
    // Group 2 is complete: temp10 = a+d, temp14 = b+c, temp2 = b-c,
    // temp6 = a-d. Group 3 (in[3], in[7], in[11], in[15]) follows.
    "mul      %[temp17], %[temp15], %[kC1]             \n\t"
    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
    "addu     %[temp16], %[temp3],  %[temp11]          \n\t"
    "subu     %[temp3],  %[temp3],  %[temp11]          \n\t"
    "mul      %[temp11], %[temp7],  %[kC2]             \n\t"
    "mul      %[temp7],  %[temp7],  %[kC1]             \n\t"
    // Rounding bias (+4) on the four group-0 outputs; each of them is the x0
    // input of one pass-2 row, so the bias reaches every final '>> 3'.
    "addiu    %[temp8],  %[temp8],  4                  \n\t"
    "addiu    %[temp12], %[temp12], 4                  \n\t"
    "addiu    %[temp0],  %[temp0],  4                  \n\t"
    "addiu    %[temp4],  %[temp4],  4                  \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "sra      %[temp15], %[temp15], 16                 \n\t"
    "sra      %[temp11], %[temp11], 16                 \n\t"
    "sra      %[temp7],  %[temp7],  16                 \n\t"
    "subu     %[temp17], %[temp11], %[temp17]          \n\t"
    "addu     %[temp7],  %[temp7],  %[temp15]          \n\t"
    "addu     %[temp15], %[temp3],  %[temp17]          \n\t"
    "subu     %[temp3],  %[temp3],  %[temp17]          \n\t"
    "addu     %[temp11], %[temp16], %[temp7]           \n\t"
    "subu     %[temp7],  %[temp16], %[temp7]           \n\t"
    // Pass 2. Row 0 inputs: temp8, temp9, temp10, temp11.
    "addu     %[temp16], %[temp8],  %[temp10]          \n\t"
    "subu     %[temp8],  %[temp8],  %[temp10]          \n\t"
    "mul      %[temp10], %[temp9],  %[kC2]             \n\t"
    "mul      %[temp17], %[temp11], %[kC1]             \n\t"
    "mul      %[temp9],  %[temp9],  %[kC1]             \n\t"
    "mul      %[temp11], %[temp11], %[kC2]             \n\t"
    "sra      %[temp10], %[temp10], 16                 \n\t"
    "sra      %[temp17], %[temp17], 16                 \n\t"
    "sra      %[temp9],  %[temp9],  16                 \n\t"
    "sra      %[temp11], %[temp11], 16                 \n\t"
    "subu     %[temp17], %[temp10], %[temp17]          \n\t"
    "addu     %[temp11], %[temp9],  %[temp11]          \n\t"
    // Row 1 inputs: temp12, temp13, temp14, temp15.
    "addu     %[temp10], %[temp12], %[temp14]          \n\t"
    "subu     %[temp12], %[temp12], %[temp14]          \n\t"
    "mul      %[temp14], %[temp13], %[kC2]             \n\t"
    "mul      %[temp9],  %[temp15], %[kC1]             \n\t"
    "mul      %[temp13], %[temp13], %[kC1]             \n\t"
    "mul      %[temp15], %[temp15], %[kC2]             \n\t"
    "sra      %[temp14], %[temp14], 16                 \n\t"
    "sra      %[temp9],  %[temp9],  16                 \n\t"
    "sra      %[temp13], %[temp13], 16                 \n\t"
    "sra      %[temp15], %[temp15], 16                 \n\t"
    "subu     %[temp9],  %[temp14], %[temp9]           \n\t"
    "addu     %[temp15], %[temp13], %[temp15]          \n\t"
    // Row 2 inputs: temp0, temp1, temp2, temp3.
    "addu     %[temp14], %[temp0],  %[temp2]           \n\t"
    "subu     %[temp0],  %[temp0],  %[temp2]           \n\t"
    "mul      %[temp2],  %[temp1],  %[kC2]             \n\t"
    "mul      %[temp13], %[temp3],  %[kC1]             \n\t"
    "mul      %[temp1],  %[temp1],  %[kC1]             \n\t"
    "mul      %[temp3],  %[temp3],  %[kC2]             \n\t"
    "sra      %[temp2],  %[temp2],  16                 \n\t"
    "sra      %[temp13], %[temp13], 16                 \n\t"
    "sra      %[temp1],  %[temp1],  16                 \n\t"
    "sra      %[temp3],  %[temp3],  16                 \n\t"
    "subu     %[temp13], %[temp2],  %[temp13]          \n\t"
    "addu     %[temp3],  %[temp1],  %[temp3]           \n\t"
    // Row 3 inputs: temp4, temp5, temp6, temp7.
    "addu     %[temp2],  %[temp4],  %[temp6]           \n\t"
    "subu     %[temp4],  %[temp4],  %[temp6]           \n\t"
    "mul      %[temp6],  %[temp5],  %[kC2]             \n\t"
    "mul      %[temp1],  %[temp7],  %[kC1]             \n\t"
    "mul      %[temp5],  %[temp5],  %[kC1]             \n\t"
    "mul      %[temp7],  %[temp7],  %[kC2]             \n\t"
    "sra      %[temp6],  %[temp6],  16                 \n\t"
    "sra      %[temp1],  %[temp1],  16                 \n\t"
    "sra      %[temp5],  %[temp5],  16                 \n\t"
    "sra      %[temp7],  %[temp7],  16                 \n\t"
    "subu     %[temp1],  %[temp6],  %[temp1]           \n\t"
    "addu     %[temp7],  %[temp5],  %[temp7]           \n\t"
    // Final butterflies and '>> 3'. Row 0 deltas: temp5, temp11, temp8,
    // temp16 (for dst columns 0..3).
    "addu     %[temp5],  %[temp16], %[temp11]          \n\t"
    "subu     %[temp16], %[temp16], %[temp11]          \n\t"
    "addu     %[temp11], %[temp8],  %[temp17]          \n\t"
    "subu     %[temp8],  %[temp8],  %[temp17]          \n\t"
    "sra      %[temp5],  %[temp5],  3                  \n\t"
    "sra      %[temp16], %[temp16], 3                  \n\t"
    "sra      %[temp11], %[temp11], 3                  \n\t"
    "sra      %[temp8],  %[temp8],  3                  \n\t"
    // Row 1 deltas: temp17, temp15, temp12, temp10.
    "addu     %[temp17], %[temp10], %[temp15]          \n\t"
    "subu     %[temp10], %[temp10], %[temp15]          \n\t"
    "addu     %[temp15], %[temp12], %[temp9]           \n\t"
    "subu     %[temp12], %[temp12], %[temp9]           \n\t"
    "sra      %[temp17], %[temp17], 3                  \n\t"
    "sra      %[temp10], %[temp10], 3                  \n\t"
    "sra      %[temp15], %[temp15], 3                  \n\t"
    "sra      %[temp12], %[temp12], 3                  \n\t"
    // Row 2 deltas: temp9, temp3, temp0, temp14.
    "addu     %[temp9],  %[temp14], %[temp3]           \n\t"
    "subu     %[temp14], %[temp14], %[temp3]           \n\t"
    "addu     %[temp3],  %[temp0],  %[temp13]          \n\t"
    "subu     %[temp0],  %[temp0],  %[temp13]          \n\t"
    "sra      %[temp9],  %[temp9],  3                  \n\t"
    "sra      %[temp14], %[temp14], 3                  \n\t"
    "sra      %[temp3],  %[temp3],  3                  \n\t"
    "sra      %[temp0],  %[temp0],  3                  \n\t"
    // Row 3 deltas: temp13, temp7, temp4, temp2.
    "addu     %[temp13], %[temp2],  %[temp7]           \n\t"
    "subu     %[temp2],  %[temp2],  %[temp7]           \n\t"
    "addu     %[temp7],  %[temp4],  %[temp1]           \n\t"
    "subu     %[temp4],  %[temp4],  %[temp1]           \n\t"
    "sra      %[temp13], %[temp13], 3                  \n\t"
    "sra      %[temp2],  %[temp2],  3                  \n\t"
    "sra      %[temp7],  %[temp7],  3                  \n\t"
    "sra      %[temp4],  %[temp4],  3                  \n\t"
    // Add to dst and clamp. temp6 holds 255, the upper clamp value.
    // Clamp idiom used for every pixel: 'sum >> 8' is zero exactly when the
    // sum is already in [0, 255] (branch over the fix-up); otherwise the value
    // is zeroed and movz replaces it with 255 when 'sum >> 31' is zero, i.e.
    // when the sum was positive.
    // dst row 0.
    "addiu    %[temp6],  $zero,     255                \n\t"
    "lbu      %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
    "sra      %[temp5],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
    "beqz     %[temp5],  1f                            \n\t"
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "1:                                                  \n\t"
    "lbu      %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp18], %[temp18], %[temp11]          \n\t"
    "sra      %[temp11], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
    "beqz     %[temp11], 2f                            \n\t"
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "2:                                                  \n\t"
    "lbu      %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
    "sra      %[temp8],  %[temp1],  8                  \n\t"
    "sra      %[temp18], %[temp1],  31                 \n\t"
    "beqz     %[temp8],  3f                            \n\t"
    "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
    "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
  "3:                                                  \n\t"
    "lbu      %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp18], %[temp18], %[temp16]          \n\t"
    "sra      %[temp16], %[temp18], 8                  \n\t"
    "sra      %[temp1],  %[temp18], 31                 \n\t"
    "beqz     %[temp16], 4f                            \n\t"
    "xor      %[temp18], %[temp18], %[temp18]          \n\t"
    "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
  "4:                                                  \n\t"
    "sb       %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
    // dst row 1.
    "lbu      %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
    "addu     %[temp11], %[temp11], %[temp12]          \n\t"
    "addu     %[temp16], %[temp16], %[temp10]          \n\t"
    "sra      %[temp18], %[temp5],  8                  \n\t"
    "sra      %[temp1],  %[temp5],  31                 \n\t"
    "beqz     %[temp18], 5f                            \n\t"
    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
  "5:                                                  \n\t"
    "sra      %[temp18], %[temp8],  8                  \n\t"
    "sra      %[temp1],  %[temp8],  31                 \n\t"
    "beqz     %[temp18], 6f                            \n\t"
    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
    "movz     %[temp8],  %[temp6],  %[temp1]           \n\t"
  "6:                                                  \n\t"
    "sra      %[temp18], %[temp11], 8                  \n\t"
    "sra      %[temp1],  %[temp11], 31                 \n\t"
    "sra      %[temp17], %[temp16], 8                  \n\t"
    "sra      %[temp15], %[temp16], 31                 \n\t"
    "beqz     %[temp18], 7f                            \n\t"
    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
    "movz     %[temp11], %[temp6],  %[temp1]           \n\t"
  "7:                                                  \n\t"
    "beqz     %[temp17], 8f                            \n\t"
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
  "8:                                                  \n\t"
    "sb       %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
    // dst row 2.
    "lbu      %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
    "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
    "addu     %[temp11], %[temp11], %[temp0]           \n\t"
    "addu     %[temp16], %[temp16], %[temp14]          \n\t"
    "sra      %[temp18], %[temp5],  8                  \n\t"
    "sra      %[temp1],  %[temp5],  31                 \n\t"
    "sra      %[temp17], %[temp8],  8                  \n\t"
    "sra      %[temp15], %[temp8],  31                 \n\t"
    "sra      %[temp12], %[temp11], 8                  \n\t"
    "sra      %[temp10], %[temp11], 31                 \n\t"
    "sra      %[temp9],  %[temp16], 8                  \n\t"
    "sra      %[temp3],  %[temp16], 31                 \n\t"
    "beqz     %[temp18], 9f                            \n\t"
    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
  "9:                                                  \n\t"
    "beqz     %[temp17], 10f                           \n\t"
    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
  "10:                                                 \n\t"
    "beqz     %[temp12], 11f                           \n\t"
    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
  "11:                                                 \n\t"
    "beqz     %[temp9],  12f                           \n\t"
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "12:                                                 \n\t"
    "sb       %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
    // dst row 3.
    "lbu      %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
    "lbu      %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
    "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
    "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
    "addu     %[temp11], %[temp11], %[temp4]           \n\t"
    "addu     %[temp16], %[temp16], %[temp2]           \n\t"
    "sra      %[temp18], %[temp5],  8                  \n\t"
    "sra      %[temp1],  %[temp5],  31                 \n\t"
    "sra      %[temp17], %[temp8],  8                  \n\t"
    "sra      %[temp15], %[temp8],  31                 \n\t"
    "sra      %[temp12], %[temp11], 8                  \n\t"
    "sra      %[temp10], %[temp11], 31                 \n\t"
    "sra      %[temp9],  %[temp16], 8                  \n\t"
    "sra      %[temp3],  %[temp16], 31                 \n\t"
    "beqz     %[temp18], 13f                           \n\t"
    "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
    "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
  "13:                                                 \n\t"
    "beqz     %[temp17], 14f                           \n\t"
    "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
    "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
  "14:                                                 \n\t"
    "beqz     %[temp12], 15f                           \n\t"
    "xor      %[temp11], %[temp11], %[temp11]          \n\t"
    "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
  "15:                                                 \n\t"
    "beqz     %[temp9],  16f                           \n\t"
    "xor      %[temp16], %[temp16], %[temp16]          \n\t"
    "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
  "16:                                                 \n\t"
    "sb       %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
    "sb       %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"

    // All temporaries are early-clobber outputs ("=&r") so none of them can
    // share a register with the inputs, which are read throughout the block.
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
      [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
      [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
      [temp18]"=&r"(temp18)
    : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
    // "memory": dst is written through the pointer; "hi"/"lo" are declared
    // clobbered for the multiplications.
    : "memory", "hi", "lo"
  );
}
550 
// Applies TransformOne() to the block at (in, dst) and, when 'do_two' is
// non-zero, also to the next block: 16 coefficients further in 'in' and
// 4 bytes further in 'dst'.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    TransformOne(in + 16 * n, dst + 4 * n);
  }
}
557 
558 //------------------------------------------------------------------------------
559 // Entry point
560 
561 extern void VP8DspInitMIPS32(void);
562 
VP8DspInitMIPS32(void)563 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
564   VP8InitClipTables();
565 
566   VP8Transform = TransformTwo;
567 
568   VP8VFilter16 = VFilter16;
569   VP8HFilter16 = HFilter16;
570   VP8VFilter8 = VFilter8;
571   VP8HFilter8 = HFilter8;
572   VP8VFilter16i = VFilter16i;
573   VP8HFilter16i = HFilter16i;
574   VP8VFilter8i = VFilter8i;
575   VP8HFilter8i = HFilter8i;
576 
577   VP8SimpleVFilter16 = SimpleVFilter16;
578   VP8SimpleHFilter16 = SimpleHFilter16;
579   VP8SimpleVFilter16i = SimpleVFilter16i;
580   VP8SimpleHFilter16i = SimpleHFilter16i;
581 }
582 
583 #else  // !WEBP_USE_MIPS32
584 
// Built without WEBP_USE_MIPS32: none of the routines above are compiled.
// NOTE(review): WEBP_DSP_INIT_STUB is defined outside this file; presumably it
// emits a do-nothing definition of VP8DspInitMIPS32 so the symbol still
// links -- confirm in src/dsp/dsp.h.
WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
586 
587 #endif  // WEBP_USE_MIPS32
588