1/*
2 * MIPS DSPr2 optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
5 * All Rights Reserved.
6 * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
7 *           Darko Laus       (darko.laus@imgtec.com)
8 * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
9 * This software is provided 'as-is', without any express or implied
10 * warranty.  In no event will the authors be held liable for any damages
11 * arising from the use of this software.
12 *
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute it
15 * freely, subject to the following restrictions:
16 *
17 * 1. The origin of this software must not be misrepresented; you must not
18 *    claim that you wrote the original software. If you use this software
19 *    in a product, an acknowledgment in the product documentation would be
20 *    appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must not be
22 *    misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source distribution.
24 */
25
26#include "jsimd_mips_dspr2_asm.h"
27
28/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - input_buf
 * a2     - output_buf
 * a3     - output_row
 * 16(sp) - num_rows
 * 20(sp) - cinfo->num_components
 *
 * Null conversion for compression: copies every num_components-th input
 * byte of each row into the per-component output row, without modifying
 * the sample values.
 *
 * Register roles:
 *   t9 = rows left            s0 = num_components (input byte stride)
 *   t1 = component index ci   t0 = image_width & 3 (residual columns)
 *   t2 = inptr                t5 = outptr
 *   t6 = outptr at which the residual columns end
 *   s1 = outptr at which the row ends
 *
 * Two copies of the row loop exist: labels 0-3 handle widths that are not a
 * multiple of 4 (byte-wise residual first, then 4 columns per iteration);
 * labels 4-6 handle widths that are a multiple of 4.
 */

    SAVE_REGS_ON_STACK 8, s0, s1

    lw        t9, 24(sp)   // t9 = num_rows (entry offset 16 + 8 saved bytes)
    lw        s0, 28(sp)   // s0 = cinfo->num_components
    andi      t0, a0, 3    // t0 = cinfo->image_width & 3
    beqz      t0, 4f       // no residual
     nop

    /* ---- image_width is not a multiple of 4 ---- */
0:
    addiu     t9, t9, -1
    bltz      t9, 7f       // all rows done
     li       t1, 0        // ci = 0
1:
    sll       t3, t1, 2
    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
    lw        t2, 0(a1)    // t2 = inptr = *input_buf
    sll       t4, a3, 2
    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
    addu      t2, t2, t1   // inptr += ci
    addu      s1, t5, a0   // s1 = end of output row
    addu      t6, t5, t0   // t6 = end of residual columns
2:
    lbu       t3, 0(t2)
    addiu     t5, t5, 1
    sb        t3, -1(t5)
    bne       t6, t5, 2b
     addu     t2, t2, s0   // inptr += num_components
    /*
     * If image_width < 4 the residual loop has already written the whole
     * row.  The 4-column loop below tests its end condition only after
     * storing, so it must not be entered in that case.
     */
    beq       s1, t5, 8f
     nop
3:
    lbu       t3, 0(t2)
    addu      t4, t2, s0
    addu      t7, t4, s0
    addu      t8, t7, s0
    addu      t2, t8, s0   // inptr += 4 * num_components
    lbu       t4, 0(t4)
    lbu       t7, 0(t7)
    lbu       t8, 0(t8)
    addiu     t5, t5, 4
    sb        t3, -4(t5)
    sb        t4, -3(t5)
    sb        t7, -2(t5)
    bne       s1, t5, 3b
     sb       t8, -1(t5)
8:
    addiu     t1, t1, 1    // ++ci
    bne       t1, s0, 1b
     nop
    addiu     a1, a1, 4    // ++input_buf
    b         0b           // t9 >= 0 here; label 0 performs the exit test
     addiu    a3, a3, 1    // ++output_row

    /* ---- image_width is a multiple of 4 ---- */
4:
    addiu     t9, t9, -1
    bltz      t9, 7f       // all rows done
     li       t1, 0        // ci = 0
5:
    sll       t3, t1, 2
    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
    lw        t2, 0(a1)    // t2 = inptr = *input_buf
    sll       t4, a3, 2
    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
    addu      t2, t2, t1   // inptr += ci
    addu      s1, t5, a0   // s1 = end of output row
    addu      t6, t5, t0   // t0 == 0 on this path; t6 is not used below
6:
    lbu       t3, 0(t2)
    addu      t4, t2, s0
    addu      t7, t4, s0
    addu      t8, t7, s0
    addu      t2, t8, s0   // inptr += 4 * num_components
    lbu       t4, 0(t4)
    lbu       t7, 0(t7)
    lbu       t8, 0(t8)
    addiu     t5, t5, 4
    sb        t3, -4(t5)
    sb        t4, -3(t5)
    sb        t7, -2(t5)
    bne       s1, t5, 6b
     sb       t8, -1(t5)
    addiu     t1, t1, 1    // ++ci
    bne       t1, s0, 5b
     nop
    addiu     a1, a1, 4    // ++input_buf
    b         4b           // t9 >= 0 here; label 4 performs the exit test
     addiu    a3, a3, 1    // ++output_row
7:
    RESTORE_REGS_FROM_STACK 8, s0, s1

    j         ra
     nop

END(jsimd_c_null_convert_mips_dspr2)
131
132/*****************************************************************************/
133/*
134 * jsimd_extrgb_ycc_convert_mips_dspr2
135 * jsimd_extbgr_ycc_convert_mips_dspr2
136 * jsimd_extrgbx_ycc_convert_mips_dspr2
137 * jsimd_extbgrx_ycc_convert_mips_dspr2
138 * jsimd_extxbgr_ycc_convert_mips_dspr2
139 * jsimd_extxrgb_ycc_convert_mips_dspr2
140 *
141 * Colorspace conversion RGB -> YCbCr
142 */
143
/*
 * Emits jsimd_<colorid>_ycc_convert_mips_dspr2 for one packed pixel layout.
 *   colorid    - name fragment used in the generated symbol
 *   pixel_size - bytes per input pixel
 *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside a pixel
 */
.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs

/* Load the R, G and B bytes of the pixel at inptr, then advance inptr by one
   pixel. */
.macro DO_RGB_TO_YCC r,    \
                     g,    \
                     b,    \
                     inptr
    lbu     \r, \r_offs(\inptr)
    lbu     \g, \g_offs(\inptr)
    lbu     \b, \b_offs(\inptr)
    addiu   \inptr, \pixel_size
.endm

LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - input_buf
 * a2     - output_buf
 * a3     - output_row
 * 16(sp) - num_rows
 *
 * Register roles:
 *   s0-s7 = fixed-point coefficients (16 fractional bits)
 *   t8    = rounding/offset term added to Cb and Cr
 *   t7    = rows left          t6 = inptr
 *   t0/t1/t2 = Y/Cb/Cr outptr  t9 = Cr outptr at end of row
 *   $ac0/$ac1/$ac2 = Y/Cb/Cr accumulators
 */

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw      t7, 48(sp)        // t7 = num_rows (entry offset 16 + 32 saved bytes)
    blez    t7, 2f            // num_rows <= 0: nothing to convert (the row
     nop                      // loop below tests its condition at the bottom)
    li      s0, 0x4c8b        // FIX(0.29900)
    li      s1, 0x9646        // FIX(0.58700)
    li      s2, 0x1d2f        // FIX(0.11400)
    li      s3, 0xffffd4cd    // -FIX(0.16874)
    li      s4, 0xffffab33    // -FIX(0.33126)
    li      s5, 0x8000        // FIX(0.50000)
    li      s6, 0xffff94d1    // -FIX(0.41869)
    li      s7, 0xffffeb2f    // -FIX(0.08131)
    li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1

0:
    addiu   t7, -1            // --num_rows
    lw      t6, 0(a1)         // t6 = input_buf[0]
    lw      t0, 0(a2)
    lw      t1, 4(a2)
    lw      t2, 8(a2)
    sll     t3, a3, 2
    lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
    lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
    lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]

    addu    t9, t2, a0        // t9 = end address
    addiu   a3, 1             // ++output_row

1:
    DO_RGB_TO_YCC t3, t4, t5, t6

    /*
     * mtlo sets only the low word of each accumulator.  That is sufficient:
     * each result is stored with sb, i.e. only bits 16..23 of the sum are
     * kept, and those bits come from the low word.
     */
    mtlo    s5, $ac0          // Y  = ONE_HALF
    mtlo    t8, $ac1          // Cb = CBCR_OFFSET + ONE_HALF-1
    mtlo    t8, $ac2          // Cr = CBCR_OFFSET + ONE_HALF-1
    maddu   $ac0, s2, t5      // Y  += FIX(0.11400) * b
    maddu   $ac1, s5, t5      // Cb += FIX(0.50000) * b
    maddu   $ac2, s5, t3      // Cr += FIX(0.50000) * r
    maddu   $ac0, s0, t3      // Y  += FIX(0.29900) * r
    maddu   $ac1, s3, t3      // Cb -= FIX(0.16874) * r
    maddu   $ac2, s6, t4      // Cr -= FIX(0.41869) * g
    maddu   $ac0, s1, t4      // Y  += FIX(0.58700) * g
    maddu   $ac1, s4, t4      // Cb -= FIX(0.33126) * g
    maddu   $ac2, s7, t5      // Cr -= FIX(0.08131) * b
    extr.w  t3, $ac0, 16      // drop the 16 fractional bits
    extr.w  t4, $ac1, 16
    extr.w  t5, $ac2, 16
    sb      t3, 0(t0)
    sb      t4, 0(t1)
    sb      t5, 0(t2)
    addiu   t0, 1
    addiu   t2, 1
    bne     t2, t9, 1b
     addiu  t1, 1
    bgtz    t7, 0b
     addiu  a1, 4             // ++input_buf

2:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j ra
     nop
END(jsimd_\colorid\()_ycc_convert_mips_dspr2)

.purgem DO_RGB_TO_YCC

.endm

/*------------------------------------------id -- pix R  G  B */
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
237
238/*****************************************************************************/
239/*
240 * jsimd_ycc_extrgb_convert_mips_dspr2
241 * jsimd_ycc_extbgr_convert_mips_dspr2
242 * jsimd_ycc_extrgbx_convert_mips_dspr2
243 * jsimd_ycc_extbgrx_convert_mips_dspr2
244 * jsimd_ycc_extxbgr_convert_mips_dspr2
245 * jsimd_ycc_extxrgb_convert_mips_dspr2
246 *
247 * Colorspace conversion YCbCr -> RGB
248 */
249
/*
 * Emits jsimd_ycc_<colorid>_convert_mips_dspr2 for one packed pixel layout.
 *   colorid    - name fragment used in the generated symbol
 *   pixel_size - bytes per output pixel (3, or 4 when a filler byte exists)
 *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside a pixel
 *   a_offs     - byte offset of the filler byte; used only if pixel_size == 4
 */
.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs

/*
 * Store one pixel (scratch0 = R, scratch1 = G, scratch2 = B) at outptr and
 * advance outptr by one pixel.  For 4-byte pixels the filler byte is set to
 * 0xFF, which clobbers t0 after the three colour bytes have been stored.
 */
.macro STORE_YCC_TO_RGB  scratch0 \
                         scratch1 \
                         scratch2 \
                         outptr
    sb       \scratch0, \r_offs(\outptr)
    sb       \scratch1, \g_offs(\outptr)
    sb       \scratch2, \b_offs(\outptr)
.if (\pixel_size == 4)
    li       t0, 0xFF
    sb       t0, \a_offs(\outptr)
.endif
    addiu    \outptr, \pixel_size
.endm

LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - input_buf
 * a2     - input_row
 * a3     - output_buf
 * 16(sp) - num_rows
 *
 * Register roles:
 *   s1 = rows left              s0 = outptr
 *   s2/s3/s4 = Y/Cb/Cr inptr    t9 = Y inptr at end of row
 *   t3 = ONE_HALF, t4-t7 = fixed-point coefficients
 *   t8 = 128 replicated into both halfwords (used by the packed clamp)
 */

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw         s1, 48(sp)      // s1 = num_rows (entry offset 16 + 32 saved bytes)
    blez       s1, 2f          // num_rows <= 0: nothing to convert (the row
     nop                       // loop below tests its condition at the bottom)
    li         t3, 0x8000      // ONE_HALF
    li         t4, 0x166e9     // FIX(1.40200)
    li         t5, 0x1c5a2     // FIX(1.77200)
    li         t6, 0xffff492e  // -FIX(0.71414)
    li         t7, 0xffffa7e6  // -FIX(0.34414)
    repl.ph    t8, 128

0:
    lw         s0, 0(a3)       // s0 = outptr = output_buf[0]
    lw         t0, 0(a1)
    lw         t1, 4(a1)
    lw         t2, 8(a1)
    sll        s5, a2, 2
    addiu      s1, -1          // --num_rows
    lwx        s2, s5(t0)      // s2 = input_buf[0][input_row]
    lwx        s3, s5(t1)      // s3 = input_buf[1][input_row]
    lwx        s4, s5(t2)      // s4 = input_buf[2][input_row]
    addu       t9, s2, a0      // t9 = end address
    addiu      a2, 1           // ++input_row

1:
    lbu        s7, 0(s4)       // cr
    lbu        s6, 0(s3)       // cb
    lbu        s5, 0(s2)       // y
    addiu      s2, 1
    addiu      s4, 1
    addiu      s7, -128        // cr - 128
    addiu      s6, -128        // cb - 128
    mul        t2, t7, s6
    mul        t0, t6, s7      // Crgtab[cr]
    sll        s7, 15          // pre-scale so mulq_rs.w yields (x * FIX) >> 16
    mulq_rs.w  t1, t4, s7      // Crrtab[cr]
    sll        s6, 15
    addu       t2, t3          // Cbgtab[cb]
    addu       t2, t0

    mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
    sra        t2, 16
    addu       t1, s5          // r = y + Crrtab[cr]
    addu       t2, s5          // add y
    ins        t2, t1, 16, 16  // t2 = r | g packed as two halfwords
    subu.ph    t2, t2, t8      // bias both by -128 ...
    addu       t0, s5          // b = y + Cbbtab[cb]
    shll_s.ph  t2, t2, 8       // ... saturate into the signed-byte range ...
    subu       t0, 128
    shra.ph    t2, t2, 8
    shll_s.w   t0, t0, 24      // same saturating clamp for b, word-wide
    addu.ph    t2, t2, t8      // clip & store
    sra        t0, t0, 24
    sra        t1, t2, 16      // t1 = clamped r; low byte of t2 = clamped g
    addiu      t0, 128

    STORE_YCC_TO_RGB t1, t2, t0, s0

    bne        s2, t9, 1b
     addiu     s3, 1
    bgtz       s1, 0b
     addiu     a3, 4           // ++output_buf

2:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j ra
     nop
END(jsimd_ycc_\colorid\()_convert_mips_dspr2)

.purgem STORE_YCC_TO_RGB

.endm

/*------------------------------------------id -- pix R  G  B  A */
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
354
355/*****************************************************************************/
356/*
357 * jsimd_extrgb_gray_convert_mips_dspr2
358 * jsimd_extbgr_gray_convert_mips_dspr2
359 * jsimd_extrgbx_gray_convert_mips_dspr2
360 * jsimd_extbgrx_gray_convert_mips_dspr2
361 * jsimd_extxbgr_gray_convert_mips_dspr2
362 * jsimd_extxrgb_gray_convert_mips_dspr2
363 *
364 * Colorspace conversion RGB -> GRAY
365 */
366
/*
 * Emits jsimd_<colorid>_gray_convert_mips_dspr2 for one packed pixel layout.
 *   colorid    - name fragment used in the generated symbol
 *   pixel_size - bytes per input pixel
 *   r_offs, g_offs, b_offs - byte offsets of R, G and B inside a pixel
 */
.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs

/* Load the R, G and B bytes of the pixel at inptr, then advance inptr by one
   pixel. */
.macro DO_RGB_TO_GRAY r,    \
                      g,    \
                      b,    \
                      inptr
    lbu     \r, \r_offs(\inptr)
    lbu     \g, \g_offs(\inptr)
    lbu     \b, \b_offs(\inptr)
    addiu   \inptr, \pixel_size
.endm

LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - input_buf
 * a2     - output_buf
 * a3     - output_row
 * 16(sp) - num_rows
 *
 * Register roles:
 *   s0/s1/s2 = R/G/B luminance coefficients, s7 = ONE_HALF
 *   s6 = rows left             t7 = image_width & 3 (residual columns)
 *   t0 = inptr                 t1 = outptr
 *   t8 = outptr at which the 4-pixel loop stops
 *   t9 = outptr at end of row
 */

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    li      s0, 0x4c8b             // s0 = FIX(0.29900)
    li      s1, 0x9646             // s1 = FIX(0.58700)
    li      s2, 0x1d2f             // s2 = FIX(0.11400)
    li      s7, 0x8000             // s7 = FIX(0.50000)
    lw      s6, 48(sp)             // s6 = num_rows (entry offset 16 + 32 saved bytes)
    andi    t7, a0, 3
    blez    s6, 5f                 // num_rows <= 0: nothing to convert (the row
     nop                           // loop below tests its condition at the bottom)

0:
    addiu   s6, -1                 // s6 = num_rows
    lw      t0, 0(a1)              // t0 = inptr = input_buf[0]
    lw      t1, 0(a2)
    sll     t3, a3, 2
    lwx     t1, t3(t1)             // t1 = outptr = output_buf[0][output_row]
    addiu   a3, 1                  // ++output_row
    addu    t9, t1, a0             // t9 = end of row
    subu    t8, t9, t7             // t8 = end of the part handled 4 at a time
    beq     t1, t8, 2f             // fewer than 4 pixels in the row
     nop

    /* Four pixels per iteration, alternating between $ac0 and $ac1. */
1:
    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo    s7, $ac0
    maddu   $ac0, s2, t5
    maddu   $ac0, s1, t4
    maddu   $ac0, s0, t3
    mtlo    s7, $ac1
    maddu   $ac1, s2, s5
    maddu   $ac1, s1, s4
    maddu   $ac1, s0, s3
    extr.w  t6, $ac0, 16           // t6 = gray of pixel 0

    DO_RGB_TO_GRAY t3, t4, t5, t0
    DO_RGB_TO_GRAY s3, s4, s5, t0

    mtlo    s7, $ac0
    maddu   $ac0, s2, t5
    maddu   $ac0, s1, t4
    extr.w  t2, $ac1, 16           // t2 = gray of pixel 1
    maddu   $ac0, s0, t3
    mtlo    s7, $ac1
    maddu   $ac1, s2, s5
    maddu   $ac1, s1, s4
    maddu   $ac1, s0, s3
    extr.w  t5, $ac0, 16           // t5 = gray of pixel 2
    sb      t6, 0(t1)
    sb      t2, 1(t1)
    extr.w  t3, $ac1, 16           // t3 = gray of pixel 3
    addiu   t1, 4
    sb      t5, -2(t1)
    sb      t3, -1(t1)
    bne     t1, t8, 1b
     nop

2:
    beqz    t7, 4f                 // no residual columns
     nop

    /* Residual columns, one pixel per iteration. */
3:
    DO_RGB_TO_GRAY t3, t4, t5, t0

    mtlo    s7, $ac0
    maddu   $ac0, s2, t5
    maddu   $ac0, s1, t4
    maddu   $ac0, s0, t3
    extr.w  t6, $ac0, 16
    sb      t6, 0(t1)
    addiu   t1, 1
    bne     t1, t9, 3b
     nop

4:
    bgtz    s6, 0b
     addiu  a1, 4                  // ++input_buf

5:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j ra
     nop
END(jsimd_\colorid\()_gray_convert_mips_dspr2)

.purgem DO_RGB_TO_GRAY

.endm

/*------------------------------------------id --  pix R  G  B */
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
483/*****************************************************************************/
484/*
485 * jsimd_h2v2_merged_upsample_mips_dspr2
486 * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
487 * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
488 * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
489 * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
490 * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
491 * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
492 *
493 * Merged h2v2 upsample routines
494 */
/*
 * Emits jsimd_h2v2_<colorid>_merged_upsample_mips_dspr2.
 *   colorid    - name fragment used in the generated symbol
 *   pixel_size - bytes occupied by TWO output pixels (6 or 8)
 *   r1/g1/b1/a1_offs - byte offsets of the first pixel's components
 *   r2/g2/b2/a2_offs - byte offsets of the second pixel's components
 * The a1/a2 offsets are used only when pixel_size == 8.
 */
.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
                                                pixel_size, \
                                                r1_offs,    \
                                                g1_offs,    \
                                                b1_offs,    \
                                                a1_offs,    \
                                                r2_offs,    \
                                                g2_offs,    \
                                                b2_offs,    \
                                                a2_offs

/*
 * Store two pixels (scratch0-2 = R,G,B of the first, scratch3-5 = R,G,B of
 * the second) at outptr and advance outptr past both.  For 8-byte pairs
 * scratch0 is overwritten with 0xFF to fill the two extra bytes.
 */
.macro STORE_H2V2_2_PIXELS  scratch0 \
                            scratch1 \
                            scratch2 \
                            scratch3 \
                            scratch4 \
                            scratch5 \
                            outptr
    sb       \scratch0, \r1_offs(\outptr)
    sb       \scratch1, \g1_offs(\outptr)
    sb       \scratch2, \b1_offs(\outptr)
    sb       \scratch3, \r2_offs(\outptr)
    sb       \scratch4, \g2_offs(\outptr)
    sb       \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li       \scratch0, 0xFF
    sb       \scratch0, \a1_offs(\outptr)
    sb       \scratch0, \a2_offs(\outptr)
.endif
    addiu    \outptr, \pixel_size
.endm

/*
 * Store a single pixel at outptr without advancing it.  For 8-byte pairs
 * t0 is clobbered to write the 0xFF filler byte.
 */
.macro STORE_H2V2_1_PIXEL  scratch0 \
                           scratch1 \
                           scratch2 \
                           outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)

.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
.endif
.endm

LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
/*
 * a0     - cinfo->output_width
 * a1     - input_buf
 * a2     - in_row_group_ctr
 * a3     - output_buf
 * 16(sp) - cinfo->sample_range_limit
 *
 * Register roles:
 *   t1/t2 = Y inptr for the upper/lower output row
 *   t5/t6 = Cb/Cr inptr            t0 = Cb inptr at end of the paired part
 *   t4/t7 = outptr for the upper/lower output row
 *   t9    = sample_range_limit table
 *   t8, s0, s1, s2 = fixed-point coefficients
 *   s4/s5/s6 = cred/cgreen/cblue shared by the four pixels of one 2x2 cell
 *   AT and ra are used as scratch (ra is saved and restored with s0-s7).
 *   NOTE(review): using AT presumably relies on LEAF_MIPS_DSPR2 selecting
 *   ".set noat"; that macro is not visible here.
 */

    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw           t9, 56(sp)        // cinfo->sample_range_limit (entry offset 16 + 40)
    lw           v0, 0(a1)
    lw           v1, 4(a1)
    lw           t0, 8(a1)
    sll          t1, a2, 3
    addiu        t2, t1, 4
    sll          t3, a2, 2
    lw           t4, 0(a3)         // t4 = output_buf[0]
    lwx          t1, t1(v0)        // t1 = input_buf[0][in_row_group_ctr*2]
    lwx          t2, t2(v0)        // t2 = input_buf[0][in_row_group_ctr*2 + 1]
    lwx          t5, t3(v1)        // t5 = input_buf[1][in_row_group_ctr]
    lwx          t6, t3(t0)        // t6 = input_buf[2][in_row_group_ctr]
    lw           t7, 4(a3)         // t7 = output_buf[1]
    /*
     * The coefficients are loaded as full 32-bit constants so that their
     * values do not depend on how the assembler treats 16-bit immediates
     * above 0x7fff.
     */
    li           t8, 0x166e9       // t8 = FIX(1.40200)
    li           s0, 0x1c5a2       // s0 = FIX(1.77200)
    li           s1, 0xffffa7e6    // s1 = -FIX(0.34414)
    li           s2, 0xffff492e    // s2 = -FIX(0.71414)
    srl          t3, a0, 1         // t3 = number of 2-pixel-wide cells
    blez         t3, 2f
     addu        t0, t5, t3        // t0 = end address
1:
    lbu          t3, 0(t5)
    lbu          s3, 0(t6)
    addiu        t5, t5, 1
    addiu        t3, t3, -128      // (cb - 128)
    addiu        s3, s3, -128      // (cr - 128)
    mult         $ac1, s1, t3
    madd         $ac1, s2, s3
    sll          s3, s3, 15
    sll          t3, t3, 15
    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    extr_r.w     s5, $ac1, 16      // s5 = cgreen (rounded)
    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    lbu          v0, 0(t1)         // upper row, first y
    addiu        t6, t6, 1
    addiu        t1, t1, 2
    addu         t3, v0, s4        // y+cred
    addu         s3, v0, s5        // y+cgreen
    addu         v1, v0, s6        // y+cblue
    addu         t3, t9, t3        // y+cred
    addu         s3, t9, s3        // y+cgreen
    addu         v1, t9, v1        // y+cblue
    lbu          AT, 0(t3)         // range_limit[] lookups
    lbu          s7, 0(s3)
    lbu          ra, 0(v1)
    lbu          v0, -1(t1)        // upper row, second y
    addu         t3, v0, s4        // y+cred
    addu         s3, v0, s5        // y+cgreen
    addu         v1, v0, s6        // y+cblue
    addu         t3, t9, t3        // y+cred
    addu         s3, t9, s3        // y+cgreen
    addu         v1, t9, v1        // y+cblue
    lbu          t3, 0(t3)
    lbu          s3, 0(s3)
    lbu          v1, 0(v1)
    lbu          v0, 0(t2)         // lower row, first y

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4

    addu         t3, v0, s4        // y+cred
    addu         s3, v0, s5        // y+cgreen
    addu         v1, v0, s6        // y+cblue
    addu         t3, t9, t3        // y+cred
    addu         s3, t9, s3        // y+cgreen
    addu         v1, t9, v1        // y+cblue
    lbu          AT, 0(t3)
    lbu          s7, 0(s3)
    lbu          ra, 0(v1)
    lbu          v0, 1(t2)         // lower row, second y
    addiu        t2, t2, 2
    addu         t3, v0, s4        // y+cred
    addu         s3, v0, s5        // y+cgreen
    addu         v1, v0, s6        // y+cblue
    addu         t3, t9, t3        // y+cred
    addu         s3, t9, s3        // y+cgreen
    addu         v1, t9, v1        // y+cblue
    lbu          t3, 0(t3)
    lbu          s3, 0(s3)
    lbu          v1, 0(v1)

    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7

    bne          t0, t5, 1b
     nop
2:
    andi         t0, a0, 1
    beqz         t0, 4f            // even width: no trailing column
     nop                           // (chroma is loaded only when it is needed,
                                   // so even widths never read past the row)
    lbu          t3, 0(t5)
    lbu          s3, 0(t6)
    addiu        t3, t3, -128      // (cb - 128)
    addiu        s3, s3, -128      // (cr - 128)
    mult         $ac1, s1, t3
    madd         $ac1, s2, s3
    sll          s3, s3, 15
    sll          t3, t3, 15
    lbu          v0, 0(t1)         // upper row, last y
    extr_r.w     s5, $ac1, 16      // s5 = cgreen (rounded)
    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
    addu         t3, v0, s4        // y+cred
    addu         s3, v0, s5        // y+cgreen
    addu         v1, v0, s6        // y+cblue
    addu         t3, t9, t3        // y+cred
    addu         s3, t9, s3        // y+cgreen
    addu         v1, t9, v1        // y+cblue
    lbu          t3, 0(t3)
    lbu          s3, 0(s3)
    lbu          v1, 0(v1)
    lbu          v0, 0(t2)         // lower row, last y

    STORE_H2V2_1_PIXEL t3, s3, v1, t4

    addu         t3, v0, s4        // y+cred
    addu         s3, v0, s5        // y+cgreen
    addu         v1, v0, s6        // y+cblue
    addu         t3, t9, t3        // y+cred
    addu         s3, t9, s3        // y+cgreen
    addu         v1, t9, v1        // y+cblue
    lbu          t3, 0(t3)
    lbu          s3, 0(s3)
    lbu          v1, 0(v1)

    STORE_H2V2_1_PIXEL t3, s3, v1, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j           ra
     nop

END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)

.purgem STORE_H2V2_1_PIXEL
.purgem STORE_H2V2_2_PIXELS
.endm

/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
695/*****************************************************************************/
696/*
697 * jsimd_h2v1_merged_upsample_mips_dspr2
698 * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
699 * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
700 * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
701 * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
702 * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
703 * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
704 *
705 * Merged h2v1 upsample routines
706 */
707
/*
 * Emits jsimd_h2v1_<colorid>_merged_upsample_mips_dspr2.
 *   colorid    - name fragment used in the generated symbol
 *   pixel_size - bytes occupied by TWO output pixels (6 or 8)
 *   r1/g1/b1/a1_offs - byte offsets of the first pixel's components
 *   r2/g2/b2/a2_offs - byte offsets of the second pixel's components
 * The a1/a2 offsets are used only when pixel_size == 8.
 */
.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
                                                pixel_size, \
                                                r1_offs,    \
                                                g1_offs,    \
                                                b1_offs,    \
                                                a1_offs,    \
                                                r2_offs,    \
                                                g2_offs,    \
                                                b2_offs,    \
                                                a2_offs

/*
 * Store two pixels (scratch0-2 = R,G,B of the first, scratch3-5 = R,G,B of
 * the second) at outptr and advance outptr past both.  For 8-byte pairs t0
 * is clobbered to write the two 0xFF filler bytes.
 */
.macro STORE_H2V1_2_PIXELS  scratch0 \
                            scratch1 \
                            scratch2 \
                            scratch3 \
                            scratch4 \
                            scratch5 \
                            outptr
    sb       \scratch0, \r1_offs(\outptr)
    sb       \scratch1, \g1_offs(\outptr)
    sb       \scratch2, \b1_offs(\outptr)
    sb       \scratch3, \r2_offs(\outptr)
    sb       \scratch4, \g2_offs(\outptr)
    sb       \scratch5, \b2_offs(\outptr)
.if (\pixel_size == 8)
    li       t0, 0xFF
    sb       t0, \a1_offs(\outptr)
    sb       t0, \a2_offs(\outptr)
.endif
    addiu    \outptr, \pixel_size
.endm

/*
 * Store a single pixel at outptr without advancing it.  For 8-byte pairs
 * t0 is clobbered to write the 0xFF filler byte.
 */
.macro STORE_H2V1_1_PIXEL  scratch0 \
                           scratch1 \
                           scratch2 \
                           outptr
    sb    \scratch0, \r1_offs(\outptr)
    sb    \scratch1, \g1_offs(\outptr)
    sb    \scratch2, \b1_offs(\outptr)
.if (\pixel_size == 8)
    li    t0, 0xFF
    sb    t0, \a1_offs(\outptr)
.endif
.endm

LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
/*
 * a0     - cinfo->output_width
 * a1     - input_buf
 * a2     - in_row_group_ctr
 * a3     - output_buf
 * 16(sp) - range_limit
 *
 * Register roles:
 *   s5/s6/s7 = Y/Cb/Cr inptr      t9 = Cb inptr at end of the paired part
 *   t7 = outptr                   t8 = range_limit table
 *   s0 = ONE_HALF, s1-s4 = fixed-point coefficients
 *   t0/t5/t6 = cred/cgreen/cblue shared by the two pixels of one pair
 *   ra is used as scratch (saved and restored together with s0-s7).
 */

    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    lw           t1, 0(a1)         // t1 = input_buf[0]
    lw           t2, 4(a1)         // t2 = input_buf[1]
    lw           t3, 8(a1)         // t3 = input_buf[2]
    lw           t8, 56(sp)        // t8 = range_limit (entry offset 16 + 40)
    /*
     * The constants are loaded as full 32-bit values so that they do not
     * depend on how the assembler treats 16-bit immediates above 0x7fff.
     */
    li           s1, 0x166e9       // s1 = FIX(1.40200)
    li           s2, 0x1c5a2       // s2 = FIX(1.77200)
    li           s0, 0x8000        // s0 = ONE_HALF
    li           s4, 0xffffa7e6    // s4 = -FIX(0.34414)
    li           s3, 0xffff492e    // s3 = -FIX(0.71414)
    srl          t0, a0, 1         // t0 = number of pixel pairs
    sll          t4, a2, 2
    lwx          s5, t4(t1)        // s5 = inptr0
    lwx          s6, t4(t2)        // s6 = inptr1
    lwx          s7, t4(t3)        // s7 = inptr2
    lw           t7, 0(a3)         // t7 = outptr
    blez         t0, 2f
     addu        t9, s6, t0        // t9 = end address
1:
    lbu          t2, 0(s6)         // t2 = cb
    lbu          t0, 0(s7)         // t0 = cr
    lbu          t1, 0(s5)         // t1 = y
    addiu        t2, t2, -128      // t2 = cb - 128
    addiu        t0, t0, -128      // t0 = cr - 128
    mult         $ac1, s4, t2
    madd         $ac1, s3, t0
    sll          t0, t0, 15
    sll          t2, t2, 15
    mulq_rs.w    t0, s1, t0        // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
    extr_r.w     t5, $ac1, 16      // t5 = cgreen (rounded)
    mulq_rs.w    t6, s2, t2        // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
    addiu        s7, s7, 1
    addiu        s6, s6, 1
    addu         t2, t1, t0        // t2 = y + cred
    addu         t3, t1, t5        // t3 = y + cgreen
    addu         t4, t1, t6        // t4 = y + cblue
    addu         t2, t8, t2
    addu         t3, t8, t3
    addu         t4, t8, t4
    lbu          t1, 1(s5)         // t1 = second y of the pair
    lbu          v0, 0(t2)         // range_limit[] lookups, first pixel
    lbu          v1, 0(t3)
    lbu          ra, 0(t4)
    addu         t2, t1, t0
    addu         t3, t1, t5
    addu         t4, t1, t6
    addu         t2, t8, t2
    addu         t3, t8, t3
    addu         t4, t8, t4
    lbu          t2, 0(t2)         // range_limit[] lookups, second pixel
    lbu          t3, 0(t3)
    lbu          t4, 0(t4)

    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7

    bne          t9, s6, 1b
     addiu       s5, s5, 2
2:
    andi         t0, a0, 1
    beqz         t0, 4f            // even width: no trailing pixel
     nop
3:
    lbu          t2, 0(s6)
    lbu          t0, 0(s7)
    lbu          t1, 0(s5)
    addiu        t2, t2, -128      //(cb - 128)
    addiu        t0, t0, -128      //(cr - 128)
    mul          t3, s4, t2
    mul          t4, s3, t0
    sll          t0, t0, 15
    sll          t2, t2, 15
    mulq_rs.w    t0, s1, t0       // (C1*cr + ONE_HALF)>> SCALEBITS
    mulq_rs.w    t6, s2, t2       // (C2*cb + ONE_HALF)>> SCALEBITS
    addu         t3, t3, s0
    addu         t3, t4, t3
    sra          t5, t3, 16       // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
    addu         t2, t1, t0       // y + cred
    addu         t3, t1, t5       // y + cgreen
    addu         t4, t1, t6       // y + cblue
    addu         t2, t8, t2
    addu         t3, t8, t3
    addu         t4, t8, t4
    lbu          t2, 0(t2)
    lbu          t3, 0(t3)
    lbu          t4, 0(t4)

    STORE_H2V1_1_PIXEL t2, t3, t4, t7
4:
    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra

    j            ra
     nop

END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)

.purgem STORE_H2V1_1_PIXEL
.purgem STORE_H2V1_2_PIXELS
.endm

/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
870/*****************************************************************************/
871/*
872 * jsimd_h2v2_fancy_upsample_mips_dspr2
873 *
874 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
875 */
LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
/*
 * a0     - cinfo->max_v_samp_factor
 * a1     - downsampled_width
 * a2     - input_data
 * a3     - output_data_ptr
 *
 * For every input row, two output rows are written.  Each pass combines the
 * current row (inptr0) with a neighbouring row (inptr1): the previous entry
 * of the input_data pointer array on the first pass, the next entry on the
 * second pass.  Per column, colsum = 3 * inptr0[x] + inptr1[x], and each
 * column yields two output samples:
 *   left  = (3 * thiscolsum + lastcolsum + 8) >> 4
 *   right = (3 * thiscolsum + nextcolsum + 7) >> 4
 * The first and last columns use thiscolsum * 4 in place of the missing
 * neighbour.
 *
 * Register roles:
 *   s0 = inptr0, s1 = inptr1, s2 = output_data, s3 = outptr
 *   s4 = byte offset of the current output row pointer (outrow * 4)
 *   s5 = downsampled_width - 2, later its low bit (odd column left over)
 *   t6 = thiscolsum, t7 = lastcolsum, t8 = loop end address
 *   t9 = passes remaining for the current input row (2, then 1)
 *
 * NOTE(review): the lh + preceu.ph.qbr unpacking in loop 2 puts the first
 * byte in the low halfword, which presumably assumes a little-endian target
 * and halfword-aligned row data -- confirm against the supported targets.
 */

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    li             s4, 0           // s4 = outrow * 4
    lw             s2, 0(a3)       // s2 = *output_data_ptr
0:
    li             t9, 2           // two output rows per input row
    lw             s1, -4(a2)      // s1 = inptr1 = input_data[-1] (first pass)

1:
    lw             s0, 0(a2)       // s0 = inptr0
    lwx            s3, s4(s2)      // s3 = outptr = output_data[outrow]
    addiu          s5, a1, -2      // s5 = downsampled_width - 2
    srl            t4, s5, 1
    sll            t4, t4, 1       // t4 = (downsampled_width - 2) rounded down to even
    lbu            t0, 0(s0)
    lbu            t1, 1(s0)
    lbu            t2, 0(s1)
    lbu            t3, 1(s1)
    addiu          s0, 2
    addiu          s1, 2
    addu           t8, s0, t4      // t8 = end address of the paired-column loop
    andi           s5, s5, 1       // s5 = residual (1 if one odd column remains)
    sll            t4, t0, 1
    sll            t6, t1, 1
    addu           t0, t0, t4      // t0 = (*inptr0++) * 3
    addu           t1, t1, t6      // t1 = (*inptr0++) * 3
    addu           t7, t0, t2      // t7 = thiscolsum
    addu           t6, t1, t3      // t6 = nextcolsum
    sll            t0, t7, 2       // t0 = thiscolsum * 4
    subu           t1, t0, t7      // t1 = thiscolsum * 3
    shra_r.w       t0, t0, 4       // t0 = (thiscolsum * 4 + 8) >> 4
    addiu          t1, 7
    addu           t1, t1, t6
    srl            t1, t1, 4       // t1 = (thiscolsum * 3 + nextcolsum + 7) >> 4
    sb             t0, 0(s3)
    sb             t1, 1(s3)
    beq            t8, s0, 22f     // no column pairs left (width is 2 or 3)
     addiu          s3, 2
2:
    // Two columns per iteration.  On entry t6 = this, t7 = last.
    lh             t0, 0(s0)       // t0 = A3|A2
    lh             t2, 0(s1)       // t2 = B3|B2
    addiu          s0, 2
    addiu          s1, 2
    preceu.ph.qbr  t0, t0          // t0 = 0|A3|0|A2
    preceu.ph.qbr  t2, t2          // t2 = 0|B3|0|B2
    shll.ph        t1, t0, 1
    sll            t3, t6, 1
    addu.ph        t0, t1, t0      // t0 = A3*3|A2*3
    addu           t3, t3, t6      // t3 = this * 3
    addu.ph        t0, t0, t2      // t0 = next2|next1
    addu           t1, t3, t7      // t1 = this*3 + last
    andi           t7, t0, 0xFFFF  // t7 = next1
    sll            t2, t7, 1
    addu           t2, t7, t2      // t2 = next1*3
    addu           t4, t2, t6      // t4 = next1*3 + this
    srl            t6, t0, 16      // t6 = next2
    shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
    addu           t0, t3, t7
    addiu          t0, 7
    srl            t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
    shra_r.w       t4, t4, 4       // t4 = (next1*3 + this + 8) >> 4
    addu           t2, t2, t6
    addiu          t2, 7
    srl            t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
    sb             t1, 0(s3)
    sb             t0, 1(s3)
    sb             t4, 2(s3)
    sb             t2, 3(s3)
    bne            t8, s0, 2b      // on exit t7 = last, t6 = this again
     addiu         s3, 4
22:
    beqz           s5, 4f          // even width: go straight to the last column
     addu          t8, s0, s5      // t8 = end address of the residual loop
3:
    // Single leftover column.
    lbu            t0, 0(s0)
    lbu            t2, 0(s1)
    addiu          s0, 1
    addiu          s1, 1
    sll            t3, t6, 1
    sll            t1, t0, 1
    addu           t1, t0, t1      // t1 = *inptr0 * 3
    addu           t3, t3, t6      // t3 = thiscolsum * 3
    addu           t5, t1, t2      // t5 = nextcolsum
    addu           t1, t3, t7
    shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
    addu           t0, t3, t5
    addiu          t0, 7
    srl            t0, t0, 4       // t0 = (this*3 + next + 7) >> 4
    sb             t1, 0(s3)
    sb             t0, 1(s3)
    addiu          s3, 2
    move           t7, t6          // last = this
    bne            t8, s0, 3b
     move          t6, t5          // this = next
4:
    // Last column: no right neighbour.
    sll            t0, t6, 2       // t0 = thiscolsum * 4
    subu           t1, t0, t6      // t1 = thiscolsum * 3
    addu           t1, t1, t7
    addiu          s4, 4           // advance to the next output row pointer
    shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
    addiu          t0, 7
    srl            t0, t0, 4       // t0 = (this*4 + 7) >> 4
    sb             t1, 0(s3)
    sb             t0, 1(s3)
    addiu          t9, -1
    addiu          s3, 2
    bnez           t9, 1b          // second pass uses the following input row
     lw            s1, 4(a2)       // s1 = inptr1 = input_data[1]
    srl            t0, s4, 2       // t0 = output rows written so far
    subu           t0, a0, t0
    bgtz           t0, 0b          // loop while outrow < max_v_samp_factor
     addiu         a2, 4           // next input row pointer

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j ra
     nop
END(jsimd_h2v2_fancy_upsample_mips_dspr2)
1002
1003/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
/*
 * a0     - cinfo->max_v_samp_factor
 * a1     - downsampled_width
 * a2     - input_data
 * a3     - output_data_ptr
 *
 * Triangle-filter 2:1 horizontal upsampling.  Each input sample P[x] yields
 * two output samples:
 *   left  = (3 * P[x] + P[x-1] + 1) >> 2
 *   right = (3 * P[x] + P[x+1] + 2) >> 2
 * The very first output of a row is P[0] itself and the very last output is
 * the last input sample.
 *
 * Register roles:
 *   s0 = end address of the output row pointer array
 *   s1 = current position in the output row pointer array
 *   s2 = outptr, t7 = inptr
 *   s3 = 0x00010001 (the +1 bias for both packed halfwords)
 *   t8 = downsampled_width - 2, t9 = number of 4-column groups
 *
 * Returns immediately when a0 is zero.
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    .set at

    beqz           a0, 3f
     sll           t0, a0, 2
    lw             s1, 0(a3)   // s1 = output_data
    li             s3, 0x10001
    addu           s0, s1, t0  // s0 = &output_data[max_v_samp_factor]
0:
    addiu          t8, a1, -2  // t8 = columns between the first and the last
    srl            t9, t8, 2   // t9 = groups of four such columns
    lw             t7, 0(a2)   // t7 = inptr
    lw             s2, 0(s1)   // s2 = outptr
    lbu            t0, 0(t7)   // t0 = inptr[0]
    lbu            t1, 1(t7)   // t1 = inptr[1]
    sll            t2, t0, 1
    addu           t2, t2, t0  // t2 = invalue*3
    addu           t2, t2, t1
    shra_r.w       t2, t2, 2   // t2 = (invalue*3 + inptr[1] + 2) >> 2
    sb             t0, 0(s2)   // first output is the first sample unchanged
    sb             t2, 1(s2)
    beqz           t9, 11f
     addiu         s2, 2
1:
    // Four columns (P1..P4) per iteration; P0 is the column before them.
    ulw            t0, 0(t7)   // t0 = |P3|P2|P1|P0|
    ulw            t1, 1(t7)   // t1 = |P4|P3|P2|P1|
    ulh            t2, 4(t7)   // t2 = |0|0|P5|P4|
    preceu.ph.qbl  t3, t0      // t3 = |0|P3|0|P2|
    preceu.ph.qbr  t0, t0      // t0 = |0|P1|0|P0|
    preceu.ph.qbr  t2, t2      // t2 = |0|P5|0|P4|
    preceu.ph.qbl  t4, t1      // t4 = |0|P4|0|P3|
    preceu.ph.qbr  t1, t1      // t1 = |0|P2|0|P1|
    shll.ph        t5, t4, 1
    shll.ph        t6, t1, 1
    addu.ph        t5, t5, t4  // t5 = |P4*3|P3*3|
    addu.ph        t6, t6, t1  // t6 = |P2*3|P1*3|
    addu.ph        t4, t3, s3  // t4 = |P3+1|P2+1|
    addu.ph        t0, t0, s3  // t0 = |P1+1|P0+1|
    addu.ph        t4, t4, t5
    addu.ph        t0, t0, t6
    shrl.ph        t4, t4, 2   // t4 = left outputs of |P4|P3|
    shrl.ph        t0, t0, 2   // t0 = left outputs of |P2|P1|
    addu.ph        t2, t2, t5
    addu.ph        t3, t3, t6
    shra_r.ph      t2, t2, 2   // t2 = right outputs of |P4|P3| (rounded, +2)
    shra_r.ph      t3, t3, 2   // t3 = right outputs of |P2|P1| (rounded, +2)
    shll.ph        t2, t2, 8   // move right outputs into the odd bytes
    shll.ph        t3, t3, 8
    or             t2, t4, t2  // t2 = |P4 right|P4 left|P3 right|P3 left|
    or             t3, t3, t0  // t3 = |P2 right|P2 left|P1 right|P1 left|
    addiu          t9, -1
    usw            t3, 0(s2)
    usw            t2, 4(s2)
    addiu          s2, 8
    bgtz           t9, 1b
     addiu         t7, 4
11:
    andi           t8, 3       // t8 = columns left over after the 4-wide loop
    beqz           t8, 22f
     addiu         t7, 1       // t7 -> first column not yet processed

2:
    // One column per iteration.
    lbu            t0, 0(t7)
    addiu          t7, 1
    sll            t1, t0, 1
    addu           t2, t0, t1  // t2 = invalue*3
    lbu            t3, -2(t7)  // t3 = previous sample
    lbu            t4, 0(t7)   // t4 = next sample
    addiu          t3, 1
    addiu          t4, 2
    addu           t3, t3, t2
    addu           t4, t4, t2
    srl            t3, 2       // t3 = (invalue*3 + previous + 1) >> 2
    srl            t4, 2       // t4 = (invalue*3 + next + 2) >> 2
    sb             t3, 0(s2)
    sb             t4, 1(s2)
    addiu          t8, -1
    bgtz           t8, 2b
     addiu         s2, 2

22:
    // Last column: its right output is the sample itself.
    lbu            t0, 0(t7)
    lbu            t2, -1(t7)
    sll            t1, t0, 1
    addu           t1, t1, t0 // t1 = invalue * 3
    addu           t1, t1, t2
    addiu          t1, 1
    srl            t1, t1, 2  // t1 = (invalue*3 + previous + 1) >> 2
    sb             t1, 0(s2)
    sb             t0, 1(s2)
    addiu          s1, 4      // next output row pointer
    bne            s1, s0, 0b
     addiu         a2, 4      // next input row pointer
3:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j              ra
     nop
END(jsimd_h2v1_fancy_upsample_mips_dspr2)
1112
1113/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
/*
 * a0     - cinfo->image_width
 * a1     - cinfo->max_v_samp_factor
 * a2     - compptr->v_samp_factor
 * a3     - compptr->width_in_blocks
 * 16(sp) - input_data
 * 20(sp) - output_data
 *
 * 2:1 horizontal downsampling.  Each output sample is the average of two
 * adjacent input samples, with a rounding bias that alternates 0, 1, 0, 1...
 * Each output row is then padded on the right up to width_in_blocks * 8
 * samples with the value of the last input sample.
 *
 * Register roles:
 *   s0 = input row pointer array, s1 = output row pointer array
 *   s2 = number of halfword stores needed to pad one output row
 *   s3 = bias (0/1), s4 = 4-sample group counter, then residual end address
 *   t4 = outptr, t5 = inptr0 (reused as the padding counter at the end)
 *   t6 = index of the sample to replicate relative to t5
 *        (0 if image_width is odd, -1 if even)
 *   t8 = residual output samples, t9 = image_width & 2 (odd padding byte)
 *
 * a1 is not used.  Returns immediately when a2 is zero.
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

    beqz        a2, 7f
     lw         s1, 44(sp)  // s1 = output_data
    lw          s0, 40(sp)  // s0 = input_data
    srl         s2, a0, 2
    andi        t9, a0, 2
    srl         t7, t9, 1
    addu        s2, t7, s2
    sll         t0, a3, 3   // t0 = width_in_blocks*DCT
    srl         t7, t0, 1
    subu        s2, t7, s2  // s2 = padding length in halfwords
0:
    andi        t6, a0, 1   // t6 = temp_index
    addiu       t6, -1
    lw          t4, 0(s1)   // t4 = outptr
    lw          t5, 0(s0)   // t5 = inptr0
    li          s3, 0       // s3 = bias
    srl         t7, a0, 1   // t7 = image_width1
    srl         s4, t7, 2   // s4 = groups of 4 output samples
    // The unrolled loop below tests its counter at the bottom, so it must
    // be skipped when there is not even one full group (image_width < 8).
    blez        s4, 8f
     andi       t8, t7, 3   // t8 = remaining output samples
1:
    ulhu        t0, 0(t5)
    ulhu        t1, 2(t5)
    ulhu        t2, 4(t5)
    ulhu        t3, 6(t5)
    raddu.w.qb  t0, t0      // sum of each sample pair
    raddu.w.qb  t1, t1
    raddu.w.qb  t2, t2
    raddu.w.qb  t3, t3
    shra.ph     t0, t0, 1   // bias 0
    shra_r.ph   t1, t1, 1   // bias 1
    shra.ph     t2, t2, 1   // bias 0
    shra_r.ph   t3, t3, 1   // bias 1
    sb          t0, 0(t4)
    sb          t1, 1(t4)
    sb          t2, 2(t4)
    sb          t3, 3(t4)
    addiu       s4, -1
    addiu       t4, 4
    bgtz        s4, 1b
     addiu      t5, 8
8:
    beqz        t8, 3f
     addu       s4, t4, t8  // s4 = end address of the residual loop
2:
    ulhu        t0, 0(t5)
    raddu.w.qb  t0, t0
    addqh.w     t0, t0, s3  // (sum + bias) >> 1
    xori        s3, s3, 1
    sb          t0, 0(t4)
    addiu       t4, 1
    bne         t4, s4, 2b
     addiu      t5, 2
3:
    // Right-edge padding with the last input sample.
    lbux        t1, t6(t5)
    sll         t1, 1
    addqh.w     t2, t1, s3  // t2 = pixval1
    xori        s3, s3, 1
    addqh.w     t3, t1, s3  // t3 = pixval2
    blez        s2, 5f
     append     t3, t2,  8  // t3 = pixval2:pixval1 packed for a halfword store
    // Count down a copy so that s2 keeps its value for the following rows;
    // t5 is free here because it is reloaded at the top of the row loop.
    move        t5, s2
4:
    ush         t3, 0(t4)
    addiu       t5, -1
    bgtz        t5, 4b
     addiu      t4,  2
5:
    beqz        t9, 6f      // one more byte when image_width / 2 is odd
     nop
    sb          t2, 0(t4)
6:
    addiu       s1, 4
    addiu       a2, -1
    bnez        a2, 0b
     addiu      s0, 4
7:
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4

    j           ra
     nop
END(jsimd_h2v1_downsample_mips_dspr2)
1207
1208/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)

/*
 * a0     - cinfo->image_width
 * a1     - cinfo->max_v_samp_factor
 * a2     - compptr->v_samp_factor
 * a3     - compptr->width_in_blocks
 * 16(sp) - input_data
 * 20(sp) - output_data
 *
 * 2:1 horizontal and 2:1 vertical downsampling.  Each output sample is the
 * average of a 2x2 block of input samples, with a rounding bias that
 * alternates 1, 2, 1, 2...  Each output row is then padded on the right up
 * to width_in_blocks * 8 samples from the last input column.
 *
 * Register roles:
 *   s0 = input row pointer array (two rows consumed per output row)
 *   s1 = output row pointer array
 *   s2 = number of halfword stores needed to pad one output row
 *   s4 = 4-sample group counter, s6 = bias (1/2)
 *   t4 = outptr, t5 = inptr0 (reused as the padding counter), s7 = inptr1
 *   t6 = index of the column to replicate relative to t5/s7
 *        (0 if image_width is odd, -1 if even)
 *   t8 = residual output samples, then residual end address
 *   t9 = image_width & 2 (odd padding byte)
 *
 * a1 is not used.  Returns immediately when a2 is zero.
 */
    .set at
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz         a2, 8f
     lw          s1, 52(sp)      // s1 = output_data
    lw           s0, 48(sp)      // s0 = input_data

    andi         t6, a0, 1       // t6 = temp_index
    addiu        t6, -1
    andi         t9, a0, 2
    srl          s2, a0, 2
    srl          t7, t9, 1
    addu         s2, t7, s2
    sll          t0, a3, 3       // t0 = width_in_blocks*DCT
    srl          t7, t0, 1
    subu         s2, t7, s2      // s2 = padding length in halfwords
0:
    lw           t4, 0(s1)       // t4 = outptr
    lw           t5, 0(s0)       // t5 = inptr0
    lw           s7, 4(s0)       // s7 = inptr1
    li           s6, 1           // s6 = bias
    // The column counters are consumed by the loops below, so they are
    // derived from image_width again for every output row.
    srl          t7, a0, 1       // t7 = image_width1
    srl          s4, t7, 2       // s4 = groups of 4 output samples
    // The unrolled loop tests its counter at the bottom, so it must be
    // skipped when there is not even one full group (image_width < 8).
    blez         s4, 1f
     andi        t8, t7, 3       // t8 = remaining output samples
2:
    ulw          t0, 0(t5)       // t0 = |P3|P2|P1|P0|
    ulw          t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
    ulw          t2, 4(t5)
    ulw          t3, 4(s7)
    precrq.ph.w  t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
    ins          t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
    raddu.w.qb   t1, t7
    raddu.w.qb   t0, t0
    shra_r.w     t1, t1, 2       // bias 2
    addiu        t0, 1           // bias 1
    srl          t0, 2
    precrq.ph.w  t7, t2, t3
    ins          t2, t3, 16, 16
    raddu.w.qb   t7, t7
    raddu.w.qb   t2, t2
    shra_r.w     t7, t7, 2       // bias 2
    addiu        t2, 1           // bias 1
    srl          t2, 2
    sb           t0, 0(t4)
    sb           t1, 1(t4)
    sb           t2, 2(t4)
    sb           t7, 3(t4)
    addiu        t4, 4
    addiu        t5, 8
    addiu        s4, s4, -1
    bgtz         s4, 2b
     addiu       s7, 8
1:
    beqz         t8, 4f
     addu        t8, t4, t8      // t8 = end address of the residual loop
3:
    ulhu         t0, 0(t5)
    ulhu         t1, 0(s7)
    ins          t0, t1, 16, 16
    raddu.w.qb   t0, t0
    addu         t0, t0, s6
    srl          t0, 2           // (sum of 2x2 block + bias) >> 2
    xori         s6, s6, 3       // toggle the bias between 1 and 2
    sb           t0, 0(t4)
    addiu        t5, 2
    addiu        t4, 1
    bne          t8, t4, 3b
     addiu       s7, 2
4:
    // Right-edge padding from the last input column of both rows.
    lbux         t1, t6(t5)
    sll          t1, 1
    lbux         t0, t6(s7)
    sll          t0, 1
    addu         t1, t1, t0
    addu         t3, t1, s6
    srl          t0, t3, 2       // t0 = pixval1
    xori         s6, s6, 3
    addu         t2, t1, s6
    srl          t1, t2, 2       // t1 = pixval2
    blez         s2, 6f
     append      t1, t0, 8       // t1 = pixval2:pixval1 packed for a halfword store
    // Count down a copy so that s2 keeps its value for the following rows;
    // t5 is free here because it is reloaded at the top of the row loop.
    move         t5, s2
5:
    ush          t1, 0(t4)
    addiu        t5, -1
    bgtz         t5, 5b
     addiu       t4, 2
6:
    beqz         t9, 7f          // one more byte when image_width / 2 is odd
     nop
    sb           t0, 0(t4)
7:
    addiu        s1, 4
    addiu        a2, -1
    bnez         a2, 0b
     addiu       s0, 8           // two input rows per output row
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j            ra
     nop
END(jsimd_h2v2_downsample_mips_dspr2)
1319/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
/*
 * a0     - input_data
 * a1     - output_data
 * a2     - compptr->v_samp_factor
 * a3     - cinfo->max_v_samp_factor
 * 16(sp) - cinfo->smoothing_factor
 * 20(sp) - compptr->width_in_blocks
 * 24(sp) - cinfo->image_width
 *
 * 2:1 horizontal and vertical downsampling with smoothing.  First, rows
 * input_data[-1] .. input_data[max_v_samp_factor] are padded on the right
 * with their last sample up to 2 * width_in_blocks * 8 columns.  Then each
 * output sample is computed as
 *   (membersum * memberscale + neighsum * neighscale + 32768) >> 16
 * where membersum is the sum of the 2x2 source block, neighsum is twice the
 * sum of the eight edge-adjacent neighbours plus the four corner
 * neighbours, memberscale = 16384 - smoothing_factor * 80 and
 * neighscale = smoothing_factor * 16.
 *
 * Register roles in the main loop:
 *   t4 = outrow, t5 = inrow, t6 = memberscale, t7 = neighscale
 *   t8 = outptr, s2 = inptr0, t9 = inptr1, s0 = above_ptr, s1 = below_ptr
 *   s7 = output_cols, s5 = loop end address
 *
 * NOTE(review): the lh loads presumably rely on halfword-aligned row data --
 * confirm.
 */

    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw          s7, 52(sp)      // compptr->width_in_blocks
    lw          s0, 56(sp)      // cinfo->image_width
    lw          s6, 48(sp)      // cinfo->smoothing_factor
    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
    sll         v0, s7, 1
    subu        v0, v0, s0      // v0 = columns to pad on the right
    blez        v0, 2f
     move       v1, zero        // v1 = row counter
    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
0:
    addiu       t1, a0, -4      // start from input_data[-1]
    sll         t2, v1, 2
    lwx         t1, t2(t1)
    move        t3, v0
    addu        t1, t1, s0      // t1 = first column past the image
    lbu         t2, -1(t1)      // t2 = last real sample of the row
1:
    addiu       t3, t3, -1
    sb          t2, 0(t1)
    bgtz        t3, 1b
     addiu      t1, t1, 1
    addiu       v1, v1, 1
    bne         v1, t0, 0b
     nop
2:
    li          v0, 80
    mul         v0, s6, v0
    li          v1, 16384
    move        t4, zero        // t4 = outrow
    move        t5, zero        // t5 = inrow
    subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
    blez        a2, 6f          // no output rows requested
     sll        t7, s6, 4       // t7 = tmp_smoot_f * 16
3:
/* Special case for first column: pretend column -1 is same as column 0 */
    sll         v0, t4, 2
    lwx         t8, v0(a1)      //  outptr = output_data[outrow]
    sll         v1, t5, 2
    addiu       t9, v1, 4
    addiu       s0, v1, -4
    addiu       s1, v1, 8
    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0          // t2 = membersum
    raddu.w.qb  s3, t0          // s3 = samples directly above and below
    lbu         v0, 0(s2)
    lbu         v1, 2(s2)
    lbu         t0, 0(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6    // ac1 = membersum * memberscale
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, 0(s0)
    lbu         t0, 0(s1)
    sll         s3, s3, 1       // edge neighbours count twice
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0      // t0 = corner neighbours
    addu        s3, t0, s3      // s3 = neighsum
    madd        $ac1, s3, t7    // ac1 += neighsum * neighscale
    extr_r.w    v0, $ac1, 16    // v0 = (ac1 + 32768) >> 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    addiu       s1, s1, 2
    sb          v0, -1(t8)
    addiu       s4, s7, -2
    and         s4, s4, 3       // s4 = columns handled one at a time
    addu        s5, s4, t8      // end address
4:
    /* One output column per iteration */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    addiu       t8, t8, 1
    addiu       s2, s2, 2
    addiu       t9, t9, 2
    addiu       s0, s0, 2
    sb          t2, -1(t8)
    bne         s5, t8, 4b
     addiu      s1, s1, 2
    addiu       s5, s7, -2
    subu        s5, s5, s4
    addu        s5, s5, t8      // end address
5:
    /* Four output columns per iteration */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 2(s2)
    lbu         t0, -1(t9)
    lbu         t1, 2(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 2(s0)
    addu        t0, t0, v0
    lbu         t3, 2(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    lh          v1, 2(t9)
    addu        t0, t0, v0
    lh          v0, 2(s2)
    addu        s3, t0, s3
    lh          t0, 2(s0)
    lh          t1, 2(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 4(s2)
    lbu         t0, 1(t9)
    lbu         t1, 4(t9)
    sb          t2, 0(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 1(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 4(s0)
    addu        t0, t0, v0
    lbu         v0, 1(s0)
    addu        s3, t0, s3
    lbu         t0, 1(s1)
    lbu         t3, 4(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 4(t9)
    addu        t0, t0, v0
    lh          v0, 4(s2)
    addu        s3, t0, s3
    lh          t0, 4(s0)
    lh          t1, 4(s1)
    madd        $ac1, s3, t7
    extr_r.w    t2, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 6(s2)
    lbu         t0, 3(t9)
    lbu         t1, 6(t9)
    sb          t2, 1(t8)
    raddu.w.qb  t3, v0
    lbu         v0, 3(s2)
    addu        t0, t0, t1
    mult        $ac1, t3, t6
    addu        v0, v0, v1
    lbu         t2, 6(s0)
    addu        t0, t0, v0
    lbu         v0, 3(s0)
    addu        s3, t0, s3
    lbu         t0, 3(s1)
    lbu         t3, 6(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    lh          v1, 6(t9)
    addu        t0, t0, v0
    lh          v0, 6(s2)
    addu        s3, t0, s3
    lh          t0, 6(s0)
    lh          t1, 6(s1)
    madd        $ac1, s3, t7
    extr_r.w    t3, $ac1, 16
    ins         t0, t1, 16, 16
    ins         v0, v1, 16, 16
    raddu.w.qb  s3, t0
    lbu         v1, 8(s2)
    lbu         t0, 5(t9)
    lbu         t1, 8(t9)
    sb          t3, 2(t8)
    raddu.w.qb  t2, v0
    lbu         v0, 5(s2)
    addu        t0, t0, t1
    mult        $ac1, t2, t6
    addu        v0, v0, v1
    lbu         t2, 8(s0)
    addu        t0, t0, v0
    lbu         v0, 5(s0)
    addu        s3, t0, s3
    lbu         t0, 5(s1)
    lbu         t3, 8(s1)
    addu        v0, v0, t2
    sll         s3, s3, 1
    addu        t0, t0, t3
    addiu       t8, t8, 4
    addu        t0, t0, v0
    addiu       s2, s2, 8
    addu        s3, t0, s3
    addiu       t9, t9, 8
    madd        $ac1, s3, t7
    extr_r.w    t1, $ac1, 16
    addiu       s0, s0, 8
    addiu       s1, s1, 8
    bne         s5, t8, 5b
     sb         t1, -1(t8)
/* Special case for last column */
    lh          v0, 0(s2)
    lh          v1, 0(t9)
    lh          t0, 0(s0)
    lh          t1, 0(s1)
    ins         v0, v1, 16, 16
    ins         t0, t1, 16, 16
    raddu.w.qb  t2, v0
    raddu.w.qb  s3, t0
    lbu         v0, -1(s2)
    lbu         v1, 1(s2)
    lbu         t0, -1(t9)
    lbu         t1, 1(t9)
    addu        v0, v0, v1
    mult        $ac1, t2, t6
    addu        t0, t0, t1
    lbu         t2, 1(s0)
    addu        t0, t0, v0
    lbu         t3, 1(s1)
    addu        s3, t0, s3
    lbu         v0, -1(s0)
    lbu         t0, -1(s1)
    sll         s3, s3, 1
    addu        v0, v0, t2
    addu        t0, t0, t3
    addu        t0, t0, v0
    addu        s3, t0, s3
    madd        $ac1, s3, t7
    extr_r.w    t0, $ac1, 16
    sb          t0, 0(t8)
    addiu       t4, t4, 1       // outrow++
    bne         t4, a2, 3b
     addiu      t5, t5, 2       // inrow += 2: one output row uses two input rows
6:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j           ra
     nop

END(jsimd_h2v2_smooth_downsample_mips_dspr2)
1619
1620/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
/*
 * a0     - upsample->h_expand[compptr->component_index]
 * a1     - upsample->v_expand[compptr->component_index]
 * a2     - input_data
 * a3     - output_data_ptr
 * 16(sp) - cinfo->output_width
 * 20(sp) - cinfo->max_v_samp_factor
 *
 * Upsampling by integral factors.  Each input sample is replicated h_expand
 * times into the output row; the finished output row is then copied into
 * the following v_expand - 1 output rows.  One input row is consumed for
 * every v_expand output rows until max_v_samp_factor output rows exist.
 *
 * Register roles:
 *   s0 = output_data (base of the output row pointer array, never modified)
 *   s1 = output_width, s2 = max_v_samp_factor
 *   s3 = outrow (row index), t6 = inrow * 4 (byte offset into input_data)
 *   a3 = address of the output row pointer currently being read
 *   t3 = inptr / copy source, t4 = copy destination, t8 = outptr
 *   t5 = end address, t9 = rows still to duplicate
 *
 * Returns immediately when max_v_samp_factor is zero.
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw      s0, 0(a3)    // s0 = output_data
    lw      s1, 32(sp)   // s1 = cinfo->output_width
    lw      s2, 36(sp)   // s2 = cinfo->max_v_samp_factor
    li      t6, 0        // t6 = inrow * 4
    beqz    s2, 10f
     li     s3, 0        // s3 = outrow
0:
    addu    t0, a2, t6   // t0 = &input_data[inrow]
    sll     t7, s3, 2    // row index -> byte offset (4-byte pointers)
    addu    a3, s0, t7   // a3 = &output_data[outrow]
    lw      t3, 0(t0)    // t3 = inptr
    lw      t8, 0(a3)    // t8 = outptr
    beqz    s1, 9f       // empty row: nothing to expand and nothing to copy
     addu   t5, t8, s1   // t5 = outend
1:
    lb      t2, 0(t3)    // t2 = invalue = *inptr++
    addiu   t3, 1
    beqz    a0, 3f
     move   t0, a0       // t0 = h_expand
2:
    sb      t2, 0(t8)
    addiu   t0, -1
    bgtz    t0, 2b
     addiu  t8, 1
3:
    bgt     t5, t8, 1b
     nop
4:
    addiu   t9, a1, -1   // t9 = v_expand - 1
    blez    t9, 9f
     nop
5:
    // Copy output row k into row k + 1.  a3 walks the pointer array one
    // entry at a time, so every duplicate is taken from a finished row.
    lw      t3, 0(a3)    // t3 = source row
    lw      t4, 4(a3)    // t4 = destination row
    subu    t0, s1, 0xF
    blez    t0, 7f       // fewer than 16 bytes: byte loop only
     addu   t5, t3, s1   // t5 = end address
    andi    t7, s1, 0xF  // t7 = residual
    subu    t8, t5, t7   // t8 = end address of the 16-byte loop
6:
    ulw     t0, 0(t3)
    ulw     t1, 4(t3)
    ulw     t2, 8(t3)
    usw     t0, 0(t4)
    ulw     t0, 12(t3)
    usw     t1, 4(t4)
    usw     t2, 8(t4)
    usw     t0, 12(t4)
    addiu   t3, 16
    bne     t3, t8, 6b
     addiu  t4, 16
    beqz    t7, 8f
     nop
7:
    lbu     t0, 0(t3)
    sb      t0, 0(t4)
    addiu   t3, 1
    bne     t3, t5, 7b
     addiu  t4, 1
8:
    addiu   t9, -1
    bgtz    t9, 5b
     addiu  a3, 4        // next pair of adjacent output rows
9:
    addu    s3, s3, a1   // outrow += v_expand
    bne     s3, s2, 0b
     addiu  t6, 4        // inrow++ (one pointer further)
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j       ra
     nop
END(jsimd_int_upsample_mips_dspr2)
1706
1707/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
/*
 * a0     - cinfo->max_v_samp_factor
 * a1     - cinfo->output_width
 * a2     - input_data
 * a3     - output_data_ptr
 *
 * 2:1 horizontal upsampling by sample replication: every input sample is
 * written twice.  The bulk of each row is produced 16 output bytes (8 input
 * samples) per iteration; the remaining output_width & 0xf bytes are
 * produced two at a time, so an odd residual writes one byte past
 * output_width.
 *
 * Register roles:
 *   t7 = current position in the output row pointer array, t9 = its end
 *   t5 = outptr, t6 = inptr, t3 = end address of the 16-byte loop
 *   t8 = residual byte count, t4 = residual countdown
 *
 * Only temporaries are used, so nothing is saved on the stack.
 * Returns immediately when a0 <= 0.
 */
    lw      t7, 0(a3)       // t7 = output_data
    andi    t8, a1, 0xf     // t8 = residual
    sll     t0, a0, 2
    blez    a0, 4f
     addu   t9, t7, t0      // t9 = output_data end address
0:
    lw      t5, 0(t7)       // t5 = outptr
    lw      t6, 0(a2)       // t6 = inptr
    addu    t3, t5, a1      // t3 = outptr + output_width (end address)
    subu    t3, t8          // t3 = end address - residual
    beq     t5, t3, 2f      // row shorter than 16 bytes: residual loop only
     move   t4, t8
1:
    ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
    ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
    srl     t1, t0, 16      // t1 = |X|X|P3|P2|
    ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
    ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
    ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
    ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
    usw     t0, 0(t5)
    usw     t1, 4(t5)
    srl     t0, t2, 16      // t0 = |X|X|P7|P6|
    ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
    ins     t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
    ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
    ins     t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
    usw     t2, 8(t5)
    usw     t0, 12(t5)
    addiu   t5, 16
    bne     t5, t3, 1b
     addiu  t6, 8
    beqz    t8, 3f
     move   t4, t8
2:
    lbu     t1, 0(t6)
    sb      t1, 0(t5)
    sb      t1, 1(t5)
    addiu   t4, -2          // two output bytes per input sample
    addiu   t6, 1
    bgtz    t4, 2b
     addiu  t5, 2
3:
    addiu   t7, 4           // next output row pointer
    bne     t9, t7, 0b
     addiu  a2, 4           // next input row pointer
4:
    j       ra
     nop
END(jsimd_h2v1_upsample_mips_dspr2)
1765
1766/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
/*
 * a0     - cinfo->max_v_samp_factor
 * a1     - cinfo->output_width
 * a2     - input_data
 * a3     - output_data_ptr
 *
 * 2:1 horizontal and 2:1 vertical upsampling by sample replication.  Each
 * input row is expanded horizontally into one output row (every sample
 * written twice), and that output row is then copied into the next output
 * row.  Two output rows are therefore produced per input row.
 *
 * Register roles:
 *   t7 = current position in the output row pointer array
 *   t9 = output_width & 0xf (residual byte count)
 *   expansion: t6 = inptr, t5 = outptr, t8 = end of the 16-byte loop,
 *              t4 = residual countdown
 *   row copy:  t6 = source row, t5 = destination row, t4 = end address,
 *              t8 = end of the 16-byte loop
 *
 * Only temporaries are used, so nothing is saved on the stack.
 * Returns immediately when a0 <= 0.
 */
    lw      t7, 0(a3)       // t7 = output_data
    blez    a0, 7f
     andi   t9, a1, 0xf     // t9 = residual
0:
    lw      t6, 0(a2)       // t6 = inptr
    lw      t5, 0(t7)       // t5 = outptr
    addu    t8, t5, a1      // t8 = outptr end address
    subu    t8, t9          // t8 = end address - residual
    beq     t5, t8, 2f      // row shorter than 16 bytes: residual loop only
     move   t4, t9
1:
    ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
    srl     t1, t0, 16      // t1 = |X|X|P3|P2|
    ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
    ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
    ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
    ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
    ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
    usw     t0, 0(t5)
    usw     t1, 4(t5)
    srl     t3, t2, 16      // t3 = |X|X|P7|P6|
    ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
    ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
    ins     t3, t3, 16, 16  // t3 = |P7|P6|P7|P6|
    ins     t3, t3, 8, 16   // t3 = |P7|P7|P6|P6|
    usw     t2, 8(t5)
    usw     t3, 12(t5)
    addiu   t5, 16
    bne     t5, t8, 1b
     addiu  t6, 8
    beqz    t9, 3f
     move   t4, t9
2:
    lbu     t0, 0(t6)
    sb      t0, 0(t5)
    sb      t0, 1(t5)
    addiu   t4, -2          // two output bytes per input sample
    addiu   t6, 1
    bgtz    t4, 2b
     addiu  t5, 2
3:
    // Duplicate the row just written into the following output row.
    lw      t6, 0(t7)       // t6 = outptr[0]
    lw      t5, 4(t7)       // t5 = outptr[1]
    addu    t4, t6, a1      // t4 = new end address
    beq     a1, t9, 5f      // output_width < 16: byte loop only
     subu   t8, t4, t9      // t8 = end address of the 16-byte loop
4:
    ulw     t0, 0(t6)
    ulw     t1, 4(t6)
    ulw     t2, 8(t6)
    usw     t0, 0(t5)
    ulw     t0, 12(t6)
    usw     t1, 4(t5)
    usw     t2, 8(t5)
    usw     t0, 12(t5)
    addiu   t6, 16
    bne     t6, t8, 4b
     addiu  t5, 16
    beqz    t9, 6f
     nop
5:
    lbu     t0, 0(t6)
    sb      t0, 0(t5)
    addiu   t6, 1
    bne     t6, t4, 5b
     addiu  t5, 1
6:
    addiu   t7, 8           // two output rows consumed
    addiu   a0, -2
    bgtz    a0, 0b
     addiu  a2, 4           // next input row pointer
7:
    j       ra
     nop
END(jsimd_h2v2_upsample_mips_dspr2)
1849
1850/*****************************************************************************/
1851LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
1852/*
1853 * a0     - coef_block
1854 * a1     - compptr->dcttable
1855 * a2     - output
1856 * a3     - range_limit
1857 */
1858
1859    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1860
1861    addiu     sp, sp, -256
1862    move      v0, sp
1863    addiu     v1, zero, 8      // v1 = DCTSIZE = 8
18641:
1865    lh        s4, 32(a0)       // s4 = inptr[16]
1866    lh        s5, 64(a0)       // s5 = inptr[32]
1867    lh        s6, 96(a0)       // s6 = inptr[48]
1868    lh        t1, 112(a0)      // t1 = inptr[56]
1869    lh        t7, 16(a0)       // t7 = inptr[8]
1870    lh        t5, 80(a0)       // t5 = inptr[40]
1871    lh        t3, 48(a0)       // t3 = inptr[24]
1872    or        s4, s4, t1
1873    or        s4, s4, t3
1874    or        s4, s4, t5
1875    or        s4, s4, t7
1876    or        s4, s4, s5
1877    or        s4, s4, s6
1878    bnez      s4, 2f
1879     addiu    v1, v1, -1
1880    lh        s5, 0(a1)        // quantptr[DCTSIZE*0]
1881    lh        s6, 0(a0)        // inptr[DCTSIZE*0]
1882    mul       s5, s5, s6       // DEQUANTIZE(inptr[0], quantptr[0])
1883    sll       s5, s5, 2
1884    sw        s5, 0(v0)
1885    sw        s5, 32(v0)
1886    sw        s5, 64(v0)
1887    sw        s5, 96(v0)
1888    sw        s5, 128(v0)
1889    sw        s5, 160(v0)
1890    sw        s5, 192(v0)
1891    b         3f
1892     sw       s5, 224(v0)
18932:
1894    lh        t0, 112(a1)
1895    lh        t2, 48(a1)
1896    lh        t4, 80(a1)
1897    lh        t6, 16(a1)
1898    mul       t0, t0, t1       // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
1899    mul       t1, t2, t3       // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
1900    mul       t2, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
1901    mul       t3, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
1902    lh        t4, 32(a1)
1903    lh        t5, 32(a0)
1904    lh        t6, 96(a1)
1905    lh        t7, 96(a0)
1906    addu      s0, t0, t1       // z3 = tmp0 + tmp2
1907    addu      s1, t1, t2       // z2 = tmp1 + tmp2
1908    addu      s2, t2, t3       // z4 = tmp1 + tmp3
1909    addu      s3, s0, s2       // z3 + z4
1910    addiu     t9, zero, 9633   // FIX_1_175875602
1911    mul       s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1912    addu      t8, t0, t3       // z1 = tmp0 + tmp3
1913    addiu     t9, zero, 2446   // FIX_0_298631336
1914    mul       t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1915    addiu     t9, zero, 16819  // FIX_2_053119869
1916    mul       t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1917    addiu     t9, zero, 25172  // FIX_3_072711026
1918    mul       t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1919    addiu     t9, zero, 12299  // FIX_1_501321110
1920    mul       t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
1921    addiu     t9, zero, 16069  // FIX_1_961570560
1922    mul       s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
1923    addiu     t9, zero, 3196   // FIX_0_390180644
1924    mul       s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
1925    addiu     t9, zero, 7373   // FIX_0_899976223
1926    mul       t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
1927    addiu     t9, zero, 20995  // FIX_2_562915447
1928    mul       s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
1929    subu      s0, s3, s0       // z3 += z5
1930    addu      t0, t0, s0       // tmp0 += z3
1931    addu      t1, t1, s0       // tmp2 += z3
1932    subu      s2, s3, s2       // z4 += z5
1933    addu      t2, t2, s2       // tmp1 += z4
1934    addu      t3, t3, s2       // tmp3 += z4
1935    subu      t0, t0, t8       // tmp0 += z1
1936    subu      t1, t1, s1       // tmp2 += z2
1937    subu      t2, t2, s1       // tmp1 += z2
1938    subu      t3, t3, t8       // tmp3 += z1
1939    mul       s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
1940    addiu     t9, zero, 6270   // FIX_0_765366865
1941    mul       s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
1942    lh        t4, 0(a1)
1943    lh        t5, 0(a0)
1944    lh        t6, 64(a1)
1945    lh        t7, 64(a0)
1946    mul       s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
1947    mul       t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
1948    mul       t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
1949    addiu     t9, zero, 4433   // FIX_0_541196100
1950    addu      s3, s0, s1       // z2 + z3
1951    mul       s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1952    addiu     t9, zero, 15137  // FIX_1_847759065
1953    mul       t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
1954    addu      t4, t5, t6
1955    subu      t5, t5, t6
1956    sll       t4, t4, 13       // tmp0 = (z2 + z3) << CONST_BITS
1957    sll       t5, t5, 13       // tmp1 = (z2 - z3) << CONST_BITS
1958    addu      t7, s3, s2       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1959    subu      t6, s3, t8       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
1960    addu      s0, t4, t7
1961    subu      s1, t4, t7
1962    addu      s2, t5, t6
1963    subu      s3, t5, t6
1964    addu      t4, s0, t3
1965    subu      s0, s0, t3
1966    addu      t3, s2, t1
1967    subu      s2, s2, t1
1968    addu      t1, s3, t2
1969    subu      s3, s3, t2
1970    addu      t2, s1, t0
1971    subu      s1, s1, t0
1972    shra_r.w  t4, t4, 11
1973    shra_r.w  t3, t3, 11
1974    shra_r.w  t1, t1, 11
1975    shra_r.w  t2, t2, 11
1976    shra_r.w  s1, s1, 11
1977    shra_r.w  s3, s3, 11
1978    shra_r.w  s2, s2, 11
1979    shra_r.w  s0, s0, 11
1980    sw        t4, 0(v0)
1981    sw        t3, 32(v0)
1982    sw        t1, 64(v0)
1983    sw        t2, 96(v0)
1984    sw        s1, 128(v0)
1985    sw        s3, 160(v0)
1986    sw        s2, 192(v0)
1987    sw        s0, 224(v0)
19883:
1989    addiu     a1, a1, 2
1990    addiu     a0, a0, 2
1991    bgtz      v1, 1b
1992     addiu    v0, v0, 4
1993    move      v0, sp
1994    addiu     v1, zero, 8
19954:
1996    lw        t0, 8(v0)        // z2 = (JLONG) wsptr[2]
1997    lw        t1, 24(v0)       // z3 = (JLONG) wsptr[6]
1998    lw        t2, 0(v0)        // (JLONG) wsptr[0]
1999    lw        t3, 16(v0)       // (JLONG) wsptr[4]
2000    lw        s4, 4(v0)        // (JLONG) wsptr[1]
2001    lw        s5, 12(v0)       // (JLONG) wsptr[3]
2002    lw        s6, 20(v0)       // (JLONG) wsptr[5]
2003    lw        s7, 28(v0)       // (JLONG) wsptr[7]
2004    or        s4, s4, t0
2005    or        s4, s4, t1
2006    or        s4, s4, t3
2007    or        s4, s4, s7
2008    or        s4, s4, s5
2009    or        s4, s4, s6
2010    bnez      s4, 5f
2011     addiu    v1, v1, -1
2012    shra_r.w  s5, t2, 5
2013    andi      s5, s5, 0x3ff
2014    lbux      s5, s5(a3)
2015    lw        s1, 0(a2)
2016    replv.qb  s5, s5
2017    usw       s5, 0(s1)
2018    usw       s5, 4(s1)
2019    b         6f
2020     nop
20215:
2022    addu      t4, t0, t1       // z2 + z3
2023    addiu     t8, zero, 4433   // FIX_0_541196100
2024    mul       t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
2025    addiu     t8, zero, 15137  // FIX_1_847759065
2026    mul       t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
2027    addiu     t8, zero, 6270   // FIX_0_765366865
2028    mul       t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
2029    addu      t4, t2, t3       // (JLONG) wsptr[0] + (JLONG) wsptr[4]
2030    subu      t2, t2, t3       // (JLONG) wsptr[0] - (JLONG) wsptr[4]
2031    sll       t4, t4, 13       // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
2032    sll       t2, t2, 13       // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
2033    subu      t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
2034    subu      t3, t2, t1       // tmp12 = tmp1 - tmp2
2035    addu      t2, t2, t1       // tmp11 = tmp1 + tmp2
2036    addu      t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
2037    subu      t1, t4, t5       // tmp13 = tmp0 - tmp3
2038    addu      t0, t4, t5       // tmp10 = tmp0 + tmp3
2039    lw        t4, 28(v0)       // tmp0 = (JLONG) wsptr[7]
2040    lw        t6, 12(v0)       // tmp2 = (JLONG) wsptr[3]
2041    lw        t5, 20(v0)       // tmp1 = (JLONG) wsptr[5]
2042    lw        t7, 4(v0)        // tmp3 = (JLONG) wsptr[1]
2043    addu      s0, t4, t6       // z3 = tmp0 + tmp2
2044    addiu     t8, zero, 9633   // FIX_1_175875602
2045    addu      s1, t5, t7       // z4 = tmp1 + tmp3
2046    addu      s2, s0, s1       // z3 + z4
2047    mul       s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
2048    addu      s3, t4, t7       // z1 = tmp0 + tmp3
2049    addu      t9, t5, t6       // z2 = tmp1 + tmp2
2050    addiu     t8, zero, 16069  // FIX_1_961570560
2051    mul       s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
2052    addiu     t8, zero, 3196   // FIX_0_390180644
2053    mul       s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
2054    addiu     t8, zero, 2446   // FIX_0_298631336
2055    mul       t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
2056    addiu     t8, zero, 7373   // FIX_0_899976223
2057    mul       s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
2058    addiu     t8, zero, 16819  // FIX_2_053119869
2059    mul       t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
2060    addiu     t8, zero, 20995  // FIX_2_562915447
2061    mul       t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
2062    addiu     t8, zero, 25172  // FIX_3_072711026
2063    mul       t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
2064    addiu     t8, zero, 12299  // FIX_1_501321110
2065    mul       t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
2066    subu      s0, s2, s0       // z3 += z5
2067    subu      s1, s2, s1       // z4 += z5
2068    addu      t4, t4, s0
2069    subu      t4, t4, s3       // tmp0
2070    addu      t5, t5, s1
2071    subu      t5, t5, t9       // tmp1
2072    addu      t6, t6, s0
2073    subu      t6, t6, t9       // tmp2
2074    addu      t7, t7, s1
2075    subu      t7, t7, s3       // tmp3
2076    addu      s0, t0, t7
2077    subu      t0, t0, t7
2078    addu      t7, t2, t6
2079    subu      t2, t2, t6
2080    addu      t6, t3, t5
2081    subu      t3, t3, t5
2082    addu      t5, t1, t4
2083    subu      t1, t1, t4
2084    shra_r.w  s0, s0, 18
2085    shra_r.w  t7, t7, 18
2086    shra_r.w  t6, t6, 18
2087    shra_r.w  t5, t5, 18
2088    shra_r.w  t1, t1, 18
2089    shra_r.w  t3, t3, 18
2090    shra_r.w  t2, t2, 18
2091    shra_r.w  t0, t0, 18
2092    andi      s0, s0, 0x3ff
2093    andi      t7, t7, 0x3ff
2094    andi      t6, t6, 0x3ff
2095    andi      t5, t5, 0x3ff
2096    andi      t1, t1, 0x3ff
2097    andi      t3, t3, 0x3ff
2098    andi      t2, t2, 0x3ff
2099    andi      t0, t0, 0x3ff
2100    lw        s1, 0(a2)
2101    lbux      s0, s0(a3)
2102    lbux      t7, t7(a3)
2103    lbux      t6, t6(a3)
2104    lbux      t5, t5(a3)
2105    lbux      t1, t1(a3)
2106    lbux      t3, t3(a3)
2107    lbux      t2, t2(a3)
2108    lbux      t0, t0(a3)
2109    sb        s0, 0(s1)
2110    sb        t7, 1(s1)
2111    sb        t6, 2(s1)
2112    sb        t5, 3(s1)
2113    sb        t1, 4(s1)
2114    sb        t3, 5(s1)
2115    sb        t2, 6(s1)
2116    sb        t0, 7(s1)
21176:
2118    addiu     v0, v0, 32
2119    bgtz      v1, 4b
2120     addiu    a2, a2, 4
2121    addiu     sp, sp, 256
2122
2123    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2124
2125    j         ra
2126     nop
2127
2128END(jsimd_idct_islow_mips_dspr2)
2129
2130/*****************************************************************************/
2131LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
2132/*
 * Column pass of the fast integer inverse DCT.
 *
 * Each loop iteration loads one 32-bit word (two packed 16-bit values, i.e.
 * two adjacent columns) from each of the 8 rows of inptr, multiplies it by
 * the matching quantptr word, and stores the two transformed columns into
 * wsptr.  Rows are 16 bytes apart in all three buffers.  The pointers advance
 * by 4 bytes per iteration and the loop ends when a0 reaches its entry value
 * + 16, i.e. after four iterations.
 *
 * If every non-DC word of the column pair is zero, the dequantized DC word
 * is simply replicated into all 8 workspace rows.
 *
2133 * a0     - inptr
2134 * a1     - quantptr
2135 * a2     - wsptr
2136 * a3     - mips_idct_ifast_coefs
 *
 * Constant table layout, as read below through AT:
 *   0(AT) FIX(1.082392200),  4(AT) FIX(1.414213562),
 *   8(AT) FIX(1.847759065), 12(AT) FIX(-2.613125930)
 *
 * Scratch registers: v0, v1, t0-t9, AT, s0-s7 (s0-s7 go through the
 * SAVE/RESTORE macros).
2137 */
2138
2139    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2140
2141    addiu          t9, a0, 16            // end address (a0 advances 4 bytes per iteration)
2142    or             AT, a3, zero          // AT = copy of a3 (constant table pointer)
2143
21440:
    // Loop head: load the 8 input words of the current column pair and
    // dequantize row 0 (the DC row) while the loads are in flight.
2145    lw             s0, 0(a1)             // quantptr[DCTSIZE*0]
2146    lw             t0, 0(a0)             // inptr[DCTSIZE*0]
2147    lw             t1, 16(a0)            // inptr[DCTSIZE*1]
2148    muleq_s.w.phl  v0, t0, s0            // tmp0 ...
2149    lw             t2, 32(a0)            // inptr[DCTSIZE*2]
2150    lw             t3, 48(a0)            // inptr[DCTSIZE*3]
2151    lw             t4, 64(a0)            // inptr[DCTSIZE*4]
2152    lw             t5, 80(a0)            // inptr[DCTSIZE*5]
2153    muleq_s.w.phr  t0, t0, s0            // ... tmp0 ...
2154    lw             t6, 96(a0)            // inptr[DCTSIZE*6]
2155    lw             t7, 112(a0)           // inptr[DCTSIZE*7]
2156    or             s4, t1, t2            // s4 = rows 1 | 2
2157    or             s5, t3, t4            // s5 = rows 3 | 4
2158    bnez           s4, 1f                // nonzero AC term -> full IDCT
2159     ins           t0, v0, 16, 16        // ... tmp0
2160    bnez           s5, 1f                // nonzero AC term -> full IDCT
2161     or            s6, t5, t6            // s6 = rows 5 | 6 (delay slot)
2162    or             s6, s6, t7            // s6 |= row 7
2163    bnez           s6, 1f                // nonzero AC term -> full IDCT
2164     sw            t0, 0(a2)             // wsptr[DCTSIZE*0]
    // All AC terms are zero: copy the dequantized DC word to the other rows.
2165    sw             t0, 16(a2)            // wsptr[DCTSIZE*1]
2166    sw             t0, 32(a2)            // wsptr[DCTSIZE*2]
2167    sw             t0, 48(a2)            // wsptr[DCTSIZE*3]
2168    sw             t0, 64(a2)            // wsptr[DCTSIZE*4]
2169    sw             t0, 80(a2)            // wsptr[DCTSIZE*5]
2170    sw             t0, 96(a2)            // wsptr[DCTSIZE*6]
2171    sw             t0, 112(a2)           // wsptr[DCTSIZE*7]
2172    addiu          a0, a0, 4             // next column pair of inptr
2173    b              2f
2174     addiu         a1, a1, 4             // next column pair of quantptr (delay slot)
2175
21761:
    // Full path.  Each row is dequantized with a muleq_s.w.phl/.phr pair
    // (left and right halfword products) whose low halves are re-packed into
    // one word with ins.  Even rows (0,2,4,6) are processed first, then the
    // odd rows (1,3,5,7).
2177    lw             s1, 32(a1)            // quantptr[DCTSIZE*2]
2178    lw             s2, 64(a1)            // quantptr[DCTSIZE*4]
2179    muleq_s.w.phl  v0, t2, s1            // tmp1 ...
2180    muleq_s.w.phr  t2, t2, s1            // ... tmp1 ...
2181    lw             s0, 16(a1)            // quantptr[DCTSIZE*1]
2182    lw             s1, 48(a1)            // quantptr[DCTSIZE*3]
2183    lw             s3, 96(a1)            // quantptr[DCTSIZE*6]
2184    muleq_s.w.phl  v1, t4, s2            // tmp2 ...
2185    muleq_s.w.phr  t4, t4, s2            // ... tmp2 ...
2186    lw             s2, 80(a1)            // quantptr[DCTSIZE*5]
2187    lw             t8, 4(AT)             // FIX(1.414213562)
2188    ins            t2, v0, 16, 16        // ... tmp1
2189    muleq_s.w.phl  v0, t6, s3            // tmp3 ...
2190    muleq_s.w.phr  t6, t6, s3            // ... tmp3 ...
2191    ins            t4, v1, 16, 16        // ... tmp2
2192    addq.ph        s4, t0, t4            // tmp10
2193    subq.ph        s5, t0, t4            // tmp11
2194    ins            t6, v0, 16, 16        // ... tmp3
2195    subq.ph        s6, t2, t6            // tmp12 ...
2196    addq.ph        s7, t2, t6            // tmp13
2197    mulq_s.ph      s6, s6, t8            // ... tmp12 ...
2198    addq.ph        t0, s4, s7            // tmp0
2199    subq.ph        t6, s4, s7            // tmp3
2200    muleq_s.w.phl  v0, t1, s0            // tmp4 ...
2201    muleq_s.w.phr  t1, t1, s0            // ... tmp4 ...
2202    shll_s.ph      s6, s6, 1             // x2
2203    lw             s3, 112(a1)           // quantptr[DCTSIZE*7]
2204    subq.ph        s6, s6, s7            // ... tmp12
2205    muleq_s.w.phl  v1, t7, s3            // tmp7 ...
2206    muleq_s.w.phr  t7, t7, s3            // ... tmp7 ...
2207    ins            t1, v0, 16, 16        // ... tmp4
2208    addq.ph        t2, s5, s6            // tmp1
2209    subq.ph        t4, s5, s6            // tmp2
2210    muleq_s.w.phl  v0, t5, s2            // tmp6 ...
2211    muleq_s.w.phr  t5, t5, s2            // ... tmp6 ...
2212    ins            t7, v1, 16, 16        // ... tmp7
2213    addq.ph        s5, t1, t7            // z11
2214    subq.ph        s6, t1, t7            // z12
2215    muleq_s.w.phl  v1, t3, s1            // tmp5 ...
2216    muleq_s.w.phr  t3, t3, s1            // ... tmp5 ...
2217    ins            t5, v0, 16, 16        // ... tmp6
2218    ins            t3, v1, 16, 16        // ... tmp5
2219    addq.ph        s7, t5, t3            // z13
2220    subq.ph        v0, t5, t3            // z10
2221    addq.ph        t7, s5, s7            // tmp7
2222    subq.ph        s5, s5, s7            // tmp11 ...
2223    addq.ph        v1, v0, s6            // z5 ...
2224    mulq_s.ph      s5, s5, t8            // ... tmp11
2225    lw             t8, 8(AT)             // FIX(1.847759065)
2226    lw             s4, 0(AT)             // FIX(1.082392200)
2227    addq.ph        s0, t0, t7            // tmp0 + tmp7
2228    subq.ph        s1, t0, t7            // tmp0 - tmp7
2229    mulq_s.ph      v1, v1, t8            // ... z5
2230    shll_s.ph      s5, s5, 1             // x2
2231    lw             t8, 12(AT)            // FIX(-2.613125930)
2232    sw             s0, 0(a2)             // wsptr[DCTSIZE*0]
2233    shll_s.ph      v0, v0, 1             // x4
2234    mulq_s.ph      v0, v0, t8            // tmp12 ...
2235    mulq_s.ph      s4, s6, s4            // tmp10 ...
2236    shll_s.ph      v1, v1, 1             // x2
2237    addiu          a0, a0, 4             // next column pair of inptr
2238    addiu          a1, a1, 4             // next column pair of quantptr (no a1 loads follow)
2239    sw             s1, 112(a2)           // wsptr[DCTSIZE*7]
2240    shll_s.ph      s6, v0, 1             // x4
2241    shll_s.ph      s4, s4, 1             // x2
2242    addq.ph        s6, s6, v1            // ... tmp12
2243    subq.ph        t5, s6, t7            // tmp6
2244    subq.ph        s4, s4, v1            // ... tmp10
2245    subq.ph        t3, s5, t5            // tmp5
2246    addq.ph        s2, t2, t5            // tmp1 + tmp6
2247    addq.ph        t1, s4, t3            // tmp4
2248    subq.ph        s3, t2, t5            // tmp1 - tmp6
2249    sw             s2, 16(a2)            // wsptr[DCTSIZE*1]
2250    sw             s3, 96(a2)            // wsptr[DCTSIZE*6]
2251    addq.ph        v0, t4, t3            // tmp2 + tmp5
2252    subq.ph        v1, t4, t3            // tmp2 - tmp5
2253    sw             v0, 32(a2)            // wsptr[DCTSIZE*2]
2254    sw             v1, 80(a2)            // wsptr[DCTSIZE*5]
2255    addq.ph        v0, t6, t1            // tmp3 + tmp4
2256    subq.ph        v1, t6, t1            // tmp3 - tmp4
2257    sw             v0, 64(a2)            // wsptr[DCTSIZE*4]
2258    sw             v1, 48(a2)            // wsptr[DCTSIZE*3]
2259
22602:
    // Common loop tail for both paths.
2261    bne            a0, t9, 0b            // more column pairs to process?
2262     addiu         a2, a2, 4             // next column pair of wsptr (delay slot)
2263
2264    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2265
2266    j              ra
2267     nop
2268
2269END(jsimd_idct_ifast_cols_mips_dspr2)
2270
2271/*****************************************************************************/
2272LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
2273/*
 * Row pass of the fast integer inverse DCT.
 *
 * Each loop iteration reads two consecutive workspace rows (8 halfwords =
 * 16 bytes each), re-packs them so that every register holds the same
 * element of both rows (second row in the upper halfword, first row in the
 * lower one), runs the 1-D transform on both rows at once and writes 8
 * output bytes per row.  The output address of a row is the pointer loaded
 * from output_buf (a1) plus output_col (a2).  a0 advances 32 bytes and a1
 * advances 8 bytes per iteration; the loop ends when a0 reaches its entry
 * value + 128, i.e. after four iterations.
 *
 * In the lane comments a lower-case letter is an element of the first row
 * and an upper-case letter the same element of the second row.
 *
2274 * a0     - wsptr
2275 * a1     - output_buf
2276 * a2     - output_col
2277 * a3     - mips_idct_ifast_coefs
 *
 * a3 is reused as an output pointer inside the loop, so it is listed in the
 * SAVE_REGS_ON_STACK macro and re-read from 36(sp) into AT at the top of
 * every iteration.
 *
 * NOTE(review): output bytes are written with sw, so each output address is
 * presumably 4-byte aligned -- confirm against the callers.
2278 */
2279
2280    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2281
2282    addiu         t9, a0, 128        // end address
2283    lui           s8, 0x8080
2284    ori           s8, s8, 0x8080     // s8 = 0x80808080, added to every output byte
                                     // (presumably the +128 sample level shift -- confirm)
2285
22860:
2287    lw            AT, 36(sp)         // restore $a3 (mips_idct_ifast_coefs)
2288    lw            t0, 0(a0)          // wsptr[DCTSIZE*0+0/1]  b a
2289    lw            s0, 16(a0)         // wsptr[DCTSIZE*1+0/1]  B A
2290    lw            t2, 4(a0)          // wsptr[DCTSIZE*0+2/3]  d c
2291    lw            s2, 20(a0)         // wsptr[DCTSIZE*1+2/3]  D C
2292    lw            t4, 8(a0)          // wsptr[DCTSIZE*0+4/5]  f e
2293    lw            s4, 24(a0)         // wsptr[DCTSIZE*1+4/5]  F E
2294    lw            t6, 12(a0)         // wsptr[DCTSIZE*0+6/7]  h g
2295    lw            s6, 28(a0)         // wsptr[DCTSIZE*1+6/7]  H G
2296    precrq.ph.w   t1, s0, t0         // B b
2297    ins           t0, s0, 16, 16     // A a
2298    bnez          t1, 1f             // element 1 of either row nonzero -> full IDCT
2299     or           s0, t2, s2         // elements 2/3 of both rows (delay slot)
2300    bnez          s0, 1f
2301     or           s0, t4, s4         // elements 4/5 of both rows (delay slot)
2302    bnez          s0, 1f
2303     or           s0, t6, s6         // elements 6/7 of both rows (delay slot)
2304    bnez          s0, 1f
2305     shll_s.ph    s0, t0, 2          // A a
    // Elements 1..7 of both rows are zero: every output byte of a row is
    // derived from that row's element 0 alone.
2306    lw            a3, 0(a1)          // output_buf[0]: pointer for the first row
2307    lw            AT, 4(a1)          // output_buf[1]: pointer for the second row
2308    precrq.ph.w   t0, s0, s0         // A A
2309    ins           s0, s0, 16, 16     // a a
2310    addu          a3, a3, a2         // + output_col
2311    addu          AT, AT, a2         // + output_col
2312    precrq.qb.ph  t0, t0, t0         // A A A A
2313    precrq.qb.ph  s0, s0, s0         // a a a a
2314    addu.qb       s0, s0, s8         // add 0x80 to each byte
2315    addu.qb       t0, t0, s8         // add 0x80 to each byte
2316    sw            s0, 0(a3)          // first row, bytes 0-3
2317    sw            s0, 4(a3)          // first row, bytes 4-7
2318    sw            t0, 0(AT)          // second row, bytes 0-3
2319    sw            t0, 4(AT)          // second row, bytes 4-7
2320    addiu         a0, a0, 32         // next two workspace rows
2321    bne           a0, t9, 0b
2322     addiu        a1, a1, 8          // next two output row pointers (delay slot)
2323    b             2f                 // all rows done
2324     nop
2325
23261:
    // Full path: finish re-packing (t0..t7 = elements 0..7, two rows each),
    // then run the same butterfly sequence as the column pass.
2327    precrq.ph.w   t3, s2, t2         // D d
2328    ins           t2, s2, 16, 16     // C c
2329    precrq.ph.w   t5, s4, t4         // F f
2330    ins           t4, s4, 16, 16     // E e
2331    precrq.ph.w   t7, s6, t6         // H h
2332    ins           t6, s6, 16, 16     // G g
2333    lw            t8, 4(AT)          // FIX(1.414213562)
2334    addq.ph       s4, t0, t4         // tmp10
2335    subq.ph       s5, t0, t4         // tmp11
2336    subq.ph       s6, t2, t6         // tmp12 ...
2337    addq.ph       s7, t2, t6         // tmp13
2338    mulq_s.ph     s6, s6, t8         // ... tmp12 ...
2339    addq.ph       t0, s4, s7         // tmp0
2340    subq.ph       t6, s4, s7         // tmp3
2341    shll_s.ph     s6, s6, 1          // x2
2342    subq.ph       s6, s6, s7         // ... tmp12
2343    addq.ph       t2, s5, s6         // tmp1
2344    subq.ph       t4, s5, s6         // tmp2
2345    addq.ph       s5, t1, t7         // z11
2346    subq.ph       s6, t1, t7         // z12
2347    addq.ph       s7, t5, t3         // z13
2348    subq.ph       v0, t5, t3         // z10
2349    addq.ph       t7, s5, s7         // tmp7
2350    subq.ph       s5, s5, s7         // tmp11 ...
2351    addq.ph       v1, v0, s6         // z5 ...
2352    mulq_s.ph     s5, s5, t8         // ... tmp11
2353    lw            t8, 8(AT)          // FIX(1.847759065)
2354    lw            s4, 0(AT)          // FIX(1.082392200)
2355    addq.ph       s0, t0, t7         // tmp0 + tmp7
2356    subq.ph       s7, t0, t7         // tmp0 - tmp7
2357    mulq_s.ph     v1, v1, t8         // ... z5
2358    lw            a3, 0(a1)          // output_buf[0]: pointer for the first row
2359    lw            t8, 12(AT)         // FIX(-2.613125930)
2360    shll_s.ph     s5, s5, 1          // x2
2361    addu          a3, a3, a2         // + output_col
2362    shll_s.ph     v0, v0, 1          // x4
2363    mulq_s.ph     v0, v0, t8         // tmp12 ...
2364    mulq_s.ph     s4, s6, s4         // tmp10 ...
2365    shll_s.ph     v1, v1, 1          // x2
2366    addiu         a0, a0, 32         // next two workspace rows
2367    addiu         a1, a1, 8          // next two output row pointers
2368    shll_s.ph     s6, v0, 1          // x4
2369    shll_s.ph     s4, s4, 1          // x2
2370    addq.ph       s6, s6, v1         // ... tmp12
2371    shll_s.ph     s0, s0, 2          // element 0: (tmp0 + tmp7) << 2
2372    subq.ph       t5, s6, t7         // tmp6
2373    subq.ph       s4, s4, v1         // ... tmp10
2374    subq.ph       t3, s5, t5         // tmp5
2375    shll_s.ph     s7, s7, 2          // element 7: (tmp0 - tmp7) << 2
2376    addq.ph       t1, s4, t3         // tmp4
2377    addq.ph       s1, t2, t5         // tmp1 + tmp6
2378    subq.ph       s6, t2, t5         // tmp1 - tmp6
2379    addq.ph       s2, t4, t3         // tmp2 + tmp5
2380    subq.ph       s5, t4, t3         // tmp2 - tmp5
2381    addq.ph       s4, t6, t1         // tmp3 + tmp4
2382    subq.ph       s3, t6, t1         // tmp3 - tmp4
2383    shll_s.ph     s1, s1, 2          // element 1 << 2
2384    shll_s.ph     s2, s2, 2          // element 2 << 2
2385    shll_s.ph     s3, s3, 2          // element 3 << 2
2386    shll_s.ph     s4, s4, 2          // element 4 << 2
2387    shll_s.ph     s5, s5, 2          // element 5 << 2
2388    shll_s.ph     s6, s6, 2          // element 6 << 2
    // Split the packed results back into one register set per row, then keep
    // the upper byte of every halfword to form the 8 output bytes of a row.
2389    precrq.ph.w   t0, s1, s0         // B A
2390    ins           s0, s1, 16, 16     // b a
2391    precrq.ph.w   t2, s3, s2         // D C
2392    ins           s2, s3, 16, 16     // d c
2393    precrq.ph.w   t4, s5, s4         // F E
2394    ins           s4, s5, 16, 16     // f e
2395    precrq.ph.w   t6, s7, s6         // H G
2396    ins           s6, s7, 16, 16     // h g
2397    precrq.qb.ph  t0, t2, t0         // D C B A
2398    precrq.qb.ph  s0, s2, s0         // d c b a
2399    precrq.qb.ph  t4, t6, t4         // H G F E
2400    precrq.qb.ph  s4, s6, s4         // h g f e
2401    addu.qb       s0, s0, s8         // add 0x80 to each byte
2402    addu.qb       s4, s4, s8         // add 0x80 to each byte
2403    sw            s0, 0(a3)          // outptr[0/1/2/3]       d c b a
2404    sw            s4, 4(a3)          // outptr[4/5/6/7]       h g f e
2405    lw            a3, -4(a1)         // pointer for the second row (a1 was already advanced by 8)
2406    addu.qb       t0, t0, s8         // add 0x80 to each byte
2407    addu          a3, a3, a2         // + output_col
2408    addu.qb       t4, t4, s8         // add 0x80 to each byte
2409    sw            t0, 0(a3)          // outptr[0/1/2/3]       D C B A
2410    bne           a0, t9, 0b
2411     sw           t4, 4(a3)          // outptr[4/5/6/7]       H G F E
2412
24132:
2414
2415    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2416
2417    j             ra
2418     nop
2419
2420END(jsimd_idct_ifast_rows_mips_dspr2)
2421
2422/*****************************************************************************/
2423LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
2424/*
 * In-place forward DCT on an 8x8 block of 16-bit values.
 *
 * Pass 1 (label 1) walks the 8 rows (16 bytes each) through a1.  The odd
 * outputs and outputs 2 and 6 are computed as dot products in $ac0-$ac3
 * against packed halfword constant pairs held in t0-t9, and extracted with
 * a rounding right shift by 11.  Outputs 0 and 4 are the sum/difference of
 * t10 and t11 shifted left by 2.
 *
 * Pass 2 (label 2) walks the 8 columns (2 bytes apart, rows 16 bytes apart)
 * through a0, using the same constants as scalars with mult/madd/msub.
 * Accumulator results are extracted with a rounding right shift by 15;
 * outputs 0 and 4 use a rounding right shift by 2.
 *
2425 * a0     - data
 *
 * Scratch registers: v0, v1, a1-a3, t0-t9, s0-s8 and $ac0-$ac3 (s0-s8 go
 * through the SAVE/RESTORE macros).
2426 */
2427
2428    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2429
    // Packed (upper | lower halfword) constant pairs for the dpa.w.ph dot
    // products of pass 1.
2430    lui       t0, 6437
2431    ori       t0, 2260      // t0 =   6437 |   2260
2432    lui       t1, 9633
2433    ori       t1, 11363     // t1 =   9633 |  11363
2434    lui       t2, 0xd39e
2435    ori       t2, 0xe6dc    // t2 = -11362 |  -6436
2436    lui       t3, 0xf72d
2437    ori       t3, 9633      // t3 =  -2259 |   9633
2438    lui       t4, 2261
2439    ori       t4, 9633      // t4 =   2261 |   9633
2440    lui       t5, 0xd39e
2441    ori       t5, 6437      // t5 = -11362 |   6437
2442    lui       t6, 9633
2443    ori       t6, 0xd39d    // t6 =   9633 | -11363
2444    lui       t7, 0xe6dc
2445    ori       t7, 2260      // t7 =  -6436 |   2260
2446    lui       t8, 4433
2447    ori       t8, 10703     // t8 =   4433 |  10703
2448    lui       t9, 0xd630
2449    ori       t9, 4433      // t9 = -10704 |   4433
2450    li        s8, 8         // row counter
2451    move      a1, a0        // a1 = row pointer; a0 is kept for pass 2
24521:
    // Pass 1: one row per iteration.
2453    lw        s0, 0(a1)     // tmp0 = 1|0
2454    lw        s1, 4(a1)     // tmp1 = 3|2
2455    lw        s2, 8(a1)     // tmp2 = 5|4
2456    lw        s3, 12(a1)    // tmp3 = 7|6
2457    packrl.ph s1, s1, s1    // tmp1 = 2|3
2458    packrl.ph s3, s3, s3    // tmp3 = 6|7
2459    subq.ph   s7, s1, s2    // tmp7 = 2-5|3-4 = t5|t4
2460    subq.ph   s5, s0, s3    // tmp5 = 1-6|0-7 = t6|t7
2461    mult      $0, $0        // ac0  = 0
2462    dpa.w.ph  $ac0, s7, t0  // ac0 += t5*  6437 + t4*  2260
2463    dpa.w.ph  $ac0, s5, t1  // ac0 += t6*  9633 + t7* 11363
2464    mult      $ac1, $0, $0  // ac1  = 0
2465    dpa.w.ph  $ac1, s7, t2  // ac1 += t5*-11362 + t4* -6436
2466    dpa.w.ph  $ac1, s5, t3  // ac1 += t6* -2259 + t7*  9633
2467    mult      $ac2, $0, $0  // ac2  = 0
2468    dpa.w.ph  $ac2, s7, t4  // ac2 += t5*  2261 + t4*  9633
2469    dpa.w.ph  $ac2, s5, t5  // ac2 += t6*-11362 + t7*  6437
2470    mult      $ac3, $0, $0  // ac3  = 0
2471    dpa.w.ph  $ac3, s7, t6  // ac3 += t5*  9633 + t4*-11363
2472    dpa.w.ph  $ac3, s5, t7  // ac3 += t6* -6436 + t7*  2260
2473    addq.ph   s6, s1, s2    // tmp6 = 2+5|3+4 = t2|t3
2474    addq.ph   s4, s0, s3    // tmp4 = 1+6|0+7 = t1|t0
2475    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
2476    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
2477    extr_r.w  s2, $ac2, 11  // tmp2 = (ac2 + 1024) >> 11
2478    extr_r.w  s3, $ac3, 11  // tmp3 = (ac3 + 1024) >> 11
2479    addq.ph   s5, s4, s6    // tmp5 = t1+t2|t0+t3 = t11|t10
2480    subq.ph   s7, s4, s6    // tmp7 = t1-t2|t0-t3 = t12|t13
2481    sh        s0, 2(a1)     // row element 1
2482    sh        s1, 6(a1)     // row element 3
2483    sh        s2, 10(a1)    // row element 5
2484    sh        s3, 14(a1)    // row element 7
2485    mult      $0, $0        // ac0  = 0
2486    dpa.w.ph  $ac0, s7, t8  // ac0 += t12*  4433 + t13* 10703
2487    mult      $ac1, $0, $0  // ac1  = 0
2488    dpa.w.ph  $ac1, s7, t9  // ac1 += t12*-10704 + t13*  4433
2489    sra       s4, s5, 16    // tmp4 = t11
2490    addiu     a1, a1, 16    // next row; the stores below use negative offsets
2491    addiu     s8, s8, -1    // one row done
2492    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
2493    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
2494    addu      s2, s5, s4    // tmp2 = t10 + t11
2495    subu      s3, s5, s4    // tmp3 = t10 - t11
2496    sll       s2, s2, 2     // tmp2 = (t10 + t11) << 2
2497    sll       s3, s3, 2     // tmp3 = (t10 - t11) << 2
2498    sh        s2, -16(a1)   // row element 0 (sh stores the low halfword only)
2499    sh        s3, -8(a1)    // row element 4
2500    sh        s0, -12(a1)   // row element 2
2501    bgtz      s8, 1b
2502     sh       s1, -4(a1)    // row element 6 (delay slot)
    // Scalar constants c0..c10 for pass 2 (c10 lives in a1, which is no
    // longer needed as a pointer).
2503    li        t0, 2260      // c0
2504    li        t1, 11363     // c1
2505    li        t2, 9633      // c2
2506    li        t3, 6436      // c3
2507    li        t4, 6437      // c4
2508    li        t5, 2261      // c5
2509    li        t6, 11362     // c6
2510    li        t7, 2259      // c7
2511    li        t8, 4433      // c8
2512    li        t9, 10703     // c9
2513    li        a1, 10704     // c10
2514    li        s8, 8         // column counter
2515
25162:
    // Pass 2: one column per iteration; the comment on each load is the
    // element index within the block.
2517    lh        a2, 0(a0)     // 0
2518    lh        a3, 16(a0)    // 8
2519    lh        v0, 32(a0)    // 16
2520    lh        v1, 48(a0)    // 24
2521    lh        s4, 64(a0)    // 32
2522    lh        s5, 80(a0)    // 40
2523    lh        s6, 96(a0)    // 48
2524    lh        s7, 112(a0)   // 56
2525    addu      s2, v0, s5    // tmp2 = 16 + 40
2526    subu      s5, v0, s5    // tmp5 = 16 - 40
2527    addu      s3, v1, s4    // tmp3 = 24 + 32
2528    subu      s4, v1, s4    // tmp4 = 24 - 32
2529    addu      s0, a2, s7    // tmp0 =  0 + 56
2530    subu      s7, a2, s7    // tmp7 =  0 - 56
2531    addu      s1, a3, s6    // tmp1 =  8 + 48
2532    subu      s6, a3, s6    // tmp6 =  8 - 48
2533    addu      a2, s0, s3    // tmp10 = tmp0 + tmp3
2534    subu      v1, s0, s3    // tmp13 = tmp0 - tmp3
2535    addu      a3, s1, s2    // tmp11 = tmp1 + tmp2
2536    subu      v0, s1, s2    // tmp12 = tmp1 - tmp2
2537    mult      s7, t1        // ac0  = tmp7 * c1
2538    madd      s4, t0        // ac0 += tmp4 * c0
2539    madd      s5, t4        // ac0 += tmp5 * c4
2540    madd      s6, t2        // ac0 += tmp6 * c2
2541    mult      $ac1, s7, t2  // ac1  = tmp7 * c2
2542    msub      $ac1, s4, t3  // ac1 -= tmp4 * c3
2543    msub      $ac1, s5, t6  // ac1 -= tmp5 * c6
2544    msub      $ac1, s6, t7  // ac1 -= tmp6 * c7
2545    mult      $ac2, s7, t4  // ac2  = tmp7 * c4
2546    madd      $ac2, s4, t2  // ac2 += tmp4 * c2
2547    madd      $ac2, s5, t5  // ac2 += tmp5 * c5
2548    msub      $ac2, s6, t6  // ac2 -= tmp6 * c6
2549    mult      $ac3, s7, t0  // ac3  = tmp7 * c0
2550    msub      $ac3, s4, t1  // ac3 -= tmp4 * c1
2551    madd      $ac3, s5, t2  // ac3 += tmp5 * c2
2552    msub      $ac3, s6, t3  // ac3 -= tmp6 * c3
2553    extr_r.w  s0, $ac0, 15  // tmp0 = (ac0 + 16384) >> 15
2554    extr_r.w  s1, $ac1, 15  // tmp1 = (ac1 + 16384) >> 15
2555    extr_r.w  s2, $ac2, 15  // tmp2 = (ac2 + 16384) >> 15
2556    extr_r.w  s3, $ac3, 15  // tmp3 = (ac3 + 16384) >> 15
2557    addiu     s8, s8, -1    // one column done
2558    addu      s4, a2, a3    // tmp4 = tmp10 + tmp11
2559    subu      s5, a2, a3    // tmp5 = tmp10 - tmp11
2560    sh        s0, 16(a0)    // column element in row 1
2561    sh        s1, 48(a0)    // column element in row 3
2562    sh        s2, 80(a0)    // column element in row 5
2563    sh        s3, 112(a0)   // column element in row 7
2564    mult      v0, t8        // ac0  = tmp12 * c8
2565    madd      v1, t9        // ac0 += tmp13 * c9
2566    mult      $ac1, v1, t8  // ac1  = tmp13 * c8
2567    msub      $ac1, v0, a1  // ac1 -= tmp12 * c10
2568    addiu     a0, a0, 2     // next column; the stores below are offset by -2
2569    extr_r.w  s6, $ac0, 15  // tmp6 = (ac0 + 16384) >> 15
2570    extr_r.w  s7, $ac1, 15  // tmp7 = (ac1 + 16384) >> 15
2571    shra_r.w  s4, s4, 2     // tmp4 = (tmp4 + 2) >> 2
2572    shra_r.w  s5, s5, 2     // tmp5 = (tmp5 + 2) >> 2
2573    sh        s4, -2(a0)    // column element in row 0
2574    sh        s5, 62(a0)    // column element in row 4
2575    sh        s6, 30(a0)    // column element in row 2
2576    bgtz      s8, 2b
2577     sh       s7, 94(a0)    // column element in row 6 (delay slot)
2578
2579    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2580
2581    jr       ra
2582     nop
2583
2584END(jsimd_fdct_islow_mips_dspr2)
2585
2586/*****************************************************************************/
2587LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
2588/*
2589 * a0     - data
2590 */
2591    .set at
2592    SAVE_REGS_ON_STACK 8, s0, s1
2593    li           a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
2594    li           a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
2595    li           a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
2596    li           s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
2597
2598    move         v0, a0
2599    addiu        v1, v0, 128     // end address
2600
26010:
2602    lw           t0, 0(v0)       // tmp0 = 1|0
2603    lw           t1, 4(v0)       // tmp1 = 3|2
2604    lw           t2, 8(v0)       // tmp2 = 5|4
2605    lw           t3, 12(v0)      // tmp3 = 7|6
2606    packrl.ph    t1, t1, t1      // tmp1 = 2|3
2607    packrl.ph    t3, t3, t3      // tmp3 = 6|7
2608    subq.ph      t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
2609    subq.ph      t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
2610    addq.ph      t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
2611    addq.ph      t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
2612    addq.ph      t8, t4, t6      // tmp5 = t1+t2|t0+t3 = t11|t10
2613    subq.ph      t9, t4, t6      // tmp7 = t1-t2|t0-t3 = t12|t13
2614    sra          t4, t8, 16      // tmp4 = t11
2615    mult         $0, $0          // ac0  = 0
2616    dpa.w.ph     $ac0, t9, s1
2617    mult         $ac1, $0, $0    // ac1  = 0
2618    dpa.w.ph     $ac1, t7, a3    // ac1 += t4*98 + t5*98
2619    dpsx.w.ph    $ac1, t5, a3    // ac1 += t6*98 + t7*98
2620    mult         $ac2, $0, $0    // ac2  = 0
2621    dpa.w.ph     $ac2, t7, a2    // ac2 += t4*139 + t5*139
2622    mult         $ac3, $0, $0    // ac3  = 0
2623    dpa.w.ph     $ac3, t5, a1    // ac3 += t6*334 + t7*334
2624    precrq.ph.w  t0, t5, t7      // t0 = t5|t6
2625    addq.ph      t2, t8, t4      // tmp2 = t10 + t11
2626    subq.ph      t3, t8, t4      // tmp3 = t10 - t11
2627    extr.w       t4, $ac0, 8
2628    mult         $0, $0          // ac0  = 0
2629    dpa.w.ph     $ac0, t0, s1    // ac0 += t5*181 + t6*181
2630    extr.w       t0, $ac1, 8     // t0 = z5
2631    extr.w       t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
2632    extr.w       t7, $ac3, 8     // t2 = MULTIPLY(tmp12, 334)
2633    extr.w       t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
2634    add          t6, t1, t0      // t6 = z2
2635    add          t7, t7, t0      // t7 = z4
2636    subq.ph      t0, t5, t8      // t0 = z13 = tmp7 - z3
2637    addq.ph      t8, t5, t8      // t9 = z11 = tmp7 + z3
2638    addq.ph      t1, t0, t6      // t1 = z13 + z2
2639    subq.ph      t6, t0, t6      // t6 = z13 - z2
2640    addq.ph      t0, t8, t7      // t0 = z11 + z4
2641    subq.ph      t7, t8, t7      // t7 = z11 - z4
2642    addq.ph      t5, t4, t9
2643    subq.ph      t4, t9, t4
2644    sh           t2, 0(v0)
2645    sh           t5, 4(v0)
2646    sh           t3, 8(v0)
2647    sh           t4, 12(v0)
2648    sh           t1, 10(v0)
2649    sh           t6, 6(v0)
2650    sh           t0, 2(v0)
2651    sh           t7, 14(v0)
2652    addiu        v0, 16
2653    bne          v1, v0, 0b
2654     nop
2655    move         v0, a0
2656    addiu        v1, v0, 16
2657
26581:
2659    lh           t0, 0(v0)       // 0
2660    lh           t1, 16(v0)      // 8
2661    lh           t2, 32(v0)      // 16
2662    lh           t3, 48(v0)      // 24
2663    lh           t4, 64(v0)      // 32
2664    lh           t5, 80(v0)      // 40
2665    lh           t6, 96(v0)      // 48
2666    lh           t7, 112(v0)     // 56
2667    add          t8, t0, t7      // t8 = tmp0
2668    sub          t7, t0, t7      // t7 = tmp7
2669    add          t0, t1, t6      // t0 = tmp1
2670    sub          t1, t1, t6      // t1 = tmp6
2671    add          t6, t2, t5      // t6 = tmp2
2672    sub          t5, t2, t5      // t5 = tmp5
2673    add          t2, t3, t4      // t2 = tmp3
2674    sub          t3, t3, t4      // t3 = tmp4
2675    add          t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
2676    sub          t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
2677    sub          s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
2678    ins          t8, s0, 16, 16  // t8 = tmp12|tmp13
2679    add          t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
2680    mult         $0, $0          // ac0  = 0
2681    dpa.w.ph     $ac0, t8, s1    // ac0 += t12*181 + t13*181
2682    add          s0, t4, t2      // t8 = tmp10+tmp11
2683    sub          t4, t4, t2      // t4 = tmp10-tmp11
2684    sh           s0, 0(v0)
2685    sh           t4, 64(v0)
2686    extr.w       t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
2687    addq.ph      t4, t8, t2      // t9 = tmp13 + z1
2688    subq.ph      t8, t8, t2      // t2 = tmp13 - z1
2689    sh           t4, 32(v0)
2690    sh           t8, 96(v0)
2691    add          t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
2692    add          t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
2693    add          t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
2694    andi         t4, a1, 0xffff
2695    mul          s0, t1, t4
2696    sra          s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
2697    ins          t1, t3, 16, 16  // t1 = tmp10|tmp12
2698    mult         $0, $0          // ac0  = 0
2699    mulsa.w.ph   $ac0, t1, a3    // ac0 += t10*98 - t12*98
2700    extr.w       t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
2701    add          t2, t7, t8      // t2 = tmp7 + z5
2702    sub          t7, t7, t8      // t7 = tmp7 - z5
2703    andi         t4, a2, 0xffff
2704    mul          t8, t3, t4
2705    sra          t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
2706    andi         t4, s1, 0xffff
2707    mul          t6, t0, t4
2708    sra          t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
2709    add          t0, t6, t8      // t0 = z3 + z2
2710    sub          t1, t6, t8      // t1 = z3 - z2
2711    add          t3, t6, s0      // t3 = z3 + z4
2712    sub          t4, t6, s0      // t4 = z3 - z4
2713    sub          t5, t2, t1      // t5 = dataptr[5]
2714    sub          t6, t7, t0      // t6 = dataptr[3]
2715    add          t3, t2, t3      // t3 = dataptr[1]
2716    add          t4, t7, t4      // t4 = dataptr[7]
2717    sh           t5, 80(v0)
2718    sh           t6, 48(v0)
2719    sh           t3, 16(v0)
2720    sh           t4, 112(v0)
2721    addiu        v0, 2
2722    bne          v0, v1, 1b
2723     nop
2724
2725    RESTORE_REGS_FROM_STACK 8, s0, s1
2726
2727    j            ra
2728     nop
2729END(jsimd_fdct_ifast_mips_dspr2)
2730
2731/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
/*
 * a0     - coef_block
 * a1     - divisors
 * a2     - workspace
 *
 * Quantizes the 64 halfwords of workspace into coef_block.  Every element x
 * is split into a sign multiplier (+1 or -1) and x * sign, and the result is
 *
 *   out = sign * ((((x * sign + C) & 0xffff) * (R & 0xffff)) >>a (S + 16))
 *
 * where R, C and S are the halfwords found 0, 128 and 384 bytes from the
 * element's slot in divisors and ">>a" is the arithmetic shift done by srav.
 * (R/C/S are presumably the reciprocal, correction and shift tables of the
 * C quantizer -- confirm against the caller.)
 *
 * Two elements are processed per pass.  The operands of the next pair are
 * loaded at the bottom of the loop, so the loop runs for the first 31 pairs
 * and the last pair is finished by the straight-line code after it.
 */

    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2

    addiu   v0, a2, 124  // v0 = address of the last pair in workspace
    lh      t0, 0(a2)    // t0 = x0
    lh      t1, 0(a1)    // t1 = R0
    lh      t2, 128(a1)  // t2 = C0
    sra     t3, t0, 15   // t3 = 0 (x0 >= 0) or -1 (x0 < 0)
    sll     t3, t3, 1
    addiu   t3, t3, 1    // t3 = sign0 = +1 or -1
    mul     t0, t0, t3   // t0 = x0 * sign0
    lh      t4, 384(a1)  // t4 = S0
    lh      t5, 130(a1)  // t5 = C1
    lh      t6, 2(a2)    // t6 = x1
    lh      t7, 2(a1)    // t7 = R1
    lh      t8, 386(a1)  // t8 = S1

1:
    // first element of the pair
    andi    t1, 0xffff   // R0 as unsigned 16-bit
    add     t9, t0, t2   // t9 = x0 * sign0 + C0
    andi    t9, 0xffff
    mul     v1, t9, t1
    sra     s0, t6, 15
    sll     s0, s0, 1
    addiu   s0, s0, 1    // s0 = sign1 = +1 or -1
    addiu   t9, t4, 16   // shift amount = S0 + 16
    srav    v1, v1, t9
    mul     v1, v1, t3   // put the sign of x0 back
    mul     t6, t6, s0   // t6 = x1 * sign1
    andi    t7, 0xffff   // R1 as unsigned 16-bit
    addiu   a2, a2, 4    // advance workspace and divisors to the next pair
    addiu   a1, a1, 4
    add     s1, t6, t5   // s1 = x1 * sign1 + C1
    andi    s1, 0xffff
    sh      v1, 0(a0)

    // second element of the pair, interleaved with the loads for the next one
    mul     s2, s1, t7
    addiu   s1, t8, 16   // shift amount = S1 + 16
    srav    s2, s2, s1
    mul     s2, s2, s0   // put the sign of x1 back
    lh      t0, 0(a2)    // next x0
    lh      t1, 0(a1)    // next R0
    sra     t3, t0, 15
    sll     t3, t3, 1
    addiu   t3, t3, 1    // next sign0
    mul     t0, t0, t3   // next x0 * sign0
    lh      t2, 128(a1)  // next C0
    lh      t4, 384(a1)  // next S0
    lh      t5, 130(a1)  // next C1
    lh      t8, 386(a1)  // next S1
    lh      t6, 2(a2)    // next x1
    lh      t7, 2(a1)    // next R1
    sh      s2, 2(a0)
    bne     a2, v0, 1b   // leave once the operands of the last pair are loaded
     addiu  a0, a0, 4    // (delay slot) advance coef_block to the next pair

    // last pair: same arithmetic as the loop body, nothing left to preload
    andi    t1, 0xffff
    add     t9, t0, t2
    andi    t9, 0xffff
    mul     v1, t9, t1
    sra     s0, t6, 15
    sll     s0, s0, 1
    addiu   s0, s0, 1
    addiu   t9, t4, 16
    srav    v1, v1, t9
    mul     v1, v1, t3
    mul     t6, t6, s0
    andi    t7, 0xffff
    sh      v1, 0(a0)
    add     s1, t6, t5
    andi    s1, 0xffff
    mul     s2, s1, t7
    addiu   s1, t8, 16
    addiu   a2, a2, 4
    addiu   a1, a1, 4
    srav    s2, s2, s1
    mul     s2, s2, s0
    sh      s2, 2(a0)

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2

    j       ra
     nop

END(jsimd_quantize_mips_dspr2)
2830
2831/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
/*
 * a0     - coef_block
 * a1     - divisors
 * a2     - workspace
 *
 * Quantizes 64 single-precision values.  For every index i:
 *
 *   coef_block[i] = (halfword)(trunc(workspace[i] * divisors[i] + 16384.5)
 *                              - 16384)
 *
 * Adding 16384.5 before the truncation and removing 16384 afterwards keeps
 * the truncated operand positive for products above -16384.5, so the
 * truncation behaves as rounding to the nearest integer in that range.
 *
 * Eight elements are handled per pass, in two groups of four that are
 * interleaved: the second group's loads and multiply-adds are issued while
 * the first group's results are converted and stored.
 */

    .set at

    li         t1, 0x46800100     // bit pattern of the float 16384.5
    mtc1       t1, f0             // f0 = 16384.5
    li         t0, 63             // counter: 63, 55, ..., 7; -1 ends the loop
0:
    lwc1       f2, 0(a2)          // workspace[0..3] -> f2, f4, f6, f8
    lwc1       f10, 0(a1)         // divisors[0..3]  -> f10, f12, f14, f16
    lwc1       f4, 4(a2)
    lwc1       f12, 4(a1)
    lwc1       f6, 8(a2)
    lwc1       f14, 8(a1)
    lwc1       f8, 12(a2)
    lwc1       f16, 12(a1)
    madd.s     f2, f0, f2, f10    // f2 = f2 * f10 + 16384.5
    madd.s     f4, f0, f4, f12
    madd.s     f6, f0, f6, f14
    madd.s     f8, f0, f8, f16
    lwc1       f10, 16(a1)        // divisors[4..5] for the second group
    lwc1       f12, 20(a1)
    trunc.w.s  f2, f2             // first group -> 32-bit integers
    trunc.w.s  f4, f4
    trunc.w.s  f6, f6
    trunc.w.s  f8, f8
    lwc1       f14, 24(a1)        // divisors[6..7]
    lwc1       f16, 28(a1)
    mfc1       t1, f2
    mfc1       t2, f4
    mfc1       t3, f6
    mfc1       t4, f8
    lwc1       f2, 16(a2)         // workspace[4..7]
    lwc1       f4, 20(a2)
    lwc1       f6, 24(a2)
    lwc1       f8, 28(a2)
    madd.s     f2, f0, f2, f10
    madd.s     f4, f0, f4, f12
    madd.s     f6, f0, f6, f14
    madd.s     f8, f0, f8, f16
    addiu      t1, t1, -16384     // remove the integer part of the bias
    addiu      t2, t2, -16384
    addiu      t3, t3, -16384
    addiu      t4, t4, -16384
    trunc.w.s  f2, f2             // second group -> 32-bit integers
    trunc.w.s  f4, f4
    trunc.w.s  f6, f6
    trunc.w.s  f8, f8
    sh         t1, 0(a0)          // coef_block[0..3]
    sh         t2, 2(a0)
    sh         t3, 4(a0)
    sh         t4, 6(a0)
    mfc1       t1, f2
    mfc1       t2, f4
    mfc1       t3, f6
    mfc1       t4, f8
    addiu      t0, t0, -8
    addiu      a2, a2, 32         // eight floats consumed from each input
    addiu      a1, a1, 32
    addiu      t1, t1, -16384
    addiu      t2, t2, -16384
    addiu      t3, t3, -16384
    addiu      t4, t4, -16384
    sh         t1, 8(a0)          // coef_block[4..7]
    sh         t2, 10(a0)
    sh         t3, 12(a0)
    sh         t4, 14(a0)
    bgez       t0, 0b
     addiu     a0, a0, 16         // (delay slot) eight halfwords written

    j          ra
     nop

END(jsimd_quantize_float_mips_dspr2)
2911/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
/*
 * a0     - compptr->dct_table
 * a1     - coef_block
 * a2     - output_buf
 * a3     - output_col
 *
 * Reduced-size inverse DCT that writes a 2x2 block of samples.
 *
 * Pass 1 dequantizes input columns 0, 1, 3, 5 and 7 (only rows 0, 1, 3, 5
 * and 7 of each are read) and stores two 32-bit results per column in a
 * 40-byte scratch area on the stack: output row 0 at byte offsets 0..16,
 * output row 1 at byte offsets 20..36.
 *
 * Pass 2 combines the five values of each scratch row into two samples,
 * clamps them to [-128, 127], adds 128 and stores them as bytes at
 * output_buf[0] + output_col and output_buf[1] + output_col.
 *
 * Multipliers (the same four values are held twice: packed as halfword
 * pairs for dpa.w.ph in pass 1, and as full words for mult/madd in pass 2):
 *   29692 for row/column 1, -10426 for 3, 6967 for 5, -5906 for 7.
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5

    addiu     sp, sp, -40       // 10-word scratch area between the passes
    move      v0, sp            // v0 = scratch base
    addiu     s2, zero, 29692
    addiu     s3, zero, -10426
    addiu     s4, zero, 6967
    addiu     s5, zero, -5906

    // ---- pass 1, column 0 ----
    lh        t0, 0(a1)         // t0 = inptr[DCTSIZE*0]
    lh        t5, 0(a0)         // t5 = quantptr[DCTSIZE*0]
    lh        t1, 48(a1)        // t1 = inptr[DCTSIZE*3]
    lh        t6, 48(a0)        // t6 = quantptr[DCTSIZE*3]
    mul       t4, t5, t0        // t4 = dequantized row 0
    lh        t0, 16(a1)        // t0 = inptr[DCTSIZE*1]
    lh        t5, 16(a0)        // t5 = quantptr[DCTSIZE*1]
    mul       t6, t6, t1        // t6 = dequantized row 3
    mul       t5, t5, t0        // t5 = dequantized row 1
    lh        t2, 80(a1)        // t2 = inptr[DCTSIZE*5]
    lh        t7, 80(a0)        // t7 = quantptr[DCTSIZE*5]
    lh        t3, 112(a1)       // t3 = inptr[DCTSIZE*7]
    lh        t8, 112(a0)       // t8 = quantptr[DCTSIZE*7]
    mul       t7, t7, t2        // t7 = dequantized row 5
    mul       t8, t8, t3        // t8 = dequantized row 7
    // ac0 is cleared only after the last mul: the MIPS32 ISA leaves HI/LO
    // unpredictable after a three-operand mul.
    mult      zero, zero        // ac0 = 0
    li        s0, 0x73FCD746    // s0 = (29692 << 16) | (-10426 & 0xffff)
    li        s1, 0x1B37E8EE    // s1 = (6967 << 16) | (-5906 & 0xffff)
    ins       t6, t5, 16, 16    // t6 = row1 | row3
    sll       t4, t4, 15        // t4 = row 0 term << 15
    dpa.w.ph  $ac0, t6, s0      // ac0 += row1 * 29692 + row3 * -10426
    lh        t1, 2(a1)         // start loading column 1
    lh        t6, 2(a0)
    ins       t8, t7, 16, 16    // t8 = row5 | row7
    dpa.w.ph  $ac0, t8, s1      // ac0 += row5 * 6967 + row7 * -5906
    mflo      t0, $ac0          // t0 = odd-row sum of column 0

    // ---- pass 1, column 1 (finishing column 0 on the way) ----
    mul       t5, t6, t1        // t5 = dequantized row 0
    lh        t1, 18(a1)
    lh        t6, 18(a0)
    lh        t2, 50(a1)
    lh        t7, 50(a0)
    mul       t6, t6, t1        // t6 = dequantized row 1
    subu      t8, t4, t0
    mul       t7, t7, t2        // t7 = dequantized row 3
    addu      t0, t4, t0
    shra_r.w  t0, t0, 13        // rounding descale by 13 bits
    lh        t1, 82(a1)
    lh        t2, 82(a0)
    lh        t3, 114(a1)
    lh        t4, 114(a0)
    shra_r.w  t8, t8, 13
    mul       t1, t1, t2        // t1 = dequantized row 5
    mul       t3, t3, t4        // t3 = dequantized row 7
    sw        t0, 0(v0)         // column 0, output row 0
    sw        t8, 20(v0)        // column 0, output row 1
    sll       t4, t5, 15
    ins       t7, t6, 16, 16    // t7 = row1 | row3
    mult      zero, zero        // ac0 = 0
    dpa.w.ph  $ac0, t7, s0
    ins       t3, t1, 16, 16    // t3 = row5 | row7
    lh        t1, 6(a1)         // start loading column 3
    lh        t6, 6(a0)
    dpa.w.ph  $ac0, t3, s1
    mflo      t0, $ac0          // t0 = odd-row sum of column 1

    // ---- pass 1, column 3 (finishing column 1 on the way) ----
    mul       t5, t6, t1
    lh        t1, 22(a1)
    lh        t6, 22(a0)
    lh        t2, 54(a1)
    lh        t7, 54(a0)
    mul       t6, t6, t1
    subu      t8, t4, t0
    mul       t7, t7, t2
    addu      t0, t4, t0
    shra_r.w  t0, t0, 13
    lh        t1, 86(a1)
    lh        t2, 86(a0)
    lh        t3, 118(a1)
    lh        t4, 118(a0)
    shra_r.w  t8, t8, 13
    mul       t1, t1, t2
    mul       t3, t3, t4
    sw        t0, 4(v0)         // column 1, output row 0
    sw        t8, 24(v0)        // column 1, output row 1
    sll       t4, t5, 15
    ins       t7, t6, 16, 16
    mult      zero, zero        // ac0 = 0
    dpa.w.ph  $ac0, t7, s0
    ins       t3, t1, 16, 16
    lh        t1, 10(a1)        // start loading column 5
    lh        t6, 10(a0)
    dpa.w.ph  $ac0, t3, s1
    mflo      t0, $ac0          // t0 = odd-row sum of column 3

    // ---- pass 1, column 5 (finishing column 3 on the way) ----
    mul       t5, t6, t1
    lh        t1, 26(a1)
    lh        t6, 26(a0)
    lh        t2, 58(a1)
    lh        t7, 58(a0)
    mul       t6, t6, t1
    subu      t8, t4, t0
    mul       t7, t7, t2
    addu      t0, t4, t0
    shra_r.w  t0, t0, 13
    lh        t1, 90(a1)
    lh        t2, 90(a0)
    lh        t3, 122(a1)
    lh        t4, 122(a0)
    shra_r.w  t8, t8, 13
    mul       t1, t1, t2
    mul       t3, t3, t4
    sw        t0, 8(v0)         // column 3, output row 0
    sw        t8, 28(v0)        // column 3, output row 1
    sll       t4, t5, 15
    ins       t7, t6, 16, 16
    mult      zero, zero        // ac0 = 0
    dpa.w.ph  $ac0, t7, s0
    ins       t3, t1, 16, 16
    lh        t1, 14(a1)        // start loading column 7
    lh        t6, 14(a0)
    dpa.w.ph  $ac0, t3, s1
    mflo      t0, $ac0          // t0 = odd-row sum of column 5

    // ---- pass 1, column 7 (finishing column 5 on the way) ----
    mul       t5, t6, t1
    lh        t1, 30(a1)
    lh        t6, 30(a0)
    lh        t2, 62(a1)
    lh        t7, 62(a0)
    mul       t6, t6, t1
    subu      t8, t4, t0
    mul       t7, t7, t2
    addu      t0, t4, t0
    shra_r.w  t0, t0, 13
    lh        t1, 94(a1)
    lh        t2, 94(a0)
    lh        t3, 126(a1)
    lh        t4, 126(a0)
    shra_r.w  t8, t8, 13
    mul       t1, t1, t2
    mul       t3, t3, t4
    sw        t0, 12(v0)        // column 5, output row 0
    sw        t8, 32(v0)        // column 5, output row 1
    sll       t4, t5, 15
    ins       t7, t6, 16, 16
    mult      zero, zero        // ac0 = 0
    dpa.w.ph  $ac0, t7, s0
    ins       t3, t1, 16, 16
    dpa.w.ph  $ac0, t3, s1
    mflo      t0, $ac0          // t0 = odd-row sum of column 7

    // ---- pass 2 (finishing column 7 on the way) ----
    lw        t9, 0(a2)         // t9 = output_buf[0]
    lw        t3, 0(v0)         // scratch row 0, column 0
    lw        t7, 4(v0)         // scratch row 0, column 1
    lw        t1, 8(v0)         // scratch row 0, column 3
    addu      t9, t9, a3        // t9 = output_buf[0] + output_col
    sll       t3, t3, 15
    subu      t8, t4, t0
    addu      t0, t4, t0
    shra_r.w  t0, t0, 13
    shra_r.w  t8, t8, 13
    sw        t0, 16(v0)        // column 7, output row 0
    sw        t8, 36(v0)        // column 7, output row 1
    lw        t5, 12(v0)        // scratch row 0, column 5
    lw        t6, 16(v0)        // scratch row 0, column 7
    mult      t7, s2            // ac0  = col1 * 29692
    madd      t1, s3            // ac0 += col3 * -10426
    madd      t5, s4            // ac0 += col5 * 6967
    madd      t6, s5            // ac0 += col7 * -5906
    lw        t5, 24(v0)        // scratch row 1, column 1
    lw        t7, 28(v0)        // scratch row 1, column 3
    mflo      t0, $ac0          // t0 = odd-column sum of row 0
    lw        t8, 32(v0)        // scratch row 1, column 5
    lw        t2, 36(v0)        // scratch row 1, column 7
    mult      $ac1, t5, s2      // same sum for row 1, kept in ac1
    madd      $ac1, t7, s3
    madd      $ac1, t8, s4
    madd      $ac1, t2, s5
    addu      t1, t3, t0        // row 0, sample 0
    subu      t6, t3, t0        // row 0, sample 1
    shra_r.w  t1, t1, 20        // rounding descale by 20 bits
    shra_r.w  t6, t6, 20
    mflo      t4, $ac1          // t4 = odd-column sum of row 1
    shll_s.w  t1, t1, 24        // saturating shift up, arithmetic shift down:
    shll_s.w  t6, t6, 24        //   clamps the value to [-128, 127]
    sra       t1, t1, 24
    sra       t6, t6, 24
    addiu     t1, t1, 128       // re-center to 0..255
    addiu     t6, t6, 128
    lw        t0, 20(v0)        // scratch row 1, column 0
    sb        t1, 0(t9)
    sb        t6, 1(t9)
    sll       t0, t0, 15
    lw        t9, 4(a2)         // t9 = output_buf[1]
    addu      t1, t0, t4        // row 1, sample 0
    subu      t6, t0, t4        // row 1, sample 1
    addu      t9, t9, a3        // t9 = output_buf[1] + output_col
    shra_r.w  t1, t1, 20
    shra_r.w  t6, t6, 20
    shll_s.w  t1, t1, 24
    shll_s.w  t6, t6, 24
    sra       t1, t1, 24
    sra       t6, t6, 24
    addiu     t1, t1, 128
    addiu     t6, t6, 128
    sb        t1, 0(t9)
    sb        t6, 1(t9)
    addiu     sp, sp, 40        // release the scratch area

    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5

    j         ra
     nop

END(jsimd_idct_2x2_mips_dspr2)
3129
3130/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
/*
 * a0     - compptr->dct_table
 * a1     - coef_block
 * a2     - output_buf
 * a3     - output_col
 * 16(sp) - workspace[DCTSIZE*4];  // buffers data between passes
 *
 * Reduced-size inverse DCT that writes a 4x4 block of samples.
 *
 * Pass 1 dequantizes input columns 0-3 (first loop) and 5-7 (second loop);
 * column 4 is never read.  Rows 0, 1, 2, 3, 5, 6 and 7 of each column are
 * used, and four 32-bit results per column are written to workspace, whose
 * rows are 32 bytes apart.
 *
 * Pass 2 is unrolled four times, once per workspace row.  Each copy turns
 * workspace entries 0, 1, 2, 3, 5, 6 and 7 of its row into four samples,
 * clamps them to [-128, 127], adds 128 and stores them as bytes at
 * output_buf[row] + output_col.
 *
 * The odd-numbered workspace entries are fetched with lh, i.e. only the
 * halfword located at each word's address is used.
 * NOTE(review): that is the low half only on a little-endian target --
 * confirm the build restricts this file accordingly.
 */

    .set at
    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw        v1, 48(sp)        // v1 = workspace (documented as 16(sp) at entry)
    move      t0, a1            // t0 = inptr
    move      t1, v1            // t1 = wsptr
    li        t9, 4             // first loop: four columns
    li        s0, 0x2e75f93e    // s0 = (11893 << 16) | (-1730 & 0xffff)
    li        s1, 0x21f9ba79    // s1 = (8697 << 16) | (-17799 & 0xffff)
    li        s2, 0xecc2efb0    // s2 = (-4926 << 16) | (-4176 & 0xffff)
    li        s3, 0x52031ccd    // s3 = (20995 << 16) | 7373

    // pass 1, columns 0..3
0:
    lh        s6, 32(t0)        // inptr[DCTSIZE*2]
    lh        t6, 32(a0)        // quantptr[DCTSIZE*2]
    lh        s7, 96(t0)        // inptr[DCTSIZE*6]
    lh        t7, 96(a0)        // quantptr[DCTSIZE*6]
    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
    lh        s4, 0(t0)         // inptr[DCTSIZE*0]
    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
    lh        s5, 0(a0)         // quantptr[0]
    li        s6, 15137
    li        s7, 6270
    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
    mul       t6, s6, t6        // t6 = z2 * 15137
    lh        t5, 112(t0)       // inptr[DCTSIZE*7]
    mul       t7, s7, t7        // t7 = z3 * 6270
    lh        s4, 112(a0)       // quantptr[DCTSIZE*7]
    lh        v0, 80(t0)        // inptr[DCTSIZE*5]
    lh        s5, 80(a0)        // quantptr[DCTSIZE*5]
    lh        s6, 48(a0)        // quantptr[DCTSIZE*3]
    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
    lh        s7, 16(a0)        // quantptr[DCTSIZE*1]
    lh        t8, 16(t0)        // inptr[DCTSIZE*1]
    subu      t6, t6, t7        // tmp2 = z2 * 15137 - z3 * 6270
    lh        t7, 48(t0)        // inptr[DCTSIZE*3]
    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
    mul       v0, s5, v0        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
    addu      t3, t2, t6        // tmp10 = tmp0 + tmp2
    subu      t4, t2, t6        // tmp12 = tmp0 - tmp2
    mult      $ac0, zero, zero  // ac0 = 0
    mult      $ac1, zero, zero  // ac1 = 0
    ins       t5, v0, 16, 16    // t5 = row5 | row7
    ins       t7, t8, 16, 16    // t7 = row1 | row3
    addiu     t9, t9, -1
    dpa.w.ph  $ac0, t5, s0      // ac0 = first odd-row sum (used with tmp12)
    dpa.w.ph  $ac0, t7, s1
    dpa.w.ph  $ac1, t5, s2      // ac1 = second odd-row sum (used with tmp10)
    dpa.w.ph  $ac1, t7, s3
    mflo      s4, $ac0
    mflo      s5, $ac1
    addiu     a0, a0, 2         // next column of the quantization table
    addiu     t1, t1, 4         // wsptr already points one column ahead, so
    addiu     t0, t0, 2         //   the stores below use offsets reduced by 4
    addu      t6, t4, s4
    subu      t5, t4, s4
    addu      s6, t3, s5
    subu      s7, t3, s5
    shra_r.w  t6, t6, 12        // DESCALE(tmp12 + temp1, 12)
    shra_r.w  t5, t5, 12        // DESCALE(tmp12 - temp1, 12)
    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
    sw        t6, 28(t1)        // workspace row 1
    sw        t5, 60(t1)        // workspace row 2
    sw        s6, -4(t1)        // workspace row 0
    bgtz      t9, 0b
     sw       s7, 92(t1)        // (delay slot) workspace row 3

    // pass 1, columns 5..7: the pointers sit on column 4 here, so every
    // load below is offset by one extra column (2 bytes)
    li        t9, 3
1:
    lh        s6, 34(t0)        // inptr[DCTSIZE*2]
    lh        t6, 34(a0)        // quantptr[DCTSIZE*2]
    lh        s7, 98(t0)        // inptr[DCTSIZE*6]
    lh        t7, 98(a0)        // quantptr[DCTSIZE*6]
    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
    lh        s4, 2(t0)         // inptr[DCTSIZE*0]
    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
    lh        s5, 2(a0)         // quantptr[DCTSIZE*0]
    li        s6, 15137
    li        s7, 6270
    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
    mul       v0, s6, t6        // v0 = z2 * 15137
    lh        t5, 114(t0)       // inptr[DCTSIZE*7]
    mul       t7, s7, t7        // t7 = z3 * 6270
    lh        s4, 114(a0)       // quantptr[DCTSIZE*7]
    lh        s5, 82(a0)        // quantptr[DCTSIZE*5]
    lh        t6, 82(t0)        // inptr[DCTSIZE*5]
    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
    lh        s6, 50(a0)        // quantptr[DCTSIZE*3]
    lh        t8, 18(t0)        // inptr[DCTSIZE*1]
    subu      v0, v0, t7        // tmp2 = z2 * 15137 - z3 * 6270
    lh        t7, 50(t0)        // inptr[DCTSIZE*3]
    lh        s7, 18(a0)        // quantptr[DCTSIZE*1]
    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
    mul       t6, s5, t6        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
    addu      t3, t2, v0        // tmp10 = tmp0 + tmp2
    subu      t4, t2, v0        // tmp12 = tmp0 - tmp2
    mult      $ac0, zero, zero  // ac0 = 0
    mult      $ac1, zero, zero  // ac1 = 0
    ins       t5, t6, 16, 16    // t5 = row5 | row7
    ins       t7, t8, 16, 16    // t7 = row1 | row3
    dpa.w.ph  $ac0, t5, s0
    dpa.w.ph  $ac0, t7, s1
    dpa.w.ph  $ac1, t5, s2
    dpa.w.ph  $ac1, t7, s3
    mflo      t5, $ac0
    mflo      t6, $ac1
    addiu     t9, t9, -1
    addiu     t0, t0, 2
    addiu     a0, a0, 2
    addiu     t1, t1, 4
    addu      s5, t4, t5
    subu      s4, t4, t5
    addu      s6, t3, t6
    subu      s7, t3, t6
    shra_r.w  s5, s5, 12        // DESCALE(tmp12 + temp1, 12)
    shra_r.w  s4, s4, 12        // DESCALE(tmp12 - temp1, 12)
    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
    sw        s5, 32(t1)        // workspace row 1
    sw        s4, 64(t1)        // workspace row 2
    sw        s6, 0(t1)         // workspace row 0
    bgtz      t9, 1b
     sw       s7, 96(t1)        // (delay slot) workspace row 3

    // pass 2, workspace row 0 -> output_buf[0]
    move      t1, v1            // t1 = wsptr = start of workspace
    li        s4, 15137
    lw        s6, 8(t1)         // wsptr[2]
    li        s5, 6270
    lw        s7, 24(t1)        // wsptr[6]
    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
    lw        t2, 0(t1)         // wsptr[0]
    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], FIX_0_765366865)
    lh        t5, 28(t1)        // wsptr[7]
    lh        t6, 20(t1)        // wsptr[5]
    lh        t7, 12(t1)        // wsptr[3]
    lh        t8, 4(t1)         // wsptr[1]
    ins       t5, t6, 16, 16    // t5 = wsptr[5] | wsptr[7]
    ins       t7, t8, 16, 16    // t7 = wsptr[1] | wsptr[3]
    mult      $ac0, zero, zero
    dpa.w.ph  $ac0, t5, s0
    dpa.w.ph  $ac0, t7, s1
    mult      $ac1, zero, zero
    dpa.w.ph  $ac1, t5, s2
    dpa.w.ph  $ac1, t7, s3
    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
    mflo      s6, $ac0          // s6 = temp1
    subu      s4, s4, s5        // tmp2 = wsptr[2] * 15137 - wsptr[6] * 6270
    addu      t3, t2, s4        // tmp10 = tmp0 + tmp2
    mflo      s7, $ac1          // s7 = temp2
    subu      t4, t2, s4        // tmp12 = tmp0 - tmp2
    addu      t7, t4, s6
    subu      t8, t4, s6
    addu      t5, t3, s7
    subu      t6, t3, s7
    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
    lw        v0, 0(a2)         // output_buf[0]
    shll_s.w  t5, t5, 24        // saturating shift up, arithmetic shift down:
    shll_s.w  t6, t6, 24        //   clamps each value to [-128, 127]
    shll_s.w  t7, t7, 24
    shll_s.w  t8, t8, 24
    sra       t5, t5, 24
    sra       t6, t6, 24
    sra       t7, t7, 24
    sra       t8, t8, 24
    addu      v0, v0, a3        // outptr = output_buf[0] + output_col
    addiu     t5, t5, 128       // re-center to 0..255
    addiu     t6, t6, 128
    addiu     t7, t7, 128
    addiu     t8, t8, 128
    sb        t5, 0(v0)
    sb        t7, 1(v0)
    sb        t8, 2(v0)
    sb        t6, 3(v0)

    // pass 2, workspace row 1 -> output_buf[1]
    li        s4, 15137
    lw        s6, 40(t1)        // wsptr[2]
    li        s5, 6270
    lw        s7, 56(t1)        // wsptr[6]
    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
    lw        t2, 32(t1)        // wsptr[0]
    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], FIX_0_765366865)
    lh        t5, 60(t1)        // wsptr[7]
    lh        t6, 52(t1)        // wsptr[5]
    lh        t7, 44(t1)        // wsptr[3]
    lh        t8, 36(t1)        // wsptr[1]
    ins       t5, t6, 16, 16
    ins       t7, t8, 16, 16
    mult      $ac0, zero, zero
    dpa.w.ph  $ac0, t5, s0
    dpa.w.ph  $ac0, t7, s1
    mult      $ac1, zero, zero
    dpa.w.ph  $ac1, t5, s2
    dpa.w.ph  $ac1, t7, s3
    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
    mflo      s6, $ac0          // s6 = temp1
    subu      s4, s4, s5        // tmp2 = wsptr[2] * 15137 - wsptr[6] * 6270
    addu      t3, t2, s4        // tmp10 = tmp0 + tmp2
    mflo      s7, $ac1          // s7 = temp2
    subu      t4, t2, s4        // tmp12 = tmp0 - tmp2
    addu      t7, t4, s6
    subu      t8, t4, s6
    addu      t5, t3, s7
    subu      t6, t3, s7
    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
    lw        v0, 4(a2)         // output_buf[1]
    shll_s.w  t5, t5, 24
    shll_s.w  t6, t6, 24
    shll_s.w  t7, t7, 24
    shll_s.w  t8, t8, 24
    sra       t5, t5, 24
    sra       t6, t6, 24
    sra       t7, t7, 24
    sra       t8, t8, 24
    addu      v0, v0, a3        // outptr = output_buf[1] + output_col
    addiu     t5, t5, 128
    addiu     t6, t6, 128
    addiu     t7, t7, 128
    addiu     t8, t8, 128
    sb        t5, 0(v0)
    sb        t7, 1(v0)
    sb        t8, 2(v0)
    sb        t6, 3(v0)

    // pass 2, workspace row 2 -> output_buf[2]
    li        s4, 15137
    lw        s6, 72(t1)        // wsptr[2]
    li        s5, 6270
    lw        s7, 88(t1)        // wsptr[6]
    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
    lw        t2, 64(t1)        // wsptr[0]
    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], FIX_0_765366865)
    lh        t5, 92(t1)        // wsptr[7]
    lh        t6, 84(t1)        // wsptr[5]
    lh        t7, 76(t1)        // wsptr[3]
    lh        t8, 68(t1)        // wsptr[1]
    ins       t5, t6, 16, 16
    ins       t7, t8, 16, 16
    mult      $ac0, zero, zero
    dpa.w.ph  $ac0, t5, s0
    dpa.w.ph  $ac0, t7, s1
    mult      $ac1, zero, zero
    dpa.w.ph  $ac1, t5, s2
    dpa.w.ph  $ac1, t7, s3
    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
    mflo      s6, $ac0          // s6 = temp1
    subu      s4, s4, s5        // tmp2 = wsptr[2] * 15137 - wsptr[6] * 6270
    addu      t3, t2, s4        // tmp10 = tmp0 + tmp2
    mflo      s7, $ac1          // s7 = temp2
    subu      t4, t2, s4        // tmp12 = tmp0 - tmp2
    addu      t7, t4, s6
    subu      t8, t4, s6
    addu      t5, t3, s7
    subu      t6, t3, s7
    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
    lw        v0, 8(a2)         // output_buf[2]
    shll_s.w  t5, t5, 24
    shll_s.w  t6, t6, 24
    shll_s.w  t7, t7, 24
    shll_s.w  t8, t8, 24
    sra       t5, t5, 24
    sra       t6, t6, 24
    sra       t7, t7, 24
    sra       t8, t8, 24
    addu      v0, v0, a3        // outptr = output_buf[2] + output_col
    addiu     t5, t5, 128
    addiu     t6, t6, 128
    addiu     t7, t7, 128
    addiu     t8, t8, 128
    sb        t5, 0(v0)
    sb        t7, 1(v0)
    sb        t8, 2(v0)
    sb        t6, 3(v0)

    // pass 2, workspace row 3 -> output_buf[3]
    li        s4, 15137
    lw        s6, 104(t1)       // wsptr[2]
    li        s5, 6270
    lw        s7, 120(t1)       // wsptr[6]
    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
    lw        t2, 96(t1)        // wsptr[0]
    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], FIX_0_765366865)
    lh        t5, 124(t1)       // wsptr[7]
    lh        t6, 116(t1)       // wsptr[5]
    lh        t7, 108(t1)       // wsptr[3]
    lh        t8, 100(t1)       // wsptr[1]
    ins       t5, t6, 16, 16
    ins       t7, t8, 16, 16
    mult      $ac0, zero, zero
    dpa.w.ph  $ac0, t5, s0
    dpa.w.ph  $ac0, t7, s1
    mult      $ac1, zero, zero
    dpa.w.ph  $ac1, t5, s2
    dpa.w.ph  $ac1, t7, s3
    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
    mflo      s6, $ac0          // s6 = temp1
    subu      s4, s4, s5        // tmp2 = wsptr[2] * 15137 - wsptr[6] * 6270
    addu      t3, t2, s4        // tmp10 = tmp0 + tmp2
    mflo      s7, $ac1          // s7 = temp2
    subu      t4, t2, s4        // tmp12 = tmp0 - tmp2
    addu      t7, t4, s6
    subu      t8, t4, s6
    addu      t5, t3, s7
    subu      t6, t3, s7
    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
    lw        v0, 12(a2)        // output_buf[3]
    shll_s.w  t5, t5, 24
    shll_s.w  t6, t6, 24
    shll_s.w  t7, t7, 24
    shll_s.w  t8, t8, 24
    sra       t5, t5, 24
    sra       t6, t6, 24
    sra       t7, t7, 24
    sra       t8, t8, 24
    addu      v0, v0, a3        // outptr = output_buf[3] + output_col
    addiu     t5, t5, 128
    addiu     t6, t6, 128
    addiu     t7, t7, 128
    addiu     t8, t8, 128
    sb        t5, 0(v0)
    sb        t7, 1(v0)
    sb        t8, 2(v0)
    sb        t6, 3(v0)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j         ra
     nop
END(jsimd_idct_4x4_mips_dspr2)
3489
3490/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
/*
 * a0     - compptr->dct_table
 * a1     - coef_block
 * a2     - output_buf
 * a3     - output_col
 *
 * 6x6 inverse DCT done in two passes.  Pass 1 walks six columns of
 * coef_block, multiplies each coefficient by the matching dct_table entry
 * and stores 32-bit intermediate results in a 144-byte (6 rows x 6 words)
 * work array carved out of the stack.  Pass 2 walks the six work-array rows
 * and stores six bytes per row at output_buf[row] + output_col.
 *
 * Loop-invariant multipliers (fixed point, 13 fractional bits):
 *   t9 =  5793  ~ 0.707106781
 *   s0 = 10033  ~ 1.224744871
 *   s1 =  2998  ~ 0.366025404
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    addiu     sp, sp, -144  // reserve the 6x6 word work array
    move      v0, sp        // v0 = wsptr, current work array column
    addiu     v1, v0, 24    // v1 = wsptr after six columns (loop bound)
    addiu     t9, zero, 5793
    addiu     s0, zero, 10033
    addiu     s1, zero, 2998

    /* Pass 1: process 6 columns from input, store into work array. */
1:
    lh        s2, 0(a0)   // q0 = quantptr[ 0]
    lh        s3, 32(a0)  // q1 = quantptr[16]
    lh        s4, 64(a0)  // q2 = quantptr[32]
    lh        t2, 64(a1)  // tmp2 = inptr[32]
    lh        t1, 32(a1)  // tmp1 = inptr[16]
    lh        t0, 0(a1)   // tmp0 = inptr[ 0]
    mul       t2, t2, s4  // tmp2 = tmp2 * q2
    mul       t1, t1, s3  // tmp1 = tmp1 * q1
    mul       t0, t0, s2  // tmp0 = tmp0 * q0
    lh        t6, 16(a1)  // z1 = inptr[ 8]
    lh        t8, 80(a1)  // z3 = inptr[40]
    lh        t7, 48(a1)  // z2 = inptr[24]
    lh        s2, 16(a0)  // q0 = quantptr[ 8]
    lh        s4, 80(a0)  // q2 = quantptr[40]
    lh        s3, 48(a0)  // q1 = quantptr[24]
    mul       t2, t2, t9  // tmp2 = tmp2 * 5793
    mul       t1, t1, s0  // tmp1 = tmp1 * 10033
    sll       t0, t0, 13  // tmp0 = tmp0 << 13
    mul       t6, t6, s2  // z1 = z1 * q0
    mul       t8, t8, s4  // z3 = z3 * q2
    mul       t7, t7, s3  // z2 = z2 * q1
    addu      t3, t0, t2  // tmp10 = tmp0 + tmp2
    sll       t2, t2, 1   // tmp2 = tmp2 << 1
    subu      t4, t0, t2  // tmp11 = tmp0 - tmp2 (tmp2 was doubled above)
    subu      t5, t3, t1  // tmp12 = tmp10 - tmp1
    addu      t3, t3, t1  // tmp10 = tmp10 + tmp1
    addu      t1, t6, t8  // tmp1 = z1 + z3
    mul       t1, t1, s1  // tmp1 = tmp1 * 2998
    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
    subu      t2, t6, t8  // tmp2 = z1 - z3
    subu      t2, t2, t7  // tmp2 = tmp2 - z2
    sll       t2, t2, 2   // tmp2 = tmp2 << 2
    addu      t0, t6, t7  // tmp0 = z1 + z2
    sll       t0, t0, 13  // tmp0 = tmp0 << 13
    subu      s2, t8, t7  // q0 = z3 - z2
    sll       s2, s2, 13  // q0 = q0 << 13
    addu      t0, t0, t1  // tmp0 = tmp0 + tmp1
    addu      t1, s2, t1  // tmp1 = q0 + tmp1
    addu      s2, t4, t2  // q0 = tmp11 + tmp2
    subu      s3, t4, t2  // q1 = tmp11 - tmp2
    addu      t6, t3, t0  // z1 = tmp10 + tmp0
    subu      t7, t3, t0  // z2 = tmp10 - tmp0
    addu      t4, t5, t1  // tmp11 = tmp12 + tmp1
    subu      t5, t5, t1  // tmp12 = tmp12 - tmp1
    shra_r.w  t6, t6, 11  // z1 = (z1 + 1024) >> 11
    shra_r.w  t7, t7, 11  // z2 = (z2 + 1024) >> 11
    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
    shra_r.w  t5, t5, 11  // tmp12 = (tmp12 + 1024) >> 11
    sw        s2, 24(v0)  // wsptr[6*1] = q0
    sw        s3, 96(v0)  // wsptr[6*4] = q1
    sw        t6, 0(v0)   // wsptr[6*0] = z1
    sw        t7, 120(v0) // wsptr[6*5] = z2
    sw        t4, 48(v0)  // wsptr[6*2] = tmp11
    sw        t5, 72(v0)  // wsptr[6*3] = tmp12
    addiu     v0, v0, 4   // next work array column
    addiu     a1, a1, 2   // next coefficient column
    bne       v0, v1, 1b
     addiu    a0, a0, 2   // (delay slot) next dct_table column

    /* Pass 2: process 6 rows from work array, store into output array. */
    move      v0, sp        // v0 = wsptr, start of the first work array row
    addiu     v1, v0, 144   // v1 = end of the work array (loop bound)

2:
    lw        t0, 0(v0)     // tmp0 = wsptr[0]
    lw        t2, 16(v0)    // tmp2 = wsptr[4]
    lw        s5, 0(a2)     // s5 = output_buf[row]
    addiu     t0, t0, 16    // rounding bias: 16 << 13 is half of the final 1 << 18 divisor
    sll       t0, t0, 13    // tmp0 = tmp0 << 13
    mul       t3, t2, t9    // tmp10 = tmp2 * 5793
    lw        t6, 4(v0)     // z1 = wsptr[1]
    lw        t8, 20(v0)    // z3 = wsptr[5]
    lw        t7, 12(v0)    // z2 = wsptr[3]
    addu      s5, s5, a3    // outptr = output_buf[row] + output_col
    addu      s6, t6, t8    // z1 + z3
    mul       s6, s6, s1    // s6 = (z1 + z3) * 2998
    addu      t1, t0, t3    // tmp1 = tmp0 + tmp10
    subu      t4, t0, t3
    subu      t4, t4, t3    // tmp11 = tmp0 - 2 * tmp10
    lw        t3, 8(v0)     // t3 = wsptr[2]
    mul       t0, t3, s0    // tmp0 = wsptr[2] * 10033
    addu      s7, t6, t7    // z1 + z2
    sll       s7, s7, 13
    addu      s7, s6, s7    // s7 = s6 + ((z1 + z2) << 13)
    subu      t2, t8, t7    // z3 - z2
    sll       t2, t2, 13
    addu      t2, s6, t2    // t2 = s6 + ((z3 - z2) << 13)
    subu      s6, t6, t7
    subu      s6, s6, t8
    sll       s6, s6, 13    // s6 = (z1 - z2 - z3) << 13
    addu      t3, t1, t0    // tmp10 = tmp1 + tmp0
    subu      t5, t1, t0    // tmp12 = tmp1 - tmp0
    addu      t6, t3, s7    // outptr[0] source = tmp10 + s7
    subu      t3, t3, s7    // outptr[5] source = tmp10 - s7
    addu      t7, t4, s6    // outptr[1] source = tmp11 + s6
    subu      t4, t4, s6    // outptr[4] source = tmp11 - s6
    addu      t8, t5, t2    // outptr[2] source = tmp12 + t2
    subu      t5, t5, t2    // outptr[3] source = tmp12 - t2
    /*
     * Descale: a saturating shift left by 6 followed by an arithmetic shift
     * right by 24 is a net shift right by 18 with the result clamped to
     * [-128, 127].  Adding 128 then yields a byte value in [0, 255].
     */
    shll_s.w  t6, t6, 6
    shll_s.w  t3, t3, 6
    shll_s.w  t7, t7, 6
    shll_s.w  t4, t4, 6
    shll_s.w  t8, t8, 6
    shll_s.w  t5, t5, 6
    sra       t6, t6, 24
    addiu     t6, t6, 128
    sra       t3, t3, 24
    addiu     t3, t3, 128
    sb        t6, 0(s5)
    sra       t7, t7, 24
    addiu     t7, t7, 128
    sb        t3, 5(s5)
    sra       t4, t4, 24
    addiu     t4, t4, 128
    sb        t7, 1(s5)
    sra       t8, t8, 24
    addiu     t8, t8, 128
    sb        t4, 4(s5)
    addiu     v0, v0, 24    // next work array row
    sra       t5, t5, 24
    addiu     t5, t5, 128
    sb        t8, 2(s5)
    addiu     a2, a2,  4    // next output_buf row pointer
    bne       v0, v1, 2b
     sb       t5, 3(s5)     // (delay slot) last store of the row

    addiu     sp, sp, 144   // release the work array

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j         ra
     nop

END(jsimd_idct_6x6_mips_dspr2)
3644
3645/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
/*
 * a0     - compptr->dct_table
 * a1     - coef_block
 * a2     - workspace
 *
 * First pass of the 12x12 inverse DCT.  The loop runs 8 times, once per
 * input column.  Each iteration reads the eight coefficients of one column
 * (16-bit values, 16 bytes apart), multiplies each by the dct_table entry
 * at the same offset, and stores twelve 32-bit results to workspace, one
 * per workspace row (rows are 32 bytes apart).
 *
 * Multipliers are fixed point with 13 fractional bits; results are shifted
 * right by 11 before being stored.
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li         a3, 8         // a3 = number of columns left to process

1:
    // odd part
    lh         t0, 48(a1)    // coefficient, row 3
    lh         t1, 48(a0)    // dct_table entry, row 3
    lh         t2, 16(a1)    // coefficient, row 1
    lh         t3, 16(a0)    // dct_table entry, row 1
    lh         t4, 80(a1)    // coefficient, row 5
    lh         t5, 80(a0)    // dct_table entry, row 5
    lh         t6, 112(a1)   // coefficient, row 7
    lh         t7, 112(a0)   // dct_table entry, row 7
    mul        t0, t0, t1    // z2
    mul        t1, t2, t3    // z1
    mul        t2, t4, t5    // z3
    mul        t3, t6, t7    // z4
    li         t4, 10703     // FIX(1.306562965)
    li         t5, 4433      // FIX_0_541196100
    li         t6, 7053      // FIX(0.860918669)
    mul        t4, t0,t4     // tmp11
    mul        t5, t0,t5     // -tmp14
    addu       t7, t1,t2     // tmp10
    addu       t8, t7,t3     // tmp10 + z4
    mul        t6, t6, t8    // tmp15
    li         t8, 2139      // FIX(0.261052384)
    mul        t8, t7, t8    // MULTIPLY(tmp10, FIX(0.261052384))
    li         t7, 2295      // FIX(0.280143716)
    mul        t7, t1, t7    // MULTIPLY(z1, FIX(0.280143716))
    addu       t9, t2, t3    // z3 + z4
    li         s0, 8565      // FIX(1.045510580)
    mul        t9, t9, s0    // -tmp13
    li         s0, 12112     // FIX(1.478575242)
    mul        s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
    li         s1, 12998     // FIX(1.586706681)
    mul        s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
    li         s2, 5540      // FIX(0.676326758)
    mul        s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
    li         s3, 16244     // FIX(1.982889723)
    mul        s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
    subu       t1, t1, t3    // z1-=z4
    subu       t0, t0, t2    // z2-=z3
    addu       t2, t0, t1    // z1+z2
    li         t3, 4433      // FIX_0_541196100
    mul        t2, t2, t3    // z3
    li         t3, 6270      // FIX_0_765366865
    mul        t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
    li         t3, 15137     // FIX_1_847759065
    mul        t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
    addu       t8, t6, t8    // tmp12
    addu       t3, t8, t4    // tmp12 + tmp11
    addu       t3, t3, t7    // tmp10
    subu       t8, t8, t9    // tmp12 + tmp13
    addu       s0, t5, s0    // -tmp14 + MULTIPLY(z3, FIX(1.478575242))
    subu       t8, t8, s0    // tmp12
    subu       t9, t6, t9    // tmp15 + tmp13
    subu       s1, s1, t4    // MULTIPLY(z4, FIX(1.586706681)) - tmp11
    addu       t9, t9, s1    // tmp13
    subu       t6, t6, t5    // tmp15 + tmp14
    subu       t6, t6, s2
    subu       t6, t6, s3    // tmp15
    // even part start
    lh         t4, 64(a1)
    lh         t5, 64(a0)
    lh         t7, 32(a1)
    lh         s0, 32(a0)
    lh         s1, 0(a1)
    lh         s2, 0(a0)
    lh         s3, 96(a1)
    lh         v0, 96(a0)
    mul        t4, t4, t5    // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
    mul        t5, t7, s0    // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
    mul        t7, s1, s2    // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
    mul        s0, s3, v0    // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
    // odd part end
    addu       t1, t2, t1    // tmp11
    subu       t0, t2, t0    // tmp14
    // update counter and pointers
    addiu      a3, a3, -1
    addiu      a0, a0, 2
    addiu      a1, a1, 2
    // even part rest
    li         s1, 10033     // FIX(1.224744871)
    li         s2, 11190     // FIX(1.366025404)
    mul        t4, t4, s1    // z4 = row-4 term * FIX(1.224744871)
    mul        s1, t5, s2    // z4 = z1 * FIX(1.366025404) (second use of the name)
    sll        t5, t5, 13    // z1
    sll        t7, t7, 13
    addiu      t7, t7, 1024  // z3 (1024 = rounding bias for the >> 11 below)
    sll        s0, s0, 13    // z2
    addu       s2, t7, t4    // tmp10
    subu       t4, t7, t4    // tmp11
    subu       s3, t5, s0    // tmp12
    addu       t2, t7, s3    // tmp21
    subu       s3, t7, s3    // tmp24
    addu       t7, s1, s0    // tmp12
    addu       v0, s2, t7    // tmp20
    subu       s2, s2, t7    // tmp25
    subu       s1, s1, t5    // z4 - z1
    subu       s1, s1, s0    // tmp12
    addu       s0, t4, s1    // tmp22
    subu       t4, t4, s1    // tmp23
    // final output stage
    addu       t5, v0, t3    // tmp20 + tmp10
    subu       v0, v0, t3    // tmp20 - tmp10
    addu       t3, t2, t1    // tmp21 + tmp11
    subu       t2, t2, t1    // tmp21 - tmp11
    addu       t1, s0, t8    // tmp22 + tmp12
    subu       s0, s0, t8    // tmp22 - tmp12
    addu       t8, t4, t9    // tmp23 + tmp13
    subu       t4, t4, t9    // tmp23 - tmp13
    addu       t9, s3, t0    // tmp24 + tmp14
    subu       s3, s3, t0    // tmp24 - tmp14
    addu       t0, s2, t6    // tmp25 + tmp15
    subu       s2, s2, t6    // tmp25 - tmp15
    sra        t5, t5, 11
    sra        t3, t3, 11
    sra        t1, t1, 11
    sra        t8, t8, 11
    sra        t9, t9, 11
    sra        t0, t0, 11
    sra        s2, s2, 11
    sra        s3, s3, 11
    sra        t4, t4, 11
    sra        s0, s0, 11
    sra        t2, t2, 11
    sra        v0, v0, 11
    sw         t5, 0(a2)     // workspace row  0
    sw         t3, 32(a2)    // workspace row  1
    sw         t1, 64(a2)    // workspace row  2
    sw         t8, 96(a2)    // workspace row  3
    sw         t9, 128(a2)   // workspace row  4
    sw         t0, 160(a2)   // workspace row  5
    sw         s2, 192(a2)   // workspace row  6
    sw         s3, 224(a2)   // workspace row  7
    sw         t4, 256(a2)   // workspace row  8
    sw         s0, 288(a2)   // workspace row  9
    sw         t2, 320(a2)   // workspace row 10
    sw         v0, 352(a2)   // workspace row 11
    bgtz       a3, 1b
     addiu     a2, a2, 4     // (delay slot) next workspace column

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j          ra
     nop

END(jsimd_idct_12x12_pass1_mips_dspr2)
3802
3803/*****************************************************************************/
LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
/*
 * a0     - workspace
 * a1     - output
 *
 * Second pass of the 12x12 inverse DCT.  The loop runs 12 times, once per
 * workspace row (eight 32-bit words per row, rows 32 bytes apart).  Each
 * iteration loads one destination pointer from the array at a1 and stores
 * twelve bytes through it, then advances a1 to the next pointer.
 *
 * Multipliers are fixed point with 13 fractional bits.
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li        a3, 12        // a3 = number of rows left to process

1:
    // Odd part
    lw        t0, 12(a0)    // z2 = wsptr[3]
    lw        t1, 4(a0)     // z1 = wsptr[1]
    lw        t2, 20(a0)    // z3 = wsptr[5]
    lw        t3, 28(a0)    // z4 = wsptr[7]
    li        t4, 10703     // FIX(1.306562965)
    li        t5, 4433      // FIX_0_541196100
    mul       t4, t0, t4    // tmp11
    mul       t5, t0, t5    // -tmp14
    addu      t6, t1, t2    // tmp10
    li        t7, 2139      // FIX(0.261052384)
    mul       t7, t6, t7    // MULTIPLY(tmp10, FIX(0.261052384))
    addu      t6, t6, t3    // tmp10 + z4
    li        t8, 7053      // FIX(0.860918669)
    mul       t6, t6, t8    // tmp15
    li        t8, 2295      // FIX(0.280143716)
    mul       t8, t1, t8    // MULTIPLY(z1, FIX(0.280143716))
    addu      t9, t2, t3    // z3 + z4
    li        s0, 8565      // FIX(1.045510580)
    mul       t9, t9, s0    // -tmp13
    li        s0, 12112     // FIX(1.478575242)
    mul       s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
    li        s1, 12998     // FIX(1.586706681)
    mul       s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
    li        s2, 5540      // FIX(0.676326758)
    mul       s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
    li        s3, 16244     // FIX(1.982889723)
    mul       s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
    subu      t1, t1, t3    // z1 -= z4
    subu      t0, t0, t2    // z2 -= z3
    addu      t2, t1, t0    // z1 + z2
    li        t3, 4433      // FIX_0_541196100
    mul       t2, t2, t3    // z3
    li        t3, 6270      // FIX_0_765366865
    mul       t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
    li        t3, 15137     // FIX_1_847759065
    mul       t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
    addu      t3, t6, t7    // tmp12
    addu      t7, t3, t4    // tmp12 + tmp11
    addu      t7, t7, t8    // tmp10
    subu      t3, t3, t9    // tmp12 + tmp13
    subu      t3, t3, t5    // ... + tmp14
    subu      t3, t3, s0    // tmp12
    subu      t9, t6, t9    // tmp15 + tmp13
    subu      t9, t9, t4    // ... - tmp11
    addu      t9, t9, s1    // tmp13
    subu      t6, t6, t5    // tmp15 + tmp14
    subu      t6, t6, s2
    subu      t6, t6, s3    // tmp15
    addu      t1, t2, t1    // tmp11
    subu      t0, t2, t0    // tmp14
    // even part
    lw        t2, 16(a0)    // z4
    lw        t4, 8(a0)     // z1
    lw        t5, 0(a0)     // z3
    lw        t8, 24(a0)    // z2
    li        s0, 10033     // FIX(1.224744871)
    li        s1, 11190     // FIX(1.366025404)
    mul       t2, t2, s0    // z4
    mul       s0, t4, s1    // z4
    addiu     t5, t5, 0x10  // rounding bias: 16 << 13 is half of the final 1 << 18 divisor
    sll       t5, t5, 13    // z3
    sll       t4, t4, 13    // z1
    sll       t8, t8, 13    // z2
    subu      s1, t4, t8    // tmp12
    addu      s2, t5, t2    // tmp10
    subu      t2, t5, t2    // tmp11
    addu      s3, t5, s1    // tmp21
    subu      s1, t5, s1    // tmp24
    addu      t5, s0, t8    // tmp12
    addu      v0, s2, t5    // tmp20
    subu      t5, s2, t5    // tmp25
    subu      t4, s0, t4
    subu      t4, t4, t8    // tmp12
    addu      t8, t2, t4    // tmp22
    subu      t2, t2, t4    // tmp23
    // increment counter and pointers
    addiu     a3, a3, -1
    addiu     a0, a0, 32
    // Final stage
    addu      t4, v0, t7    // tmp20 + tmp10 -> byte 0
    subu      v0, v0, t7    // tmp20 - tmp10 -> byte 11
    addu      t7, s3, t1    // tmp21 + tmp11 -> byte 1
    subu      s3, s3, t1    // tmp21 - tmp11 -> byte 10
    addu      t1, t8, t3    // tmp22 + tmp12 -> byte 2
    subu      t8, t8, t3    // tmp22 - tmp12 -> byte 9
    addu      t3, t2, t9    // tmp23 + tmp13 -> byte 3
    subu      t2, t2, t9    // tmp23 - tmp13 -> byte 8
    addu      t9, s1, t0    // tmp24 + tmp14 -> byte 4
    subu      s1, s1, t0    // tmp24 - tmp14 -> byte 7
    addu      t0, t5, t6    // tmp25 + tmp15 -> byte 5
    subu      t5, t5, t6    // tmp25 - tmp15 -> byte 6
    /*
     * Descale: shift left by 4 (plain), then by 2 more (saturating), then
     * right by 24, for a net shift right by 18.  Only the 2-bit shll_s.w
     * step saturates; the 4-bit sll does not.  The right shift is logical,
     * which is harmless because only the low byte is stored after 0x80 is
     * added.
     */
    sll       t4, t4, 4
    sll       t7, t7, 4
    sll       t1, t1, 4
    sll       t3, t3, 4
    sll       t9, t9, 4
    sll       t0, t0, 4
    sll       t5, t5, 4
    sll       s1, s1, 4
    sll       t2, t2, 4
    sll       t8, t8, 4
    sll       s3, s3, 4
    sll       v0, v0, 4
    shll_s.w  t4, t4, 2
    shll_s.w  t7, t7, 2
    shll_s.w  t1, t1, 2
    shll_s.w  t3, t3, 2
    shll_s.w  t9, t9, 2
    shll_s.w  t0, t0, 2
    shll_s.w  t5, t5, 2
    shll_s.w  s1, s1, 2
    shll_s.w  t2, t2, 2
    shll_s.w  t8, t8, 2
    shll_s.w  s3, s3, 2
    shll_s.w  v0, v0, 2
    srl       t4, t4, 24
    srl       t7, t7, 24
    srl       t1, t1, 24
    srl       t3, t3, 24
    srl       t9, t9, 24
    srl       t0, t0, 24
    srl       t5, t5, 24
    srl       s1, s1, 24
    srl       t2, t2, 24
    srl       t8, t8, 24
    srl       s3, s3, 24
    srl       v0, v0, 24
    lw        t6, 0(a1)     // t6 = destination pointer for this row (tmp15 is dead here)
    addiu     t4, t4, 0x80
    addiu     t7, t7, 0x80
    addiu     t1, t1, 0x80
    addiu     t3, t3, 0x80
    addiu     t9, t9, 0x80
    addiu     t0, t0, 0x80
    addiu     t5, t5, 0x80
    addiu     s1, s1, 0x80
    addiu     t2, t2, 0x80
    addiu     t8, t8, 0x80
    addiu     s3, s3, 0x80
    addiu     v0, v0, 0x80
    sb        t4, 0(t6)
    sb        t7, 1(t6)
    sb        t1, 2(t6)
    sb        t3, 3(t6)
    sb        t9, 4(t6)
    sb        t0, 5(t6)
    sb        t5, 6(t6)
    sb        s1, 7(t6)
    sb        t2, 8(t6)
    sb        t8, 9(t6)
    sb        s3, 10(t6)
    sb        v0, 11(t6)
    bgtz      a3, 1b
     addiu    a1, a1, 4     // (delay slot) next destination pointer

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    jr        ra
     nop

END(jsimd_idct_12x12_pass2_mips_dspr2)
3977
3978/*****************************************************************************/
/*
 * CONVSAMP_STEP next_ptr_off, out_off
 *
 * One pipelined step of jsimd_convsamp_mips_dspr2.
 *
 * On entry:  t3/t4 hold the first four samples of the current row, already
 *            widened to halfwords; t2 holds the raw last four samples;
 *            t7 = 0xff80ff80 (-128 in each halfword).
 * Work done: finishes the current row (widen, subtract 128, store 16 bytes
 *            at out_off(a2)) while fetching the row whose pointer sits at
 *            next_ptr_off(a0) and widening its first four samples.
 * On exit:   t3/t4/t2 describe the next row in the same way.
 */
    .macro CONVSAMP_STEP next_ptr_off, out_off
    lw             t0, \next_ptr_off(a0)  // pointer to the next row
    preceu.ph.qbr  t5, t2                 // widen samples 4-5 of current row
    preceu.ph.qbl  t6, t2                 // widen samples 6-7 of current row
    addu           t0, t0, a1             // next row + start_col
    addu.ph        t3, t3, t7             // samples 0-1 minus 128
    addu.ph        t4, t4, t7             // samples 2-3 minus 128
    ulw            t1, 0(t0)              // next row, samples 0-3
    ulw            t2, 4(t0)              // next row, samples 4-7
    addu.ph        t5, t5, t7             // samples 4-5 minus 128
    addu.ph        t6, t6, t7             // samples 6-7 minus 128
    usw            t3, \out_off+0(a2)
    usw            t4, \out_off+4(a2)
    preceu.ph.qbr  t3, t1                 // widen samples 0-1 of next row
    preceu.ph.qbl  t4, t1                 // widen samples 2-3 of next row
    usw            t5, \out_off+8(a2)
    usw            t6, \out_off+12(a2)
    .endm

LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
/*
 * a0     - sample_data
 * a1     - start_col
 * a2     - workspace
 *
 * Reads 8 bytes from each of the 8 row pointers stored at a0 (offset by
 * start_col), widens every byte to an unsigned halfword, subtracts 128 and
 * writes the resulting 64 halfwords to workspace, 16 bytes per row.
 */

    /* Prime the pipeline with row 0. */
    lw             t0, 0(a0)
    li             t7, 0xff80ff80
    addu           t0, t0, a1
    ulw            t1, 0(t0)
    ulw            t2, 4(t0)
    preceu.ph.qbr  t3, t1
    preceu.ph.qbl  t4, t1

    /* Each step stores one row and fetches the following one. */
    CONVSAMP_STEP   4,  0
    CONVSAMP_STEP   8, 16
    CONVSAMP_STEP  12, 32
    CONVSAMP_STEP  16, 48
    CONVSAMP_STEP  20, 64
    CONVSAMP_STEP  24, 80
    CONVSAMP_STEP  28, 96

    /* Drain the pipeline: finish and store row 7. */
    preceu.ph.qbr  t5, t2
    preceu.ph.qbl  t6, t2
    addu.ph        t3, t3, t7
    addu.ph        t4, t4, t7
    addu.ph        t5, t5, t7
    addu.ph        t6, t6, t7
    usw            t3, 112(a2)
    usw            t4, 116(a2)
    usw            t5, 120(a2)
    usw            t6, 124(a2)

    j              ra
     nop

END(jsimd_convsamp_mips_dspr2)

    .purgem CONVSAMP_STEP
4126
4127/*****************************************************************************/
/*
 * CONVSAMP_FLOAT_ROW out_off [, next_ptr_off]
 *
 * Converts the 8 bytes at t0 into 8 single-precision floats (byte - 128)
 * and stores them at out_off(a2) .. out_off+28(a2).
 *
 * When next_ptr_off is supplied, the row pointer at next_ptr_off(a0) is
 * loaded into t0 and offset by a1 in between the stores, so that t0 is
 * ready for the following invocation.  Leave it out for the last row.
 */
    .macro CONVSAMP_FLOAT_ROW out_off, next_ptr_off
    lbu      t1, 0(t0)
    lbu      t2, 1(t0)
    lbu      t3, 2(t0)
    lbu      t4, 3(t0)
    lbu      t5, 4(t0)
    lbu      t6, 5(t0)
    lbu      t7, 6(t0)
    lbu      t8, 7(t0)
    addiu    t1, t1, -128                // center each sample around zero
    addiu    t2, t2, -128
    addiu    t3, t3, -128
    addiu    t4, t4, -128
    addiu    t5, t5, -128
    addiu    t6, t6, -128
    addiu    t7, t7, -128
    addiu    t8, t8, -128
    mtc1     t1, f2                      // move the integers to the FPU
    mtc1     t2, f4
    mtc1     t3, f6
    mtc1     t4, f8
    mtc1     t5, f10
    mtc1     t6, f12
    mtc1     t7, f14
    mtc1     t8, f16
    cvt.s.w  f2, f2                      // integer -> single-precision float
    cvt.s.w  f4, f4
    cvt.s.w  f6, f6
    cvt.s.w  f8, f8
    cvt.s.w  f10, f10
    cvt.s.w  f12, f12
    cvt.s.w  f14, f14
    cvt.s.w  f16, f16
    .ifnb \next_ptr_off
    lw       t0, \next_ptr_off(a0)       // pointer to the next row
    .endif
    swc1     f2, \out_off+0(a2)
    swc1     f4, \out_off+4(a2)
    swc1     f6, \out_off+8(a2)
    .ifnb \next_ptr_off
    addu     t0, t0, a1                  // next row + start_col
    .endif
    swc1     f8, \out_off+12(a2)
    swc1     f10, \out_off+16(a2)
    swc1     f12, \out_off+20(a2)
    swc1     f14, \out_off+24(a2)
    swc1     f16, \out_off+28(a2)
    .endm

LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
/*
 * a0     - sample_data
 * a1     - start_col
 * a2     - workspace
 *
 * Reads 8 bytes from each of the 8 row pointers stored at a0 (offset by
 * start_col), subtracts 128 from every byte, converts the result to a
 * single-precision float and writes the 64 floats to workspace, 32 bytes
 * per row.
 */

    .set at

    /* t0 = sample_data[0] + start_col */
    lw       t0, 0(a0)
    addu     t0, t0, a1

    /*                 out_off  next_ptr_off */
    CONVSAMP_FLOAT_ROW    0,     4    // row 0
    CONVSAMP_FLOAT_ROW   32,     8    // row 1
    CONVSAMP_FLOAT_ROW   64,    12    // row 2
    CONVSAMP_FLOAT_ROW   96,    16    // row 3
    CONVSAMP_FLOAT_ROW  128,    20    // row 4
    CONVSAMP_FLOAT_ROW  160,    24    // row 5
    CONVSAMP_FLOAT_ROW  192,    28    // row 6
    CONVSAMP_FLOAT_ROW  224           // row 7, nothing left to prefetch

    j        ra
     nop

END(jsimd_convsamp_float_mips_dspr2)

    .purgem CONVSAMP_FLOAT_ROW
4485
4486/*****************************************************************************/
4487
4488