1/****************************************************************************
2**
3** Copyright (C) 2013 Imagination Technologies Limited, www.imgtec.com
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the QtGui module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40#include "qt_mips_asm_dsp_p.h"
41
LEAF_MIPS_DSP(destfetchARGB32_asm_mips_dsp)
/*
 * Reads 'length' ARGB32 pixels from src and writes them to the buffer with
 * the R, G and B channels scaled by the pixel's own alpha; the alpha byte is
 * copied through unchanged.  The scaling x*a/255 is approximated as
 * (t + (t >> 8) + 0x80) >> 8 with t = x*a.
 *
 * a0 - buffer address (dst)
 * a1 - data address (src)
 * a2 - length
 *
 * Returns (v0) the original buffer address.
 *
 * An odd length is handled by converting one pixel up front; the main loop
 * then converts two pixels per iteration.
 */

    beqz              a2, 2f
     move             v0, a0         /* delay slot, runs on both paths:
                                      * return the address of the buffer */
    andi              t1, a2, 0x1
    li                t7, 8388736    /* t7 = 0x800080 (rounding factor) */
    beqz              t1, 1f         /* even length: go straight to the
                                      * two-pixel loop */
     nop

    /* odd length: convert a single pixel first */
    lw                t8, 0(a1)
    addiu             a2, a2, -1
    srl               t6, t8, 24     /* t6 = alpha */

    preceu.ph.qbra    t0, t8         /* t0 = | 0 | R | 0 | B | */
    mul               t1, t0, t6     /* t1 = | R*a | B*a | (each fits 16 bit) */
    preceu.ph.qbla    t4, t8         /* t4 = | 0 | A | 0 | G | */
    mul               t5, t4, t6     /* t5 = | A*a | G*a | */

    preceu.ph.qbla    t2, t1
    addq.ph           t3, t1, t2
    addq.ph           t3, t3, t7
    preceu.ph.qbla    t1, t3         /* t1 holds R & B blended with alpha
                                      * | 0 | dRab | 0 | dBab | */
    preceu.ph.qbla    t2, t5
    addq.ph           t3, t2, t5
    addq.ph           t4, t3, t7
    preceu.ph.qbla    t2, t4         /* t2 holds A & G blended with alpha
                                      * | 0 | dAab | 0 | dGab | */
    andi              t2, t2, 255    /* keep only dGab (the low byte) */

    sll               t0, t6, 24     /* original alpha back into bits 31..24 */
    sll               t3, t2, 8      /* dGab into bits 15..8 */
    or                t4, t0, t3
    or                t0, t1, t4
    sw                t0, 0(a0)
    addiu             a0, a0, 4
    addiu             a1, a1, 4
    beqz              a2, 2f         /* there was only one member */
     nop
1:
    lw                t0, 0(a1)      /* t0 = src1 */
    lw                t1, 4(a1)      /* t1 = src2 */
    precrq.qb.ph      t4, t0, t1     /* t4 = a1 G1 a2 G2 */
    preceu.ph.qbra    t3, t4         /* t3 = 0 G1 0 G2 */
    preceu.ph.qbla    t2, t4         /* t2 = | 0 | a1 | 0 | a2 | */
    srl               t5, t2, 8
    or                t8, t2, t5     /* t8 = 0 a1 a1 a2 */
    muleu_s.ph.qbr    t5, t8, t3     /* t5 = | a1*G1 | a2*G2 | */

    addiu             a2, a2, -2
    addiu             a1, a1, 8
    precrq.ph.w       t9, t0, t1     /* t9 = a1 R1 a2 R2 */
    preceu.ph.qbra    t9, t9         /* t9 = 0 R1 0 R2 */

    preceu.ph.qbla    t6, t5
    addq.ph           t5, t5, t6
    addq.ph           t2, t5, t7     /* t2: blended G1/G2 in the high byte
                                      * of each halfword */
    muleu_s.ph.qbr    t6, t8, t9     /* t6 = | a1*R1 | a2*R2 | */
    sll               t3, t1, 16
    packrl.ph         t3, t0, t3     /* t3 = G1 B1 G2 B2 */
    preceu.ph.qbra    t3, t3         /* t3 = 0 B1 0 B2 */
    muleu_s.ph.qbr    t8, t8, t3     /* t8 = | a1*B1 | a2*B2 | */
    preceu.ph.qbla    t3, t6
    addq.ph           t3, t6, t3
    addq.ph           t3, t3, t7     /* t3: blended R1/R2 in the high byte
                                      * of each halfword */
    preceu.ph.qbla    t5, t8
    addq.ph           t5, t8, t5
    addq.ph           t5, t5, t7     /* t5: blended B1/B2 in the high byte
                                      * of each halfword */

    /* Only the high byte of each halfword of t0/t1 survives the
     * precrq.qb.ph below. */
    precrq.ph.w       t0, t4, t3     /* t0 high bytes = a1, dR1 */
    precrq.ph.w       t1, t2, t5     /* t1 high bytes = dG1, dB1 */
    precrq.qb.ph      t6, t0, t1     /* t6 = | a1 | dR1 | dG1 | dB1 | */
    sll               t3, t3, 16
    sll               t5, t5, 16
    packrl.ph         t0, t4, t3     /* t0 high bytes = a2, dR2 */
    packrl.ph         t1, t2, t5     /* t1 high bytes = dG2, dB2 */
    precrq.qb.ph      t8, t0, t1     /* t8 = | a2 | dR2 | dG2 | dB2 | */
    sw                t6, 0(a0)
    sw                t8, 4(a0)
    bnez              a2, 1b
     addiu            a0, a0, 8
2:
    jr                ra
     nop

END(destfetchARGB32_asm_mips_dsp)
134
LEAF_MIPS_DSP(qt_memfill32_asm_mips_dsp)
/*
 * Stores 'count' copies of the 32-bit 'value' starting at dst.
 *
 * a0 - destination address (dst)
 * a1 - value
 * a2 - count
 *
 * The (count % 8) leftover words are stored one at a time first; the rest is
 * stored in unrolled groups of eight.  Whenever the 8-word path is entered,
 * a2 has already been reduced by 8 and that last group is written by the
 * straight-line code at label 4.
 */

    beqz      a2, 5f         /* count == 0: nothing to store */
     nop
    li        t8, 8
    andi      t0, a2, 0x7    /* t0 holds how many counts exceeds 8 */
    beqzl     t0, 2f         /* count is multiple of 8 (8, 16, 24, ....) */
     addiu    a2, a2, -8     /* branch-likely slot: runs only when taken */
    subu      a2, a2, t0     /* a2 = count rounded down to a multiple of 8 */
1:
    /* store the leftover words singly */
    sw        a1, 0(a0)
    addiu     t0, t0, -1
    bnez      t0, 1b
     addiu    a0, a0, 4
    bgeu      a2, t8, 2f     /* at least one full group of 8 remains */
     addiu    a2, a2, -8     /* slot runs on both paths; a2 is not read
                              * again when the branch falls through */
    b         5f
     nop
2:
    beqz      a2, 4f         /* exactly one group left: store it at 4: */
     nop
3:
    /* NOTE(review): hint 30 looks like the MIPS32 PrepareForStore hint -
     * confirm it is safe for the cache-line size of the target core. */
    pref      30, 32(a0)
    addiu     a2, a2, -8
    sw        a1, 0( a0)
    sw        a1, 4(a0)
    sw        a1, 8(a0)
    sw        a1, 12(a0)
    addiu     a0, a0, 32
    sw        a1, -16(a0)
    sw        a1, -12(a0)
    sw        a1, -8(a0)
    bnez      a2, 3b
     sw       a1, -4(a0)
4:
    /* final group of eight words */
    sw        a1, 0(a0)
    sw        a1, 4(a0)
    sw        a1, 8(a0)
    sw        a1, 12(a0)
    addiu     a0, a0, 32
    sw        a1, -16(a0)
    sw        a1, -12(a0)
    sw        a1, -8(a0)
    sw        a1, -4(a0)
5:
    jr        ra
     nop

END(qt_memfill32_asm_mips_dsp)
189
LEAF_MIPS_DSP(comp_func_SourceOver_asm_mips_dsp)
/*
 * Source-over composition of 'length' pixels, one pixel per iteration.
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * const_alpha == 255:
 *     dest[i] = src[i] + BYTE_MUL(dest[i], ~qAlpha(src[i]))
 *     with shortcuts for a fully opaque source pixel (plain copy) and for
 *     src[i] == 0 (dest left untouched).
 * otherwise:
 *     s = BYTE_MUL(src[i], const_alpha)
 *     dest[i] = s + BYTE_MUL(dest[i], ~qAlpha(s))
 *
 * BYTE_MUL(x, a) is computed per channel as (t + (t >> 8) + 0x80) >> 8 with
 * t = channel * a.
 */

    beqz              a2, 5f
     nop
    li                t8, 0xff
    li                t7, 8388736    /* t7 = 0x800080 */
    replv.ph          t9, a3         /* t9 = low halfword of const_alpha in
                                      * both halfwords; loop-invariant, only
                                      * read by the loop at 4: */
    bne               a3, t8, 4f
     nop

/* part where const_alpha = 255 */
    b                 2f
     nop
1:
    /* src[i] == 0: leave dest[i] as it is and advance */
    addiu             a0, a0, 4
    addiu             a2, a2, -1
    beqz              a2, 5f
     nop
2:
    lw                t0, 0(a1)      /* t0 = s = src[i] */
    addiu             a1, a1, 4
    nor               t1, t0, zero
    srl               t1, t1, 24     /* t1 = ~qAlpha(s) */
    bnez              t1, 3f
     nop
    sw                t0, 0(a0)      /* dst[i] = src[i] */
    addiu             a2, a2, -1
    bnez              a2, 2b
     addiu            a0, a0, 4
    b                 5f
     nop
3:
    beqz              t0, 1b
     nop

    lw                t4, 0(a0)
    replv.ph          t6, t1
    muleu_s.ph.qbl    t2, t4, t6
    muleu_s.ph.qbr    t3, t4, t6
    addiu             a2, a2, -1
    preceu.ph.qbla    t4, t2
    addq.ph           t4, t2, t4
    addq.ph           t4, t4, t7
    preceu.ph.qbla    t5, t3
    addq.ph           t5, t5, t3
    addq.ph           t5, t5, t7
    precrq.qb.ph      t8, t4, t5    /* t8 = | dsA | dsR | dsG | dsB | */
    addu              t8, t0, t8    /* dst[i] =
                                     * s + BYTE_MUL(dst[i],~qAlpha(s)) */
    sw                t8, 0(a0)
    bnez              a2, 2b
     addiu            a0, a0, 4
    b                 5f
     nop

/* part where const_alpha != 255 */
4:
    lw                t0, 0(a0)     /* t0 - dst[i] */
    lw                t1, 0(a1)     /* t1 - src[i] */
    addiu             a1, a1, 4
    addiu             a2, a2, -1
    muleu_s.ph.qbl    t2, t1, t9    /* src[i] * const_alpha, upper two bytes */
    muleu_s.ph.qbr    t3, t1, t9    /* src[i] * const_alpha, lower two bytes */
    preceu.ph.qbla    t4, t2
    addq.ph           t4, t2, t4
    addq.ph           t4, t4, t7
    preceu.ph.qbla    t5, t3
    addq.ph           t5, t5, t3
    addq.ph           t5, t5, t7
    precrq.qb.ph      t8, t4, t5    /* t8 = s = | dsA | dsR | dsG | dsB | */

    nor               t6, t8, zero
    srl               t6, t6, 24
    replv.ph          t6, t6        /* t6 = ~qAlpha(s) in both halfwords */

    muleu_s.ph.qbl    t2, t0, t6
    muleu_s.ph.qbr    t3, t0, t6
    preceu.ph.qbla    t4, t2
    addq.ph           t4, t2, t4
    addq.ph           t4, t4, t7
    preceu.ph.qbla    t5, t3
    addq.ph           t5, t5, t3
    addq.ph           t5, t5, t7
    precrq.qb.ph      t6, t4, t5    /* t6 = | ddA | ddR | ddG | ddB | */

    addu              t0, t8, t6    /* dst[i] = s + BYTE_MUL(dst[i], ~qAlpha(s)) */
    sw                t0, 0(a0)
    bnez              a2, 4b
     addiu            a0, a0, 4
5:
    jr                ra
     nop

END(comp_func_SourceOver_asm_mips_dsp)
288
LEAF_MIPS_DSPR2(qt_destStoreARGB32_asm_mips_dsp)
/*
 * Converts 'length' pixels from buffer to data, dividing the colour channels
 * by the pixel's alpha (the inverse of an alpha pre-multiplication):
 *
 *   alpha == 255 : pixel copied unchanged
 *   alpha == 0   : 0 stored
 *   otherwise    : inv = 0x00ff0000 / alpha, every channel c becomes
 *                  (c * inv) >> 16, alpha byte kept
 *
 * a0 - uint * data
 * a1 - const uint *buffer
 * a2 - int length
 *
 * v1 is the running pixel index.  The red-channel product is computed with
 * 'mul' BEFORE the two accumulators are loaded: on pre-R6 MIPS32 'mul'
 * leaves HI/LO (= $ac0) UNPREDICTABLE, so it must not be issued while the
 * 'mult $ac0' result is still waiting to be read.
 */

    blez      a2, 6f
     move     v1, zero       /* delay slot: i = 0 */
    li        t0, 255
    lui       a3, 0xff       /* a3 = 0x00ff0000 (dividend and R mask) */
    j         2f
     lui      t2, 0xff00     /* delay slot: t2 = 0xff000000 (alpha mask) */
1:
    /* alpha == 0: store a zero pixel */
    addiu     v1, v1, 1
    sw        zero, 0(a0)
    addiu     a1, a1, 4
    beq       v1, a2, 6f
     addiu    a0, a0, 4
2:
    lw        v0, 0(a1)
    srl       t3, v0, 0x18   /* t3 = alpha */
    beql      t3, t0, 5f     /* fully opaque: store the pixel as it is */
     addiu    v1, v1, 1      /* branch-likely slot: runs only when taken */
    beqz      t3, 1b
     srl      t1, v0, 0x8    /* delay slot (harmless on the taken path) */
    andi      t1, t1, 0xff   /* t1 = G */

    teq       t3, zero, 0x7  /* divide-by-zero trap; alpha is non-zero here */
    div       zero, a3, t3   /* LO = 0x00ff0000 / alpha */
    move      t8, t3         /* keep alpha for the alpha < 2 test below */
    andi      t6, v0, 0xff   /* t6 = B */

    srl       t3,v0,0x10
    andi      t3,t3,0xff     /* t3 = R */

    and       t5, v0, t2     /* t5 = alpha byte in place */
    mflo      t4             /* t4 = inv */

    mul       t7, t3, t4     /* t7 = R * inv; issued first because it may
                              * clobber HI/LO */
    mult      $ac0, t4, t6   /* ac0 = inv * B */
    mult      $ac1, t1, t4   /* ac1 = G * inv */

    sltiu     t8, t8, 2
    beqz      t8, 3f         /* alpha >= 2: extract with extr.w */
     nop
    mflo      t6, $ac0
    mflo      t1, $ac1
    sra       t6, t6, 0x10
    sra       t1, t1, 0x8
    b         4f
     nop
3:
    extr.w    t6, $ac0, 0x10 /* t6 = (inv * B) >> 16 */
    extr.w    t1, $ac1, 0x8  /* t1 = (G * inv) >> 8 */
4:
    and       v0, t7, a3     /* R result already sits in bits 23..16 */
    or        v0, v0, t6
    or        v0, v0, t5
    andi      t1, t1, 0xff00
    or        v0, v0, t1
    addiu     v1, v1, 1
5:
    sw        v0, 0(a0)
    addiu     a1, a1, 4
    bne       v1, a2, 2b
     addiu    a0, a0, 4
6:
    jr        ra
     nop

END(qt_destStoreARGB32_asm_mips_dsp)
362
LEAF_MIPS_DSP(comp_func_solid_Source_dsp_asm_x2)
/*
 * For every pixel: dest[i] = color + BYTE_MUL(dest[i], ialpha).
 * Two pixels are processed per iteration and the counter is decremented by
 * two, so a non-zero length must be even for the loop to terminate.
 *
 * a0 - uint *dest (read and written)
 * a1 - int length
 * a2 - uint color
 * a3 - uint ialpha
 *
 * NOTE(review): BYTE_MUL_x2 comes from qt_mips_asm_dsp_p.h (not visible
 * here); the trailing 0 argument presumably keeps the a3 multiplier intact
 * across iterations at the cost of clobbering t0/t1 - confirm against the
 * macro definition.
 */

    beqz              a1, 2f
     nop
    replv.ph          a3, a3
    li                t9, 8388736    /* t9 = 0x800080 */
1:
    lw                t0, 0(a0)
    lw                t1, 4(a0)
    or                t2, t0, t1    /* if both dest are zero, no computation needed */
    beqz              t2, 12f
     addiu             a1, -2

    BYTE_MUL_x2 t0, t1, t6, t7, a3, a3, t9, t2, t3, t4, t5, 0
11:
    addu              t2, a2, t6
    addu              t3, a2, t7
    sw                t2, 0(a0)
    sw                t3, 4(a0)
    bnez              a1, 1b
     addiu             a0, 8
    b                 2f
     nop                            /* explicit delay slot; previously the
                                     * first instruction of 12: ran here */
12:
    /* both dest pixels are zero: the result is just the color */
    addu              t2, a2, t0
    addu              t3, a2, t1
    sw                t2, 0(a0)
    sw                t3, 4(a0)
    bnez              a1, 1b
     addiu             a0, 8
2:
    jr                ra
     nop

END(comp_func_solid_Source_dsp_asm_x2)
403
LEAF_MIPS_DSP(comp_func_solid_DestinationOver_dsp_asm_x2)
/*
 * For every pixel: dest[i] = dest[i] + BYTE_MUL(color, qAlpha(~dest[i])).
 * Two pixels are processed per iteration and the counter is decremented by
 * two, so a non-zero length must be even for the loop to terminate.
 * Nothing is done when length or color is zero.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint color
 *
 * Only caller-saved registers (t0-t9, a3) are used, so no stack frame is
 * needed.
 *
 * NOTE(review): BYTE_MUL_x2 comes from qt_mips_asm_dsp_p.h (not visible
 * here); it is assumed to touch only the registers passed to it.
 */

    beqz              a1, 2f
     nop
    beqz              a2, 2f
     nop
    li                t9, 8388736    /* t9 = 0x800080 */

1:
    lw                t0, 0(a0)
    lw                t1, 4(a0)
    not               t2, t0
    not               t3, t1
    srl               t4, t2, 24    /* t4 = qAlpha(~dest 1) */
    srl               t5, t3, 24    /* t5 = qAlpha(~dest 2) */
    or                t2, t4, t5    /* both zero <=> both dest pixels are
                                     * fully opaque: nothing to add */
    beqz              t2, 11f
     addiu             a1, -2
    replv.ph          t2, t4
    replv.ph          t3, t5

    BYTE_MUL_x2 a2, a2, t8, a3, t2, t3, t9, t4, t5, t6, t7

    addu              t0, t0, t8
    addu              t1, t1, a3
11:
    sw                t0, 0(a0)
    sw                t1, 4(a0)
    bnez              a1, 1b
     addiu             a0, 8

2:
    jr                ra
     nop

END(comp_func_solid_DestinationOver_dsp_asm_x2)
451
LEAF_MIPS_DSP(comp_func_DestinationOver_dsp_asm_x2)
/*
 * Destination-over composition, two pixels per iteration (the counter is
 * decremented by two, so a non-zero length must be even):
 *
 *   const_alpha == 255: dest[i] += BYTE_MUL(src[i], qAlpha(~dest[i]))
 *   otherwise:          s = BYTE_MUL(src[i], const_alpha)
 *                       dest[i] += BYTE_MUL(s, qAlpha(~dest[i]))
 *
 * a0 - uint *dest
 * a1 - uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * s0/s1 are saved on the stack because they are used as temporaries; AT is
 * used as an ordinary register, hence .set noat.
 *
 * NOTE(review): BYTE_MUL_x2 comes from qt_mips_asm_dsp_p.h (not visible
 * here); the trailing 0 argument presumably keeps the multiplier registers
 * intact - confirm against the macro definition.
 */

    .set              noat
    addiu             sp, sp, -8
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736    /* t9 = 0x800080 */
    li                t0, 0xff
    beq               a3, t0, 2f
     nop

/* part where const_alpha != 255 */
1:
    replv.ph          a3, a3
11:
    lw                t0, 0(a1)     # src_1
    lw                t1, 4(a1)     # src_2
    addiu             a2, -2

    BYTE_MUL_x2 t0, t1, t8, AT, a3, a3, t9, t4, t5, t6, t7, 0
                                    # t8 = s1
                                    # AT = s2
    lw                t0, 0(a0)     # dest_1
    lw                t1, 4(a0)     # dest_2
    addiu             a1, 8
    not               t2, t0
    not               t3, t1
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4        # qAlpha(~d) 1
    replv.ph          t3, t5        # qAlpha(~d) 2

    BYTE_MUL_x2 t8, AT, s0, s1, t2, t3, t9, t4, t5, t6, t7

    addu              t0, t0, s0
    addu              t1, t1, s1
    sw                t0, 0(a0)
    sw                t1, 4(a0)
    bnez              a2, 11b
     addiu             a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t0, 0(a0)     # dest 1
    lw                t1, 4(a0)     # dest 2
    lw                s0, 0(a1)     # src 1
    lw                s1, 4(a1)     # src 2
    not               t2, t0
    not               t3, t1
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4        # qAlpha(~d) 1
    replv.ph          t3, t5        # qAlpha(~d) 2
    addiu             a1, 8
    addiu             a2, -2

    BYTE_MUL_x2 s0, s1, t8, AT, t2, t3, t9, t4, t5, t6, t7

    addu              t0, t0, t8
    addu              t1, t1, AT
    sw                t0, 0(a0)
    sw                t1, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    /* common exit: restore the saved registers and release the frame */
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    addiu             sp, sp, 8
    jr                ra
     nop
    .set              at

END(comp_func_DestinationOver_dsp_asm_x2)
536
LEAF_MIPS_DSP(comp_func_solid_SourceIn_dsp_asm_x2)
/*
 * Solid-colour source-in composition, two pixels per iteration (the counter
 * is decremented by two, so a non-zero length must be even).
 *
 *   const_alpha == 255: dest[i] = BYTE_MUL(color, qAlpha(dest[i]))
 *   otherwise:          color = BYTE_MUL(color, const_alpha)
 *                       cia   = 255 - const_alpha
 *                       dest[i] = INTERPOLATE_PIXEL_255(color,
 *                                     qAlpha(dest[i]), dest[i], cia)
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint color
 * a3 - uint const_alpha
 *
 * s0-s2 are saved on the stack; the frame is 16 bytes (12 used) so that sp
 * stays 8-byte aligned.  AT is used as an ordinary register, hence
 * .set noat.
 *
 * NOTE(review): BYTE_MUL, BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 come from
 * qt_mips_asm_dsp_p.h (not visible here); the formulas above are inferred
 * from the arguments passed - confirm against the macro definitions.
 */

    .set              noat
    addiu             sp, -16
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    beqz              a1, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    replv.ph          t0, a3
    li                t5, 0xff
    BYTE_MUL a2, a2, t0, t9, t1, t2, t3, t4    /* a2 = color ( = BYTE_MUL(color, const_alpha)); */
    subu              t1, t5, a3               /* t1 = cia = 255 - const_alpha */
11:
    lw                t2, 0(a0)                /* t2 = d 1 */
    lw                s0, 4(a0)                /* s0 = d 2 */
    addiu             a1, -2
    srl               t3, t2, 24               /* t3 = qAlpha(d 1) */
    srl               s2, s0, 24               /* s2 = qAlpha(d 2) */

    INTERPOLATE_PIXEL_255 a2, t3, t2, t1, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 a2, s2, s0, t1, s1, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s1, 4(a0)
    bnez              a1, 11b
     addiu            a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t0, 0(a0)                /* dest 1 */
    lw                t1, 4(a0)                /* dest 2 */
    srl               t4, t0, 24
    srl               t5, t1, 24
    replv.ph          t2, t4                   /* t2 = qAlpha(dest 1) */
    replv.ph          t3, t5                   /* t3 = qAlpha(dest 2) */
    addiu             a1, -2

    /* t8 (andi_factor) is not needed on this path and is reused as output */
    BYTE_MUL_x2 a2, a2, t8, AT, t2, t3, t9, t4, t5, t6, t7

    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a1, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    addiu             sp, 16
    jr                ra
     nop
    .set              at

END(comp_func_solid_SourceIn_dsp_asm_x2)
608
LEAF_MIPS_DSP(comp_func_SourceIn_dsp_asm_x2)
/*
 * Source-in composition, two pixels per iteration (the counter is
 * decremented by two, so a non-zero length must be even).
 *
 *   const_alpha == 255: dest[i] = BYTE_MUL(src[i], qAlpha(dest[i]))
 *   otherwise:          s   = BYTE_MUL(src[i], const_alpha)
 *                       cia = 255 - const_alpha
 *                       dest[i] = INTERPOLATE_PIXEL_255(s, qAlpha(dest[i]),
 *                                                       dest[i], cia)
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * s0-s3 are saved on the stack; AT is used as an ordinary register, hence
 * .set noat.
 *
 * NOTE(review): BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 come from
 * qt_mips_asm_dsp_p.h (not visible here); the formulas above are inferred
 * from the arguments passed - confirm against the macro definitions.
 */

    .set              noat
    addiu             sp, -16
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    li                t5, 0xff
    subu              t7, t5, a3               /* t7 = cia = 255 - const_alpha */
    replv.ph          a3, a3
11:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    addiu             a2, -2

    /* AT / s0 = the two source pixels scaled by const_alpha */
    BYTE_MUL_x2 t0, t1, AT, s0, a3, a3, t9, t3, t4, t5, t6, 0

    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addiu             a1, 8

    srl               t2, t0, 24               /* t2 = qAlpha(d) 1 */
    srl               t3, t1, 24               /* t3 = qAlpha(d) 2 */

    INTERPOLATE_PIXEL_255 AT, t2, t0, t7, s1, t9, t8, t4, t5, t6, s3
    INTERPOLATE_PIXEL_255 s0, t3, t1, t7, s2, t9, t8, t4, t5, t6, s3

    sw                s1, 0(a0)
    sw                s2, 4(a0)
    bnez              a2, 11b
     addiu            a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a0)                /* dest 1 */
    lw                t3, 4(a0)                /* dest 2 */
    lw                t0, 0(a1)                /* src 1 */
    lw                t1, 4(a1)                /* src 2 */
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4                   /* t2 = qAlpha(dest 1) */
    replv.ph          t3, t5                   /* t3 = qAlpha(dest 2) */
    addiu             a2, -2

    /* t8 (andi_factor) is not needed on this path and is reused as output */
    BYTE_MUL_x2 t0, t1, t8, AT, t2, t3, t9, t4, t5, t6, t7

    addiu             a1, 8
    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    /* common exit: restore the saved registers and release the frame */
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    addiu             sp, 16
    jr                ra
     nop
    .set              at

END(comp_func_SourceIn_dsp_asm_x2)
691
LEAF_MIPS_DSP(comp_func_solid_DestinationIn_dsp_asm_x2)
/*
 * For every pixel: dest[i] = BYTE_MUL(dest[i], a).
 * Two pixels are processed per iteration and the counter is decremented by
 * two, so a non-zero length must be even for the loop to terminate.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint a
 *
 * AT is used as an ordinary register, hence .set noat.
 *
 * NOTE(review): BYTE_MUL_x2 comes from qt_mips_asm_dsp_p.h (not visible
 * here); the trailing 0 argument presumably keeps the a2 multiplier intact
 * across iterations - confirm against the macro definition.
 */

    .set              noat
    beqz              a1, 2f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    replv.ph          a2, a2                  /* a2 = multiplier in both halfwords */
1:
    lw                t0, 0(a0)               /* t0 = dest 1 */
    lw                t1, 4(a0)               /* t1 = dest 2 */
    addiu             a1, -2

    BYTE_MUL_x2 t0, t1, t8, AT, a2, a2, t9, t4, t5, t6, t7, 0

    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a1, 1b
     addiu            a0, 8
2:
    jr                ra
     nop
    .set              at

END(comp_func_solid_DestinationIn_dsp_asm_x2)
721
LEAF_MIPS_DSP(comp_func_DestinationIn_dsp_asm_x2)
/*
 * Destination-in composition, two pixels per iteration (the counter is
 * decremented by two, so a non-zero length must be even).
 *
 *   const_alpha == 255: dest[i] = BYTE_MUL(dest[i], qAlpha(src[i]))
 *   otherwise:          cia = 255 - const_alpha
 *                       a   = BYTE_MUL(qAlpha(src[i]), const_alpha) + cia
 *                       dest[i] = BYTE_MUL(dest[i], a)
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * s0/s1 are saved on the stack because they are used as temporaries.
 *
 * NOTE(review): BYTE_MUL_x2 comes from qt_mips_asm_dsp_p.h (not visible
 * here); the formulas above are inferred from the arguments passed -
 * confirm against the macro definition.
 */

    addiu             sp, -8
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    li                t0, 0xff
    beq               a3, t0, 2f
     nop

/* part where const_alpha != 255 */
1:
    li                t5, 0xff
    subu              t8, t5, a3               /* t8 = cia = 255 - const_alpha */
    replv.ph          a3, a3
11:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    addiu             a2, -2
    srl               t0, t0, 24               /* t0 = qAlpha(src 1) */
    srl               t1, t1, 24               /* t1 = qAlpha(src 2) */

    /* s1 / t7 = the two source alphas scaled by const_alpha */
    BYTE_MUL_x2 t0, t1, s1, t7, a3, a3, t9, t3, t4, t5, t6, 0

    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addu              s1, s1, t8               /* a 1 */
    addu              t7, t7, t8               /* a 2 */
    replv.ph          t2, s1
    replv.ph          t3, t7

    BYTE_MUL_x2 t0, t1, s1, t7, t2, t3, t9, t4, t5, t6, s0

    addiu             a1, 8
    sw                s1, 0(a0)
    sw                t7, 4(a0)
    bnez              a2, 11b
     addiu            a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a1)                /* src 1 */
    lw                t3, 4(a1)                /* src 2 */
    lw                t0, 0(a0)                /* dest 1 */
    lw                t1, 4(a0)                /* dest 2 */
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4                   /* t2 = qAlpha(src 1) */
    replv.ph          t3, t5                   /* t3 = qAlpha(src 2) */
    addiu             a2, -2

    BYTE_MUL_x2 t0, t1, t8, s1, t2, t3, t9, t4, t5, t6, t7

    addiu             a1, 8
    sw                t8, 0(a0)
    sw                s1, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    /* common exit: restore the saved registers and release the frame */
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    addiu             sp, 8
    jr                ra
     nop

END(comp_func_DestinationIn_dsp_asm_x2)
799
LEAF_MIPS_DSP(comp_func_DestinationOut_dsp_asm_x2)
/*
 * Destination-out composition, two pixels per iteration (the counter is
 * decremented by two, so a non-zero length must be even).
 *
 *   const_alpha == 255: dest[i] = BYTE_MUL(dest[i], qAlpha(~src[i]))
 *   otherwise:          cia = 255 - const_alpha
 *                       a   = BYTE_MUL(qAlpha(~src[i]), const_alpha) + cia
 *                       dest[i] = BYTE_MUL(dest[i], a)
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * s0 is saved on the stack; the frame is 8 bytes (4 used) so that sp stays
 * 8-byte aligned.  AT is used as an ordinary register, hence .set noat.
 *
 * NOTE(review): BYTE_MUL_x2 comes from qt_mips_asm_dsp_p.h (not visible
 * here); the formulas above are inferred from the arguments passed -
 * confirm against the macro definition.
 */

    .set              noat
    addiu             sp, -8
    sw                s0, 0(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    li                t0, 0xff
    beq               a3, t0, 2f
     nop

/* part where const_alpha != 255 */
1:
    li                t5, 0xff
    subu              t8, t5, a3               /* t8 = cia = 255 - const_alpha */
    replv.ph          a3, a3
11:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    not               t0, t0
    not               t1, t1
    addiu             a2, -2
    srl               t0, t0, 24               /* t0 = qAlpha(~src 1) */
    srl               t1, t1, 24               /* t1 = qAlpha(~src 2) */

    BYTE_MUL_x2       t0, t1, AT, t7, a3, a3, t9, t3, t4, t5, t6, 0

    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addu              AT, AT, t8               /* a 1 */
    addu              t7, t7, t8               /* a 2 */
    replv.ph          t2, AT
    replv.ph          t3, t7

    BYTE_MUL_x2 t0, t1, AT, t7, t2, t3, t9, t4, t5, t6, s0

    addiu             a1, 8
    sw                AT, 0(a0)
    sw                t7, 4(a0)
    bnez              a2, 11b
     addiu            a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a1)                /* src 1 */
    lw                t3, 4(a1)                /* src 2 */
    not               t2, t2
    not               t3, t3
    lw                t0, 0(a0)                /* dest 1 */
    lw                t1, 4(a0)                /* dest 2 */
    srl               t4, t2, 24
    srl               t5, t3, 24
    replv.ph          t2, t4                   /* t2 = qAlpha(~src 1) */
    replv.ph          t3, t5                   /* t3 = qAlpha(~src 2) */
    addiu             a2, -2

    BYTE_MUL_x2 t0, t1, t8, AT, t2, t3, t9, t4, t5, t6, t7

    addiu             a1, 8
    sw                t8, 0(a0)
    sw                AT, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    addiu             sp, 8
    jr                ra
     nop
    .set              at

END(comp_func_DestinationOut_dsp_asm_x2)
881
LEAF_MIPS_DSP(comp_func_solid_SourceAtop_dsp_asm_x2)
/*
 * For every pixel:
 *   dest[i] = INTERPOLATE_PIXEL_255(color, qAlpha(dest[i]), dest[i], sia)
 * Two pixels are processed per iteration and the counter is decremented by
 * two, so a non-zero length must be even for the loop to terminate.
 *
 * a0 - uint *dest
 * a1 - int length
 * a2 - uint color
 * a3 - uint sia
 *
 * s0 is saved on the stack; the frame is 8 bytes (4 used) so that sp stays
 * 8-byte aligned.  AT is used as an ordinary register, hence .set noat.
 *
 * NOTE(review): INTERPOLATE_PIXEL_255 comes from qt_mips_asm_dsp_p.h (not
 * visible here); its exact arithmetic should be confirmed there.
 */

    .set              noat
    addiu             sp, sp, -8
    sw                s0, 0(sp)
    beqz              a1, 2f
     nop
    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
1:
    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addiu             a1, -2
    srl               t2, t0, 24               /* t2 = qAlpha(dest 1) */
    srl               t3, t1, 24               /* t3 = qAlpha(dest 2) */

    INTERPOLATE_PIXEL_255 a2, t2, t0, a3, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 a2, t3, t1, a3, s0, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a1, 1b
     addiu            a0, 8
2:
    lw                s0, 0(sp)
    addiu             sp, sp, 8
    jr                ra
     nop
    .set              at

END(comp_func_solid_SourceAtop_dsp_asm_x2)
920
LEAF_MIPS_DSP(comp_func_SourceAtop_dsp_asm_x2)
/*
 * Source-atop composition, two pixels per iteration (the counter is
 * decremented by two, so a non-zero length must be even).
 *
 *   s = src[i]                          (const_alpha == 255)
 *   s = BYTE_MUL(src[i], const_alpha)   (otherwise)
 *   dest[i] = INTERPOLATE_PIXEL_255(s, qAlpha(dest[i]),
 *                                   dest[i], qAlpha(~s))
 *
 * a0 - uint *dest
 * a1 - const uint *src
 * a2 - int length
 * a3 - uint const_alpha
 *
 * s0-s4 are saved on the stack; the frame is 24 bytes (20 used) so that sp
 * stays 8-byte aligned.  AT is used as an ordinary register, hence
 * .set noat.
 *
 * NOTE(review): BYTE_MUL_x2 and INTERPOLATE_PIXEL_255 come from
 * qt_mips_asm_dsp_p.h (not visible here); the formulas above are inferred
 * from the arguments passed - confirm against the macro definitions.
 */

    .set              noat
    addiu             sp, -24
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    sw                s4, 16(sp)
    beqz              a2, 3f
     nop
    li                t9, 8388736             /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f
     ori               t8, t8, 0xff00         /* t8 = 0xff00ff00 (andi_factor) */

/* part where const_alpha != 255 */
1:
    replv.ph          a3, a3
11:
    lw                AT, 0(a1)                /* src 1 */
    lw                s0, 4(a1)                /* src 2 */

    BYTE_MUL_x2 AT, s0, t0, t1, a3, a3, t9, t3, t4, t5, t6, 0
                                               /* t0 = s 1, t1 = s 2 */

    lw                t2, 0(a0)                /* t2 = dest 1 */
    lw                t3, 4(a0)                /* t3 = dest 2 */

    srl               t4, t2, 24               /* t4 = qAlpha(dest 1) */
    srl               t5, t3, 24               /* t5 = qAlpha(dest 2) */
    not               t6, t0
    not               t7, t1
    srl               t6, t6, 24               /* t6 = qAlpha(~s 1) */
    srl               t7, t7, 24               /* t7 = qAlpha(~s 2) */
    addiu             a2, -2

    INTERPOLATE_PIXEL_255 t0, t4, t2, t6, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t1, t5, t3, t7, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, 8
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 11b
     addiu             a0, 8
    b                 3f
     nop

/* part where const_alpha = 255 */
2:
    lw                t2, 0(a0)                /* dest 1 */
    lw                t3, 4(a0)                /* dest 2 */
    lw                t0, 0(a1)                /* src 1 */
    lw                t1, 4(a1)                /* src 2 */
    srl               t4, t2, 24               /* t4 = qAlpha(dest 1) */
    srl               t5, t3, 24               /* t5 = qAlpha(dest 2) */
    not               t6, t0
    not               t7, t1
    srl               t6, t6, 24               /* t6 = qAlpha(~src 1) */
    srl               t7, t7, 24               /* t7 = qAlpha(~src 2) */
    addiu             a2, -2

    INTERPOLATE_PIXEL_255 t0, t4, t2, t6, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t1, t5, t3, t7, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, 8
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 2b
     addiu             a0, 8

3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    lw                s4, 16(sp)
    addiu             sp, 24
    jr                ra
     nop
    .set              at

END(comp_func_SourceAtop_dsp_asm_x2)
1011
LEAF_MIPS_DSP(comp_func_solid_DestinationAtop_dsp_asm_x2)
/*
 * DestinationAtop composition with a solid colour, two pixels per pass.
 *
 * Arguments:
 *   a0 - uint *dest
 *   a1 - int length   (counted down by two; the loop leaves only when
 *                      the counter is exactly zero)
 *   a2 - uint color
 *   a3 - uint a
 *
 * For every destination pixel d the macro INTERPOLATE_PIXEL_255 is
 * invoked with the inputs (d, a, color, qAlpha(~d)); its fifth operand
 * receives the value that is written back to dest.
 * s0 holds the second result and is therefore spilled to the stack.
 */

    .set              noat
    addiu             sp, sp, -4
    sw                s0, 0(sp)
    beqz              a1, 2f                   /* length == 0: only unwind */
     nop
    li                t9, 0x800080             /* t9 = rounding_factor */
    lui               t8, 0xff00
    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
1:
    lw                t0, 0(a0)                /* t0 = d1 */
    lw                t1, 4(a0)                /* t1 = d2 */
    addiu             a1, a1, -2               /* two pixels consumed */
    nor               t2, t0, zero             /* t2 = ~d1 */
    nor               t3, t1, zero             /* t3 = ~d2 */
    srl               t2, t2, 24               /* t2 = qAlpha(~d1) */
    srl               t3, t3, 24               /* t3 = qAlpha(~d2) */

    INTERPOLATE_PIXEL_255 t0, a3, a2, t2, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 t1, a3, a2, t3, s0, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a1, 1b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */
2:
    lw                s0, 0(sp)
    addiu             sp, sp, 4
    jr                ra
     nop
    .set              at

END(comp_func_solid_DestinationAtop_dsp_asm_x2)
1052
LEAF_MIPS_DSP(comp_func_DestinationAtop_dsp_asm_x2)
/*
 * DestinationAtop composition of a source span, two pixels per pass.
 *
 * Arguments:
 *   a0 - uint *dest
 *   a1 - const uint *src
 *   a2 - int length
 *   a3 - uint const_alpha
 *
 * Two loops exist: label 11 first runs every source pixel through
 * BYTE_MUL_x2 with const_alpha, label 2 is the shortcut taken when
 * const_alpha equals 255. Both count a2 down in steps of two and leave
 * only when it is exactly zero. s0-s5 are saved on entry and restored
 * at label 3.
 */

    .set              noat
    addiu             sp, sp, -24
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    sw                s4, 16(sp)
    sw                s5, 20(sp)
    beqz              a2, 3f                   /* empty span: only unwind */
     nop
    li                t9, 0x800080             /* t9 = rounding_factor */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f               /* const_alpha == 255 ? */
     ori              t8, t8, 0xff00           /* delay slot: t8 = 0xff00ff00 (andi_factor) */

/* loop used when const_alpha != 255 */
1:
    li                s5, 0xff
    subu              s5, s5, a3               /* s5 = cia = 255 - const_alpha */
    replv.ph          a3, a3                   /* const_alpha in both halfwords */
11:
    lw                AT, 0(a1)                /* AT = src 1 */
    lw                s0, 4(a1)                /* s0 = src 2 */

    BYTE_MUL_x2 AT, s0, t0, t1, a3, a3, t9, t3, t4, t5, t6, 0
                                               /* t0 = s1, t1 = s2 */

    lw                t2, 0(a0)                /* t2 = d1 */
    lw                t3, 4(a0)                /* t3 = d2 */

    nor               t4, t2, zero
    nor               t5, t3, zero
    srl               t4, t4, 24               /* t4 = qAlpha(~d1) */
    srl               t5, t5, 24               /* t5 = qAlpha(~d2) */
    srl               t6, t0, 24
    srl               t7, t1, 24
    addu              t6, t6, s5               /* t6 = qAlpha(s1) + cia */
    addu              t7, t7, s5               /* t7 = qAlpha(s2) + cia */
    addiu             a2, a2, -2

    INTERPOLATE_PIXEL_255 t2, t6, t0, t4, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t3, t7, t1, t5, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, a1, 8                /* src += 2 pixels */
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 11b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */
    b                 3f
     nop

/* loop used when const_alpha == 255 */
2:
    lw                t2, 0(a0)                /* t2 = d1 */
    lw                t3, 4(a0)                /* t3 = d2 */
    lw                t0, 0(a1)                /* t0 = s1 */
    lw                t1, 4(a1)                /* t1 = s2 */
    srl               t4, t0, 24               /* t4 = qAlpha(s1) */
    srl               t5, t1, 24               /* t5 = qAlpha(s2) */
    nor               t6, t2, zero
    nor               t7, t3, zero
    srl               t6, t6, 24               /* t6 = qAlpha(~d1) */
    srl               t7, t7, 24               /* t7 = qAlpha(~d2) */
    addiu             a2, a2, -2

    INTERPOLATE_PIXEL_255 t2, t4, t0, t6, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t3, t5, t1, t7, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, a1, 8                /* src += 2 pixels */
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 2b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */

/* common exit: restore the saved registers */
3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    lw                s4, 16(sp)
    lw                s5, 20(sp)
    addiu             sp, sp, 24
    jr                ra
     nop
    .set              at

END(comp_func_DestinationAtop_dsp_asm_x2)
1149
LEAF_MIPS_DSP(comp_func_solid_XOR_dsp_asm_x2)
/*
 * XOR composition with a solid colour, two pixels per pass.
 *
 * Arguments:
 *   a0 - uint *dest
 *   a1 - int length   (counted down by two; the loop leaves only when
 *                      the counter is exactly zero)
 *   a2 - uint color
 *   a3 - uint sia
 *
 * For every destination pixel d the macro INTERPOLATE_PIXEL_255 is
 * invoked with the inputs (color, qAlpha(~d), d, sia); its fifth operand
 * receives the value that is written back to dest.
 * s0 holds the second result and is therefore spilled to the stack.
 */

    .set              noat
    addiu             sp, sp, -4               /* immediate form, as in the sibling routines */
    sw                s0, 0(sp)
    beqz              a1, 2f                   /* length == 0: only unwind */
     nop
    li                t9, 8388736              /* t9 = 0x800080 (rounding_factor) */
    lui               t8, 0xff00
    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
1:
    lw                t0, 0(a0)                /* t0 = dest 1 */
    lw                t1, 4(a0)                /* t1 = dest 2 */
    addiu             a1, a1, -2               /* two pixels consumed */
    not               t2, t0
    not               t3, t1
    srl               t2, t2, 24               /* t2 = qAlpha(~(dest 1)) */
    srl               t3, t3, 24               /* t3 = qAlpha(~(dest 2)) */

    INTERPOLATE_PIXEL_255 a2, t2, t0, a3, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 a2, t3, t1, a3, s0, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a1, 1b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */
2:
    lw                s0, 0(sp)
    addiu             sp, sp, 4
    jr                ra
     nop
    .set              at

END(comp_func_solid_XOR_dsp_asm_x2)
1190
LEAF_MIPS_DSP(comp_func_XOR_dsp_asm_x2)
/*
 * XOR composition of a source span, two pixels per pass.
 *
 * Arguments:
 *   a0 - uint *dest
 *   a1 - const uint *src
 *   a2 - int length
 *   a3 - uint const_alpha
 *
 * Label 11 is the general loop (source first passed through BYTE_MUL_x2
 * with const_alpha); label 2 is the shortcut for const_alpha == 255.
 * In both loops INTERPOLATE_PIXEL_255 receives
 * (s, qAlpha(~d), d, qAlpha(~s)). a2 is counted down by two and the
 * loops leave only when it is exactly zero. s0-s4 are saved on entry.
 */

    .set              noat
    addiu             sp, sp, -20
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    sw                s4, 16(sp)
    beqz              a2, 3f                   /* empty span: only unwind */
     nop
    li                t9, 0x800080             /* t9 = rounding_factor */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f               /* const_alpha == 255 ? */
     ori              t8, t8, 0xff00           /* delay slot: t8 = 0xff00ff00 (andi_factor) */

/* loop used when const_alpha != 255 */
1:
    replv.ph          a3, a3                   /* const_alpha in both halfwords */
11:
    lw                AT, 0(a1)                /* AT = src 1 */
    lw                s0, 4(a1)                /* s0 = src 2 */

    BYTE_MUL_x2 AT, s0, t0, t1, a3, a3, t9, t3, t4, t5, t6, 0
                                               /* t0 = s1, t1 = s2 */

    lw                t2, 0(a0)                /* t2 = d1 */
    lw                t3, 4(a0)                /* t3 = d2 */

    nor               t4, t2, zero
    nor               t5, t3, zero
    srl               t4, t4, 24               /* t4 = qAlpha(~d1) */
    srl               t5, t5, 24               /* t5 = qAlpha(~d2) */
    nor               t6, t0, zero
    nor               t7, t1, zero
    srl               t6, t6, 24               /* t6 = qAlpha(~s1) */
    srl               t7, t7, 24               /* t7 = qAlpha(~s2) */
    addiu             a2, a2, -2

    INTERPOLATE_PIXEL_255 t0, t4, t2, t6, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t1, t5, t3, t7, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, a1, 8                /* src += 2 pixels */
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 11b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */
    b                 3f
     nop

/* loop used when const_alpha == 255 */
2:
    lw                t2, 0(a0)                /* t2 = d1 */
    lw                t3, 4(a0)                /* t3 = d2 */
    lw                t0, 0(a1)                /* t0 = s1 */
    lw                t1, 4(a1)                /* t1 = s2 */
    nor               t4, t0, zero
    nor               t5, t1, zero
    srl               t4, t4, 24               /* t4 = qAlpha(~s1) */
    srl               t5, t5, 24               /* t5 = qAlpha(~s2) */
    nor               t6, t2, zero
    nor               t7, t3, zero
    srl               t6, t6, 24               /* t6 = qAlpha(~d1) */
    srl               t7, t7, 24               /* t7 = qAlpha(~d2) */
    addiu             a2, a2, -2

    INTERPOLATE_PIXEL_255 t0, t6, t2, t4, AT, t9, t8, s1, s2, s3, s4
    INTERPOLATE_PIXEL_255 t1, t7, t3, t5, s0, t9, t8, s1, s2, s3, s4

    addiu             a1, a1, 8                /* src += 2 pixels */
    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 2b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */

/* common exit: restore the saved registers */
3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    lw                s4, 16(sp)
    addiu             sp, sp, 20
    jr                ra
     nop
    .set              at

END(comp_func_XOR_dsp_asm_x2)
1286
LEAF_MIPS_DSP(comp_func_solid_SourceOut_dsp_asm_x2)
/*
 * SourceOut composition with a solid colour, two pixels per pass.
 *
 * Arguments:
 *   a0 - uint *dest
 *   a1 - int length
 *   a2 - uint color
 *   a3 - uint const_alpha
 *
 * Label 11 is the general loop: the colour is run through BYTE_MUL with
 * const_alpha once, then each pixel goes through INTERPOLATE_PIXEL_255
 * with (color, qAlpha(~d), d, cia). Label 2 is the shortcut for
 * const_alpha == 255, where the colour is only passed through
 * BYTE_MUL_x2 with qAlpha(~d) of each destination pixel.
 * a1 is counted down by two; the loops leave only at exactly zero.
 */

    .set              noat
    addiu             sp, sp, -12
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    beqz              a1, 3f                   /* empty span: only unwind */
     nop
    li                t9, 0x800080             /* t9 = rounding_factor */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f               /* const_alpha == 255 ? */
     ori              t8, t8, 0xff00           /* delay slot: t8 = 0xff00ff00 (andi_factor) */

/* loop used when const_alpha != 255 */
1:
    replv.ph          t0, a3                   /* const_alpha in both halfwords */
    li                t5, 0xff
    BYTE_MUL a2, a2, t0, t9, t1, t2, t3, t4    /* a2 = BYTE_MUL(color, const_alpha) */
    subu              t1, t5, a3               /* t1 = cia = 255 - const_alpha; set only
                                                * now because t1 is also handed to
                                                * BYTE_MUL above */
11:
    lw                t2, 0(a0)                /* t2 = d1 */
    lw                s0, 4(a0)                /* s0 = d2 */
    addiu             a1, a1, -2
    nor               t3, t2, zero
    nor               s2, s0, zero
    srl               t3, t3, 24               /* t3 = qAlpha(~d1) */
    srl               s2, s2, 24               /* s2 = qAlpha(~d2) */

    INTERPOLATE_PIXEL_255 a2, t3, t2, t1, AT, t9, t8, t4, t5, t6, t7
    INTERPOLATE_PIXEL_255 a2, s2, s0, t1, s1, t9, t8, t4, t5, t6, t7

    sw                AT, 0(a0)
    sw                s1, 4(a0)
    bnez              a1, 11b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */
    b                 3f
     nop

/* loop used when const_alpha == 255 */
2:
    lw                t0, 0(a0)                /* t0 = d1 */
    lw                t1, 4(a0)                /* t1 = d2 */
    nor               t4, t0, zero
    nor               t5, t1, zero
    srl               t4, t4, 24               /* t4 = qAlpha(~d1) */
    srl               t5, t5, 24               /* t5 = qAlpha(~d2) */
    replv.ph          t2, t4                   /* replicate into both halfwords */
    replv.ph          t3, t5
    addiu             a1, a1, -2

    BYTE_MUL_x2 a2, a2, t8, AT, t2, t3, t9, t4, t5, t6, t7

    sw                t8, 0(a0)                /* t8 carries the first result here */
    sw                AT, 4(a0)
    bnez              a1, 2b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */

/* common exit: restore the saved registers */
3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    addiu             sp, sp, 12
    jr                ra
     nop
    .set              at

END(comp_func_solid_SourceOut_dsp_asm_x2)
1362
LEAF_MIPS_DSP(comp_func_SourceOut_dsp_asm_x2)
/*
 * SourceOut composition of a source span, two pixels per pass.
 *
 * Arguments:
 *   a0 - uint *dest
 *   a1 - const uint *src
 *   a2 - int length
 *   a3 - uint const_alpha
 *
 * Label 11 is the general loop: the source pixels are run through
 * BYTE_MUL_x2 with const_alpha and then through INTERPOLATE_PIXEL_255
 * with (s, qAlpha(~d), d, cia). Label 2 is the shortcut for
 * const_alpha == 255, where the source is only passed through
 * BYTE_MUL_x2 with qAlpha(~d). a2 is counted down by two; the loops
 * leave only at exactly zero. s0-s3 are saved on entry.
 */

    .set              noat
    addiu             sp, sp, -16
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    sw                s3, 12(sp)
    beqz              a2, 3f                   /* empty span: only unwind */
     nop
    li                t9, 0x800080             /* t9 = rounding_factor */
    lui               t8, 0xff00
    li                t0, 0xff
    beq               a3, t0, 2f               /* const_alpha == 255 ? */
     ori              t8, t8, 0xff00           /* delay slot: t8 = 0xff00ff00 (andi_factor) */

/* loop used when const_alpha != 255 */
1:
    li                t5, 0xff
    subu              t7, t5, a3               /* t7 = cia = 255 - const_alpha */
    replv.ph          a3, a3                   /* const_alpha in both halfwords */
11:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    addiu             a2, a2, -2

    BYTE_MUL_x2 t0, t1, AT, s0, a3, a3, t9, t3, t4, t5, t6, 0

    lw                t0, 0(a0)                /* t0 = d1 (source copies now live in AT/s0) */
    lw                t1, 4(a0)                /* t1 = d2 */
    addiu             a1, a1, 8                /* src += 2 pixels */

    nor               t2, t0, zero
    nor               t3, t1, zero
    srl               t2, t2, 24               /* t2 = qAlpha(~d1) */
    srl               t3, t3, 24               /* t3 = qAlpha(~d2) */

    INTERPOLATE_PIXEL_255 AT, t2, t0, t7, s1, t9, t8, t4, t5, t6, s3
    INTERPOLATE_PIXEL_255 s0, t3, t1, t7, s2, t9, t8, t4, t5, t6, s3

    sw                s1, 0(a0)
    sw                s2, 4(a0)
    bnez              a2, 11b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */
    b                 3f
     nop

/* loop used when const_alpha == 255 */
2:
    lw                t2, 0(a0)                /* t2 = d1 */
    lw                t3, 4(a0)                /* t3 = d2 */
    lw                t0, 0(a1)                /* t0 = s1 */
    lw                t1, 4(a1)                /* t1 = s2 */
    nor               t4, t2, zero
    nor               t5, t3, zero
    srl               t4, t4, 24               /* t4 = qAlpha(~d1) */
    srl               t5, t5, 24               /* t5 = qAlpha(~d2) */
    replv.ph          t2, t4                   /* replicate into both halfwords */
    replv.ph          t3, t5
    addiu             a2, a2, -2

    BYTE_MUL_x2 t0, t1, t8, AT, t2, t3, t9, t4, t5, t6, t7

    addiu             a1, a1, 8                /* src += 2 pixels */
    sw                t8, 0(a0)                /* t8 carries the first result here */
    sw                AT, 4(a0)
    bnez              a2, 2b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */

/* common exit: restore the saved registers */
3:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    lw                s3, 12(sp)
    addiu             sp, sp, 16
    jr                ra
     nop
    .set              at

END(comp_func_SourceOut_dsp_asm_x2)
1449
LEAF_MIPS_DSP(comp_func_Source_dsp_asm_x2)
/*
 * Source composition of a source span, two pixels per pass.
 *
 * Arguments:
 *   a0 - uint *dest
 *   a1 - const uint *src
 *   a2 - int length   (counted down by two; the loop leaves only when
 *                      the counter is exactly zero)
 *   a3 - uint const_alpha
 *
 * Every pixel pair goes through INTERPOLATE_PIXEL_255 with the inputs
 * (s, const_alpha, d, ialpha), where ialpha = 255 - const_alpha.
 * s0 and s1 are saved on entry and restored at label 2.
 */

    .set              noat
    addiu             sp, sp, -8
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    beqz              a2, 2f                   /* empty span: only unwind */
     nop
    li                t9, 0x800080             /* t9 = rounding_factor */
    lui               t8, 0xff00
    ori               t8, t8, 0xff00           /* t8 = 0xff00ff00 (andi_factor) */
    li                t7, 0xff
    subu              t7, t7, a3               /* t7 = ialpha = 255 - const_alpha */
1:
    lw                t0, 0(a0)                /* t0 = d1 */
    lw                t1, 4(a0)                /* t1 = d2 */
    lw                t2, 0(a1)                /* t2 = s1 */
    lw                t3, 4(a1)                /* t3 = s2 */
    addiu             a2, a2, -2
    addiu             a1, a1, 8                /* src += 2 pixels */

    INTERPOLATE_PIXEL_255 t2, a3, t0, t7, AT, t9, t8, t4, t5, t6, s1
    INTERPOLATE_PIXEL_255 t3, a3, t1, t7, s0, t9, t8, t4, t5, t6, s1

    sw                AT, 0(a0)
    sw                s0, 4(a0)
    bnez              a2, 1b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */
2:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    addiu             sp, sp, 8
    jr                ra
     nop
    .set              at

END(comp_func_Source_dsp_asm_x2)
1493
LEAF_MIPS_DSP(qt_blend_argb32_on_argb32_mips_dsp_asm_x2)
/*
 * Blend an ARGB32 span onto an ARGB32 span with a constant alpha,
 * two pixels per pass.
 *
 * Arguments:
 *   a0 - uint *dest
 *   a1 - const uint *src
 *   a2 - int length   (counted down by two; the loop leaves only when
 *                      the counter is exactly zero)
 *   a3 - uint const_alpha
 *
 * Per pixel: s is the source run through BYTE_MUL_x2 with const_alpha,
 * the destination is run through BYTE_MUL_x2 with qAlpha(~s), and the
 * two values are added and stored. s0-s2 are saved on entry.
 */

    .set              noat
    addiu             sp, sp, -12
    sw                s0, 0(sp)
    sw                s1, 4(sp)
    sw                s2, 8(sp)
    beqz              a2, 2f                   /* empty span: only unwind */
     nop
    replv.ph          a3, a3                   /* const_alpha in both halfwords */
    li                t9, 0x800080             /* t9 = rounding_factor */

1:
    lw                t0, 0(a1)                /* t0 = src 1 */
    lw                t1, 4(a1)                /* t1 = src 2 */
    addiu             a2, a2, -2

    BYTE_MUL_x2       t0, t1, AT, t7, a3, a3, t9, t3, t4, t5, t6, 0
                                               /* AT = s1, t7 = s2 */

    lw                t0, 0(a0)                /* t0 = d1 */
    lw                t1, 4(a0)                /* t1 = d2 */
    nor               s1, AT, zero
    nor               s2, t7, zero
    srl               s1, s1, 24               /* s1 = qAlpha(~s1) */
    srl               s2, s2, 24               /* s2 = qAlpha(~s2) */
    replv.ph          s1, s1                   /* replicate into both halfwords */
    replv.ph          s2, s2

    BYTE_MUL_x2 t0, t1, t2, t3, s1, s2, t9, t4, t5, t6, s0

    addiu             a1, a1, 8                /* src += 2 pixels */
    addu              AT, AT, t2               /* result 1 = s1 + scaled d1 */
    addu              t7, t7, t3               /* result 2 = s2 + scaled d2 */
    sw                AT, 0(a0)
    sw                t7, 4(a0)
    bnez              a2, 1b
     addiu            a0, a0, 8                /* delay slot: dest += 2 pixels */

2:
    lw                s0, 0(sp)
    lw                s1, 4(sp)
    lw                s2, 8(sp)
    addiu             sp, sp, 12
    jr                ra
     nop
    .set              at

END(qt_blend_argb32_on_argb32_mips_dsp_asm_x2)
1548
LEAF_MIPS_DSP(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm)
/*
 * Blend an ARGB32 span onto an ARGB32 span without a constant-alpha
 * factor, one pixel per pass.
 *
 * Arguments:
 *   a0 - uint *dest
 *   a1 - const uint *src
 *   a2 - int length
 *
 * Per source pixel s:
 *   - top byte of ~s is zero (alpha 0xff): s is copied to dest;
 *   - s == 0: dest is left untouched (label 1 only advances);
 *   - otherwise: dest = s + BYTE_MUL(dest, qAlpha(~s)), where the
 *     BYTE_MUL is skipped when the destination word is zero.
 */

    beqz              a2, 5f                   /* empty span */
     nop
    li                t7, 0x800080             /* t7 = rounding constant handed to BYTE_MUL */
    b                 2f
     nop

/* skip path: source pixel is zero, only step dest and the counter */
1:
    addiu             a0, a0, 4
    addiu             a2, a2, -1
    beqz              a2, 5f
     nop

/* main entry for each pixel */
2:
    lw                t0, 0(a1)                /* t0 = s = src[i] */
    addiu             a1, a1, 4
    nor               t1, t0, zero
    srl               t1, t1, 24               /* t1 = qAlpha(~s) */
    bnez              t1, 3f                   /* alpha != 0xff: needs blending */
     nop
    sw                t0, 0(a0)                /* dst[i] = src[i] */
    addiu             a2, a2, -1
    bnez              a2, 2b
     addiu            a0, a0, 4
    b                 5f
     nop

/* source alpha below 0xff */
3:
    beqz              t0, 1b                   /* s == 0: nothing to add */
     replv.ph         t6, t1                   /* | 0 | qAlpha(~s) | 0 | qAlpha(~s) | */

    lw                t4, 0(a0)                /* t4 = dst[i] */
    addiu             a2, a2, -1
    beqz              t4, 31f                  /* dst[i] == 0: product is zero */
     move             t8, zero

    /* NOTE(review): t4 is passed both as the first and as the last
     * operand; whether that is safe depends on the BYTE_MUL macro,
     * which is defined outside this routine. */
    BYTE_MUL t4, t8, t6, t7, t1, t2, t3, t4
31:
    addu              t8, t0, t8               /* dst[i] =
                                                * s + BYTE_MUL(dst[i],~qAlpha(s)) */
    sw                t8, 0(a0)
    bnez              a2, 2b
     addiu            a0, a0, 4
    /* falls through to the return below once the span is finished */
5:
    jr                ra
     nop

END(qt_blend_argb32_on_argb32_const_alpha_256_mips_dsp_asm)
1602
1603
/*
 * Endian-dependent helpers for the routine below. They wrap the
 * halfword pack instruction and the partial-word load/store
 * instructions so that the same loop bodies serve both byte orders.
 * All five are removed again with #undef right after the routine.
 */
#if defined(__MIPSEL) && __MIPSEL
# define PACK(r, s, t)  packrl.ph r, s, t
# define SWHI(r, o, b)  swl r, o + 1 (b)
# define SWLO(r, o, b)  swr r, o + 0 (b)
# define LDHI(r, o, b)  lwl r, o + 1 (b)
# define LDLO(r, o, b)  lwr r, o + 2 (b)
#else
# define PACK(r, s, t)  packrl.ph r, t, s
# define SWHI(r, o, b)  swr r, o + 1 (b)
# define SWLO(r, o, b)  swl r, o + 0 (b)
# define LDHI(r, o, b)  lwr r, o + 1 (b)
# define LDLO(r, o, b)  lwl r, o + 2 (b)
#endif

LEAF_MIPS_DSP(qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm)
/*
 * Copy a span of 16-bit (r5g6b5) pixels from src to dst.
 *
 * Arguments:
 *   a0 - dst (*r5g6b5)
 *   a1 - src (const *r5g6b5)
 *   a2 - len (unsigned int), counted in pixels
 *
 * Register usage:
 *   t0-t3 - data words / scratch
 *   t4    - number of passes of the unrolled (8 pixel) loops
 *   t5-t7 - auxiliary scratch
 *
 * Dispatch on the word alignment of the two base addresses:
 *   label 4 - both word aligned (also reached after the both-unaligned
 *             case has copied a single halfword);
 *   label 2 - src aligned, dst unaligned;
 *   label 3 - dst aligned, src unaligned;
 *   label 5 - tail, at most seven pixels copied one halfword at a time.
 * The code assumes both pointers are at least halfword aligned.
 */

    beqz   a2, 0f            /* len == 0: return */
     andi  t0, a0, 0x3       /* delay slot: t0 = dst % 4 */
    andi   t1, a1, 0x3       /* t1 = src % 4 */
    or     t2, t0, t1        /* t2 = t0 | t1 */

    beqz   t2, 4f            /* both aligned */
     nop
    beqz   t0, 3f            /* dst aligned, src unaligned */
     nop
    beqz   t1, 2f            /* src aligned, dst unaligned */
     nop

    /*
     * Neither pointer is word aligned: move one halfword so that
     * both become aligned, then fall into the aligned loop.
     */
    lhu    t0, 0 (a1)        /* t0 = src[0] */
    addiu  a1, a1, 2         /* src++ */
    addiu  a2, a2, -1        /* len-- */
    sh     t0, 0 (a0)        /* dst[0] = t0 */
    addiu  a0, a0, 2         /* dst++ */

    /*
     * Both pointers word aligned: eight pixels (four words) per pass.
     */
4:
    beqz   a2, 0f            /* len == 0: return */
     srl   t4, a2, 3         /* delay slot: t4 = len / 8 */

    beqz   t4, 5f            /* fewer than eight pixels: tail only */
     andi  a2, a2, 0x07      /* delay slot: len = len % 8 */

1:
    lw     t0,  0 (a1)
    lw     t1,  4 (a1)
    lw     t2,  8 (a1)
    lw     t3, 12 (a1)

    addiu  t4, t4, -1        /* one pass less */
    addiu  a1, a1, 16        /* src += 8 pixels */

    sw     t0,  0 (a0)
    sw     t1,  4 (a0)
    sw     t2,  8 (a0)
    sw     t3, 12 (a0)

    bnez   t4, 1b
     addiu a0, a0, 16        /* delay slot: dst += 8 pixels */

    b      5f
     nop

    /*
     * dst is not word aligned (src is): load four aligned words,
     * re-pair their halfwords with PACK so the three middle stores hit
     * word-aligned addresses, and emit the first and last halfword
     * with the partial-word stores SWLO / SWHI.
     */
2:
    beqz   a2, 0f            /* len == 0: return */
     srl   t4, a2, 3         /* delay slot: t4 = len / 8 */
    beqz   t4, 5f            /* fewer than eight pixels: tail only */
     andi  a2, a2, 0x07      /* delay slot: len = len % 8 */

1:
    lw     t0,  0 (a1)
    lw     t1,  4 (a1)
    lw     t2,  8 (a1)
    lw     t3, 12 (a1)

    addiu  t4, t4, -1        /* one pass less */
    addiu  a1, a1, 16        /* src += 8 pixels */

    SWLO(t0, 0, a0)
    PACK(t5, t1, t0)
    PACK(t6, t2, t1)
    PACK(t7, t3, t2)
    SWHI(t3, 14, a0)
    sw     t5,  2 (a0)
    sw     t6,  6 (a0)
    sw     t7, 10 (a0)

    bnez   t4, 1b
     addiu a0, a0, 16        /* delay slot: dst += 8 pixels */

    b      5f
     nop

    /*
     * src is not word aligned (dst is): fetch the first and last
     * halfword with the partial-word loads LDHI / LDLO, the rest with
     * aligned word loads at src + 2, + 6, + 10, then re-pair with PACK
     * before the four aligned stores.
     */
3:
    beqz   a2, 0f            /* len == 0: return */
     srl   t4, a2, 3         /* delay slot: t4 = len / 8 */
    beqz   t4, 5f            /* fewer than eight pixels: tail only */
     andi  a2, a2, 0x07      /* delay slot: len = len % 8 */

1:
    LDHI(t0, 0, a1)
    lw     t1,  2 (a1)
    lw     t2,  6 (a1)
    lw     t3, 10 (a1)
    LDLO(t5, 12, a1)

    addiu  t4, t4, -1        /* one pass less */
    addiu  a1, a1, 16        /* src += 8 pixels */

    PACK(t0, t1, t0)
    PACK(t6, t2, t1)
    PACK(t7, t3, t2)
    sw     t0,  0 (a0)
    PACK(t0, t5, t3)         /* t0 is free again after the store above */
    sw     t6,  4 (a0)
    sw     t7,  8 (a0)
    sw     t0, 12 (a0)

    bnez   t4, 1b
     addiu a0, a0, 16        /* delay slot: dst += 8 pixels */

    /* falls through into the tail */

    /* Tail: the remaining len % 8 pixels, one halfword at a time */
5:
    beqz   a2, 0f
     nop

1:
    lhu    t0, 0 (a1)        /* t0 = src[0] */
    addiu  a2, a2, -1        /* len-- */
    addiu  a1, a1, 2         /* src++ */
    sh     t0, 0 (a0)        /* dst[0] = t0 */
    bnez   a2, 1b            /* more pixels left: loop */
     addiu a0, a0, 2         /* delay slot: dst++ */

0:
    jr     ra
     nop

END(qt_blend_rgb16_on_rgb16_const_alpha_256_mips_dsp_asm)


#undef LDHI
#undef LDLO
#undef PACK
#undef SWHI
#undef SWLO
1777
1778
LEAF_MIPS_DSP(qt_blend_rgb16_on_rgb16_mips_dsp_asm)
/*
 * Blend a span of r5g6b5 pixels onto another with a constant alpha,
 * one pixel per pass.
 *
 * Arguments:
 *   a0 - dst (*r5g6b5)
 *   a1 - src (const *r5g6b5)
 *   a2 - len (unsigned int) - batch length
 *   a3 - alpha (int)
 *
 * Set-up (all derived from the incoming alpha):
 *   alpha  = (alpha * 255) >> 8
 *   a3     = alpha + 1                weight for the 0x07e0 field of src
 *   t4     = (alpha + 1) >> 2         weight for the 0xf81f fields of src
 *   t9     = (255 - alpha) + 1        weight for the 0x07e0 field of dst
 *   t5     = t9 >> 2                  weight for the 0xf81f fields of dst
 * The 0x07e0 product is shifted right by 8, the 0xf81f product by 6,
 * both are re-masked, and the src and dst halves are added.
 */

    beqz    a2, 2f
     li     t9, 0xff            /* delay slot: t9 = 255 */
    sll     t8, a3, 8
    subu    a3, t8, a3          /* a3 = alpha * 255 */
    srl     a3, a3, 8           /* a3 = (alpha * 255) >> 8 */
    subu    t9, t9, a3          /* t9 = 255 - alpha */
    addiu   a3, a3, 1           /* a3 = alpha + 1 */
    srl     t4, a3, 2           /* t4 = (alpha + 1) >> 2 */
    addiu   t9, t9, 1           /* t9 = ialpha + 1 */
    srl     t5, t9, 2           /* t5 = (ialpha + 1) >> 2 */
1:
    lhu     t0, 0(a1)           /* t0 = source pixel */
    lhu     t1, 0(a0)           /* t1 = destination pixel */
    addiu   a2, a2, -1
    addiu   a1, a1, 2           /* src++ */

    /* weighted source */
    andi    t2, t0, 0x07e0
    andi    t0, t0, 0xf81f
    mul     t2, t2, a3
    mul     t0, t0, t4
    srl     t2, t2, 8
    srl     t0, t0, 6
    andi    t2, t2, 0x07e0
    andi    t0, t0, 0xf81f
    or      t0, t0, t2

    /* weighted destination */
    andi    t3, t1, 0x07e0
    andi    t1, t1, 0xf81f
    mul     t3, t3, t9
    mul     t1, t1, t5
    srl     t3, t3, 8
    srl     t1, t1, 6
    andi    t3, t3, 0x07e0
    andi    t1, t1, 0xf81f
    or      t1, t1, t3

    addu    t0, t0, t1          /* sum of both halves */
    sh      t0, 0(a0)
    bgtz    a2, 1b
     addiu  a0, a0, 2           /* delay slot: dst++ */
2:
    jr      ra
     nop

END(qt_blend_rgb16_on_rgb16_mips_dsp_asm)
1829
1830
LEAF_MIPS_DSP(fetchUntransformed_888_asm_mips_dsp)
/*
 * Expand packed 3-byte pixels into 32-bit words with the top byte
 * forced to 0xff.
 *
 * Arguments:
 *   a0 - dst address (address of 32-bit aRGB value)
 *   a1 - src address
 *   a2 - length (pixels)
 *
 * An odd length is made even by converting a single pixel first. The
 * remaining pixels are handled in pairs by one of two loops, chosen by
 * bit 0 of the source address, so that every halfword load is issued
 * at an even address:
 *   label 2 - src odd  : byte + halfword, then halfword + byte
 *   label 3 - src even : halfword + byte, then byte + halfword
 * The register layout comments assume a little-endian target.
 */

    beqz       a2, 4f
     lui       t8, 0xff00    /* delay slot: t8 = 0xff000000 (alpha byte) */
    andi       t0, a2, 0x1
    beqz       t0, 1f        /* even length: straight to the paired loops */
     nop

/* odd length: convert a single pixel byte by byte */
    lbu        t1, 0(a1)     /* t1 = byte 0 (R) */
    lbu        v1, 2(a1)     /* v1 = byte 2 (B) */
    lbu        t0, 1(a1)     /* t0 = byte 1 (G) */
    addiu      a1, a1, 3
    addiu      a2, a2, -1
    sll        t1, t1, 0x10
    or         v1, v1, t8
    sll        t0, t0, 0x8
    or         v1, v1, t1
    or         v1, v1, t0    /* v1 = | ff | R | G | B | */
    sw         v1, 0(a0)
    addiu      a0, a0, 4

    beqz       a2, 4f        /* length was 1: finished */
     nop
1:
    andi       t0, a1, 0x1
    beqz       t0, 3f        /* src at an even address */
     nop

/* src at an odd address */
2:
    lbu        t0, 0(a1)     /* t0 = | 0 | 0 | 0  | R1 | */
    lhu        t1, 1(a1)     /* t1 = | 0 | 0 | B1 | G1 | */
    addiu      a1, a1, 3
    lhu        t2, 0(a1)     /* t2 = | 0 | 0 | G2 | R2 | */
    lbu        t3, 2(a1)     /* t3 = | 0 | 0 | 0  | B2 | */

    sll        t0, t0, 16
    or         t0, t0, t8    /* t0 = | ff | R1 | 0  | 0  | */
    shll.ph    t4, t1, 8     /* t4 = | 0  | 0  | G1 | 0  | */
    srl        t5, t1, 8     /* t5 = | 0  | 0  | 0  | B1 | */
    or         t4, t4, t5    /* t4 = | 0  | 0  | G1 | B1 | */
    or         t0, t0, t4    /* t0 = | ff | R1 | G1 | B1 | */

    shll.ph    t4, t2, 8     /* t4 = | 0  | 0  | R2 | 0  | */
    srl        t5, t2, 8     /* t5 = | 0  | 0  | 0  | G2 | */
    or         t4, t4, t5    /* t4 = | 0  | 0  | R2 | G2 | */
    sll        t4, t4, 8     /* t4 = | 0  | R2 | G2 | 0  | */
    or         t5, t3, t8    /* t5 = | ff | 0  | 0  | B2 | */
    or         t2, t4, t5    /* t2 = | ff | R2 | G2 | B2 | */

    sw         t0, 0(a0)
    addiu      a1, a1, 3
    sw         t2, 4(a0)
    addiu      a2, a2, -2
    bnez       a2, 2b
     addiu     a0, a0, 8     /* delay slot: dst += 2 pixels */
    b          4f
     nop

/* src at an even address */
3:
    lhu        t0, 0(a1)     /* t0 = | 0 | 0 | G1 | R1 | */
    lbu        t1, 2(a1)     /* t1 = | 0 | 0 | 0  | B1 | */
    addiu      a1, a1, 3
    lbu        t2, 0(a1)     /* t2 = | 0 | 0 | 0  | R2 | */
    lhu        t3, 1(a1)     /* t3 = | 0 | 0 | B2 | G2 | */

    srl        t4, t0, 8     /* t4 = | 0  | 0  | 0  | G1 | */
    shll.ph    t5, t0, 8     /* t5 = | 0  | 0  | R1 | 0  | */
    or         t0, t4, t5    /* t0 = | 0  | 0  | R1 | G1 | */
    sll        t6, t0, 8     /* t6 = | 0  | R1 | G1 | 0  | */
    or         t4, t1, t8    /* t4 = | ff | 0  | 0  | B1 | */
    or         t0, t6, t4    /* t0 = | ff | R1 | G1 | B1 | */

    sll        t2, t2, 16    /* t2 = | 0  | R2 | 0  | 0  | */
    srl        t4, t3, 8     /* t4 = | 0  | 0  | 0  | B2 | */
    shll.ph    t5, t3, 8     /* t5 = | 0  | 0  | G2 | 0  | */
    or         t3, t4, t5    /* t3 = | 0  | 0  | G2 | B2 | */
    or         t2, t2, t3
    or         t2, t2, t8    /* t2 = | ff | R2 | G2 | B2 | */

    sw         t0, 0(a0)
    addiu      a1, a1, 3
    sw         t2, 4(a0)
    addiu      a2, a2, -2
    bnez       a2, 3b
     addiu     a0, a0, 8     /* delay slot: dst += 2 pixels */
4:
    jr         ra
     nop

END(fetchUntransformed_888_asm_mips_dsp)
1924
1925
LEAF_MIPS_DSP(fetchUntransformed_444_asm_mips_dsp)
/*
 * Expands 16-bit source pixels laid out as 0x?RGB (4 bits per channel,
 * top nibble ignored) into 32-bit 0xffRRGGBB values, duplicating each
 * nibble into both halves of its output byte.
 *
 * a0 - dst address (address of 32-bit aRGB value)
 * a1 - src address
 * a2 - length (number of pixels); nothing is done when length <= 0
 *
 * v0 - on return, the dst address that was passed in a0
 *
 * Scratch registers: t0-t4, t7, t8, a3. v0 is written only once, in the
 * entry delay slot, so the returned address survives every path.
 */

    lui              t8, 0xff00     /* t8 = 0xff000000 (opaque alpha) */
    li               t4, 0x1        /* t4 = 1, used only before the unrolled
                                     * loop (the loop reuses t4 as scratch) */

    blez             a2, 5f         /* no pixels to process */
     move            v0, a0         /* return the address of the buffer
                                     * used for storing the result */
    andi             t0, a2, 0x1
    beqz             t0, 2f         /* even length, so there is more than
                                     * one pixel: go check src alignment */
     nop
1:
/* process a single pixel */
    lhu              t0, 0(a1)      /* t0 = 0x0000?RGB */
    addiu            a1, a1, 2
    addiu            a2, a2, -1
    andi             t1, t0, 0xf00  /* t1 = 0x00000R00 */
    andi             t2, t0, 0xf0   /* t2 = 0x000000G0 */
    andi             t0, t0, 0xf    /* t0 = 0x0000000B */
    sll              t1, t1, 8      /* t1 = 0x000R0000 */
    sll              t2, t2, 4      /* t2 = 0x00000G00 */
    or               t0, t0, t1
    or               t0, t0, t2     /* t0 = 0x000R0G0B */
    sll              t1, t0, 4      /* t1 = 0x00R0G0B0 */
    or               t0, t0, t1     /* t0 = 0x00RRGGBB */
    or               t0, t0, t8     /* t0 = 0xffRRGGBB */
    sw               t0, 0(a0)
    addiu            a0, a0, 4
    beqz             a2, 5f         /* no more pixels for processing */
     nop
    beq              a2, t4, 4f     /* only one more pixel remained */
     nop
/* check if src memory address is word aligned (here a2 >= 2) */
2:
    andi             t0, a1, 0x3
    beqz             t0, 3f         /* memory is word aligned */
     andi            a3, a2, 0x1    /* a3 is the value of a2 at which the
                                     * unrolled loop must stop
                                     * (1 if odd, 0 if even) */
    b                1b             /* not word aligned,
                                     * go another turn with
                                     * just one pixel processing */
     nop
3:
/*
 * Unrolled loop: two pixels per iteration, loaded with one word access.
 * NOTE(review): the pixel taken from the low halfword is stored first,
 * which matches a little-endian target - confirm that big-endian builds
 * are not expected to use this path.
 */
    lw               t0, 0(a1)
    addiu            a2, a2, -2
    preceu.ph.qbr    t1, t0         /* t1 = | 0 | ?R1  | 0    | G1B1 | */
    preceu.ph.qbl    t2, t0         /* t2 = | 0 | ?R2  | 0    | G2B2 | */
    shll.qb          t3, t1, 4      /* t3 = | 0 | R1 0 | 0    | B1 0 | */
    srl              t4, t3, 4      /* t4 = | 0 | 0 R1 | 0    | 0 B1 | */
    or               t0, t3, t4     /* t0 = | 0 | R1R1 | 0    | B1B1 | */
    andi             t3, t1, 0xf0   /* t3 = | 0 | 0    | 0    | G1 0 | */
    sll              t3, t3, 8      /* t3 = | 0 | 0    | G1 0 | 0    | */
    srl              t4, t3, 4      /* t4 = | 0 | 0    | 0 G1 | 0    | */
    or               t1, t3, t4     /* t1 = | 0 | 0    | G1G1 | 0    | */
    or               t0, t0, t1     /* t0 = | 0 | R1R1 | G1G1 | B1B1 | */
    or               t0, t0, t8     /* t0 = | ff | R1R1 | G1G1 | B1B1 | */

    shll.qb          t3, t2, 4      /* t3 = | 0 | R2 0 | 0    | B2 0 | */
    srl              t4, t3, 4      /* t4 = | 0 | 0 R2 | 0    | 0 B2 | */
    or               t7, t3, t4     /* t7 = | 0 | R2R2 | 0    | B2B2 | */
    andi             t3, t2, 0xf0   /* t3 = | 0 | 0    | 0    | G2 0 | */
    sll              t3, t3, 8      /* t3 = | 0 | 0    | G2 0 | 0    | */
    srl              t4, t3, 4      /* t4 = | 0 | 0    | 0 G2 | 0    | */
    or               t1, t3, t4     /* t1 = | 0 | 0    | G2G2 | 0    | */
    or               t2, t7, t1     /* t2 = | 0 | R2R2 | G2G2 | B2B2 | */
    or               t2, t2, t8     /* t2 = | ff | R2R2 | G2G2 | B2B2 | */

    sw               t0, 0(a0)
    addiu            a1, a1, 4
    sw               t2, 4(a0)
    bne              a2, a3, 3b
     addiu           a0, a0, 8
    beqz             a2, 5f         /* no more pixels for processing */
     nop
4:
/* one more pixel remained (after loop unrolling process finished) */
    lhu              t0, 0(a1)      /* t0 = 0x0000?RGB */
    andi             t1, t0, 0xf00  /* t1 = 0x00000R00 */
    andi             t2, t0, 0xf0   /* t2 = 0x000000G0 */
    andi             t0, t0, 0xf    /* t0 = 0x0000000B */
    sll              t1, t1, 8      /* t1 = 0x000R0000 */
    sll              t2, t2, 4      /* t2 = 0x00000G00 */
    or               t0, t0, t1
    or               t0, t0, t2     /* t0 = 0x000R0G0B */
    sll              t1, t0, 4      /* t1 = 0x00R0G0B0 */
    or               t0, t0, t1     /* t0 = 0x00RRGGBB */
    or               t0, t0, t8     /* t0 = 0xffRRGGBB */
    sw               t0, 0(a0)
5:
    jr               ra
     nop

END(fetchUntransformed_444_asm_mips_dsp)
2040
2041
LEAF_MIPS_DSP(fetchUntransformed_argb8565_premultiplied_asm_mips_dsp)
/*
 * Converts 3-byte source pixels (two bytes holding a 5-6-5 colour value
 * followed by one alpha byte) into 32-bit aaaaaaaa rrrrrrrr gggggggg
 * bbbbbbbb values. Each colour channel is widened to 8 bits by appending
 * its own top bits, then limited to at most the alpha value.
 *
 * a0 - dst address
 * a1 - src address
 * a2 - length (number of pixels); nothing is done when length <= 0
 *
 * Scratch registers: t0-t4.
 */

    blez      a2, 2f          /* same "a2 > 0" condition as the loop test */
     nop

1:
    ulh       t1, 0(a1)       /* source bytes 0-1 (may be unaligned) */
    lbu       t2, 2(a1)       /* t2 = alpha (source byte 2) */
    addiu     a2, a2, -1
    wsbh      t1, t1          /* swap the two bytes: the low 16 bits of t1
                               * are now rrrrrggggggbbbbb.
                               * NOTE(review): which byte ends up on top
                               * depends on target endianness - confirm */
    sll       t0, t1, 8       /* t0[23:0] = rrrrrggg gggbbbbb 00000000
                               * (t0[31:24] is overwritten by alpha below) */
    ins       t0, t1, 3, 16   /* t0[23:0] = rrrrr rrrrr gggggg bbbbb 000 */
    ins       t0, t1, 5, 11   /* t0[23:16] = r4..r0 r4..r2 (red is final),
                               * t0[15:10] = gggggg */
    srl       t4, t1, 9       /* t4[1:0] = g5 g4 */
    replv.qb  t3, t2          /* t3 = alpha replicated into all four bytes */
    ins       t0, t4, 8, 2    /* t0[15:8] = g5..g0 g5 g4 (green is final) */
    ins       t0, t1, 3, 5    /* t0[7:3] = bbbbb */
    srl       t4, t1, 2       /* t4[2:0] = b4 b3 b2 */
    ins       t0, t4, 0, 3    /* t0[7:0] = b4..b0 b4..b2 (blue is final) */
    ins       t0, t2, 24, 8   /* t0 = aaaaaaaa rrrrrrrr gggggggg bbbbbbbb */
    cmpu.lt.qb t3, t0         /* per byte: condition set where alpha < t0 */
    pick.qb   t0, t3, t0      /* per byte: min(alpha, t0 byte) */
    addiu     a1, a1, 3
    sw        t0, 0(a0)
    bgtz      a2, 1b
     addiu    a0, a0, 4
2:
    jr        ra
     nop

END(fetchUntransformed_argb8565_premultiplied_asm_mips_dsp)
2078