1; PowerPC optimized drawing methods for Goom
2; © 2003 Guillaume Borios
3; This library is free software; you can redistribute it and/or
4; modify it under the terms of the GNU Library General Public
5; License as published by the Free Software Foundation; either
6; version 2 of the License, or (at your option) any later version.
7;
8; This library is distributed in the hope that it will be useful,
9; but WITHOUT ANY WARRANTY; without even the implied warranty of
10; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11; Library General Public License for more details.
12;
13; You should have received a copy of the GNU Library General Public
14; License along with this library; if not, write to the
15; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
16; Boston, MA 02110-1301, USA.
17
18; Change log :
19; 30 May 2003 : File creation
20
21; Section definition : We use a read only code section for the whole file
22.section __TEXT,__text,regular,pure_instructions
23
24
25; --------------------------------------------------------------------------------------
26; Single 32b pixel drawing macros
27; Usage :
28; 	DRAWMETHOD_XXXX_MACRO *pixelIN, *pixelOUT, COLOR, WR1, WR2, WR3, WR4
29;	Only the work registers (WR) can be touched by the macros
30;
31; Available methods :
32;	DRAWMETHOD_DFLT_MACRO : Default drawing method (Actually PLUS)
33;	DRAWMETHOD_PLUS_MACRO : RGB Saturated per channel addition (SLOWEST)
34;	DRAWMETHOD_HALF_MACRO : 50% Transparency color drawing
35;	DRAWMETHOD_OVRW_MACRO : Direct COLOR drawing (FASTEST)
36;	DRAWMETHOD_B_OR_MACRO : Bitwise OR
37;	DRAWMETHOD_BAND_MACRO : Bitwise AND
38;	DRAWMETHOD_BXOR_MACRO : Bitwise XOR
39;	DRAWMETHOD_BNOT_MACRO : Bitwise NOT
40; --------------------------------------------------------------------------------------
41
; DRAWMETHOD_OVRW_MACRO pixelIN, pixelOUT, COLOR
; Overwrite: stores COLOR ($2) into the word addressed by pixelOUT ($1).
; pixelIN ($0) is never read and no work register is touched.
42.macro DRAWMETHOD_OVRW_MACRO
43    stw		$2,0($1)	;; *$1 <- $2 (destination pixel replaced by COLOR)
44.endmacro
45
; DRAWMETHOD_B_OR_MACRO pixelIN, pixelOUT, COLOR, WR1
; Bitwise OR: *pixelOUT <- *pixelIN | COLOR.
; Clobbers WR1 ($3) only.
46.macro DRAWMETHOD_B_OR_MACRO
47    lwz		$3,0($0)	;; $3 <- *$0 (source pixel)
48    or		$3,$3,$2	;; $3 <- $3 | $2
49    stw		$3,0($1)	;; *$1 <- $3
50.endmacro
51
; DRAWMETHOD_BAND_MACRO pixelIN, pixelOUT, COLOR, WR1
; Bitwise AND: *pixelOUT <- *pixelIN & COLOR.
; Clobbers WR1 ($3) only.
52.macro DRAWMETHOD_BAND_MACRO
53    lwz		$3,0($0)	;; $3 <- *$0 (source pixel)
54    and		$3,$3,$2	;; $3 <- $3 & $2
55    stw		$3,0($1)	;; *$1 <- $3
56.endmacro
57
; DRAWMETHOD_BXOR_MACRO pixelIN, pixelOUT, COLOR, WR1
; Bitwise XOR: *pixelOUT <- *pixelIN ^ COLOR.
; Clobbers WR1 ($3) only.
58.macro DRAWMETHOD_BXOR_MACRO
59    lwz		$3,0($0)	;; $3 <- *$0 (source pixel)
60    xor		$3,$3,$2	;; $3 <- $3 ^ $2
61    stw		$3,0($1)	;; *$1 <- $3
62.endmacro
63
; DRAWMETHOD_BNOT_MACRO pixelIN, pixelOUT, COLOR, WR1
; Bitwise NOT: *pixelOUT <- ~(*pixelIN).
; COLOR ($2) is accepted for signature uniformity but is not used.
; Clobbers WR1 ($3) only.
64.macro DRAWMETHOD_BNOT_MACRO
65    lwz		$3,0($0)	;; $3 <- *$0 (source pixel)
66    nand	$3,$3,$3	;; $3 <- ~($3 & $3) = ~$3
67    stw		$3,0($1)	;; *$1 <- $3
68.endmacro
69
; DRAWMETHOD_PLUS_MACRO pixelIN, pixelOUT, COLOR, WR1, WR2, WR3, WR4
; Per-byte saturating add: *pixelOUT <- *pixelIN + COLOR, computed independently for
; the 0x00FF0000, 0x0000FF00 and 0x000000FF bytes, each clamped to 0xFF on overflow.
; The 0xFF000000 byte of both inputs is masked out of the sums; in the stored result
; it is 0x00, or 0xFF when the 0x00FF0000 byte saturated.
; The 0x0000FF00 byte is summed on its own in WR1; the two other bytes are summed
; together in WR2 (the 0x00FF00FF mask leaves a free byte above each for the carry).
; Clobbers WR1..WR4 ($3..$6) and CR0 (andi. is a record form).
; WR3 ($5) must not be r0: addi reads rA = r0 as the constant 0.
70.macro DRAWMETHOD_PLUS_MACRO
71    lwz		$4,0($0)	;; $4 <- *$0 (source pixel)
72    andi.	$3,$4,0xFF00	;; $3 <- $4 & 0x0000FF00
73    andi.	$5,$2,0xFF00	;; $5 <- $2 & 0x0000FF00
74    add		$3,$3,$5	;; $3 <- $3 + $5 (carry, if any, lands in bit 0x00010000)
75    rlwinm	$5,$3,15,0,0	;; $5 <- carry bit 0x00010000 of $3 rotated up to the sign bit, rest cleared
76    srawi	$5,$5,23	;; $5 <- 0xFFFFFF00 if carry, else 0 (arithmetic shift replicates the sign)
77    or		$3,$3,$5	;; $3 <- $3 | $5 (0x0000FF00 byte forced to 0xFF on overflow)
78    lis		$5,0xFF		;; $5 <- 0x00FF0000
79    addi	$5,$5,0xFF	;; $5 <- 0x00FF00FF
80    and		$4,$4,$5	;; $4 <- $4 & 0x00FF00FF
81    and		$6,$2,$5	;; $6 <- $2 & 0x00FF00FF
82    add		$4,$4,$6	;; $4 <- $4 + $6 (carries land in bits 0x01000000 and 0x00000100)
83    rlwinm	$6,$4,7,0,0	;; $6 <- carry bit 0x01000000 of $4 rotated up to the sign bit, rest cleared
84    srawi	$6,$6,15	;; $6 <- 0xFFFF0000 if carry, else 0
85    rlwinm	$5,$4,23,0,0	;; $5 <- carry bit 0x00000100 of $4 rotated up to the sign bit, rest cleared
86    srawi	$5,$5,31	;; $5 <- 0xFFFFFFFF if carry, else 0
87    rlwimi	$6,$5,0,24,31	;; low byte of $6 <- low byte of $5 (merge both saturation masks)
88    or		$4,$4,$6	;; $4 <- $4 | $6 (overflowed bytes forced to 0xFF)
89    rlwimi	$4,$3,0,16,23	;; 0x0000FF00 byte of $4 <- same byte of $3 (also drops the stray 0x100 carry)
90    stw		$4,0($1)	;; *$1 <- $4
91.endmacro
92
; DRAWMETHOD_HALF_MACRO pixelIN, pixelOUT, COLOR, WR1, WR2, WR3
; 50% blend: each of the 0x00FF0000, 0x0000FF00 and 0x000000FF bytes of *pixelOUT
; becomes (byte of *pixelIN + byte of COLOR) >> 1. The 0xFF000000 byte of the
; result is always 0.
; The 0x0000FF00 byte is summed on its own in WR1; the two other bytes are summed
; together in WR2, halved with a single shift, then the middle byte is merged back.
; Clobbers WR1..WR3 ($3..$5) and CR0 (andi. is a record form).
; WR3 ($5) must not be r0: addi reads rA = r0 as the constant 0.
93.macro	DRAWMETHOD_HALF_MACRO
94    lwz		$4,0($0)	;; $4 <- *$0 (source pixel)
95    andi.	$3,$4,0xFF00	;; $3 <- $4 & 0x0000FF00
96    andi.	$5,$2,0xFF00	;; $5 <- $2 & 0x0000FF00
97    add		$3,$3,$5	;; $3 <- $3 + $5 (9-bit sum in bits 0x0001FF00)
98    lis		$5,0xFF		;; $5 <- 0x00FF0000
99    addi	$5,$5,0xFF	;; $5 <- 0x00FF00FF
100    and		$4,$4,$5	;; $4 <- $4 & 0x00FF00FF
101    and		$5,$2,$5	;; $5 <- $2 & 0x00FF00FF
102    add		$4,$4,$5	;; $4 <- $4 + $5 (two 9-bit sums)
103    srwi	$4,$4,1		;; $4 <- $4 >> 1 (both sums halved; bit 0x8000 now holds a stray low bit)
104    rlwimi	$4,$3,31,16,23	;; 0x0000FF00 byte of $4 <- ($3 >> 1), overwriting the stray bit
105    stw		$4,0($1)	;; *$1 <- $4
106.endmacro
107
; DRAWMETHOD_DFLT_MACRO pixelIN, pixelOUT, COLOR, WR1, WR2, WR3, WR4
; Default drawing method: an alias for DRAWMETHOD_PLUS_MACRO.
; The arguments must be forwarded explicitly: macro parameters are substituted
; per invocation, so calling the inner macro with no operands would expand its
; $0..$6 to empty strings and produce unassemblable instructions.
.macro DRAWMETHOD_DFLT_MACRO
    DRAWMETHOD_PLUS_MACRO	$0,$1,$2,$3,$4,$5,$6
.endmacro
111
112; --------------------------------------------------------------------------------------
113
114
115
116; **************************************************************************************
117; void DRAWMETHOD_PLUS_PPC(unsigned int * buf, unsigned int _col);
118; void DRAWMETHOD_PLUS_2_PPC(unsigned * in, unsigned int * out, unsigned int _col);
119; **************************************************************************************
; _DRAWMETHOD_PLUS_2_PPC : exported leaf wrapper around DRAWMETHOD_PLUS_MACRO.
; The macro is instantiated with r3 = pixelIN, r4 = pixelOUT, r5 = COLOR and
; r6..r9 as work registers, so it stores the saturated sum of *r3 and r5 to *r4.
; Modifies r6, r7, r8, r9 and CR0; no stack frame is created.
120.globl _DRAWMETHOD_PLUS_2_PPC
121.align 3
122_DRAWMETHOD_PLUS_2_PPC:
123    DRAWMETHOD_PLUS_MACRO	r3,r4,r5,r6,r7,r8,r9
124    blr				;; return
125
; _DRAWMETHOD_PLUS_PPC : exported leaf wrapper around DRAWMETHOD_PLUS_MACRO, in-place form.
; The macro is instantiated with r3 as both pixelIN and pixelOUT, r4 = COLOR and
; r5, r6, r7, r9 as work registers, so *r3 is replaced by the saturated sum of *r3 and r4.
; Modifies r5, r6, r7, r9 and CR0; no stack frame is created.
126.globl _DRAWMETHOD_PLUS_PPC
127.align 3
128_DRAWMETHOD_PLUS_PPC:
129    DRAWMETHOD_PLUS_MACRO	r3,r3,r4,r5,r6,r7,r9
130    blr				;; return
131
132
133; **************************************************************************************
134; void DRAWMETHOD_HALF_PPC(unsigned int * buf, unsigned int _col);
135; void DRAWMETHOD_HALF_2_PPC(unsigned * in, unsigned int * out, unsigned int _col);
136; **************************************************************************************
; _DRAWMETHOD_HALF_2_PPC : exported leaf wrapper around DRAWMETHOD_HALF_MACRO.
; The macro is instantiated with r3 = pixelIN, r4 = pixelOUT, r5 = COLOR and
; r6..r8 as work registers, so it stores the 50% blend of *r3 and r5 to *r4.
; Modifies r6, r7, r8 and CR0; no stack frame is created.
137.globl _DRAWMETHOD_HALF_2_PPC
138.align 3
139_DRAWMETHOD_HALF_2_PPC:
140    DRAWMETHOD_HALF_MACRO	r3,r4,r5,r6,r7,r8
141    blr				;; return
142
; _DRAWMETHOD_HALF_PPC : exported leaf wrapper around DRAWMETHOD_HALF_MACRO, in-place form.
; The macro is instantiated with r3 as both pixelIN and pixelOUT, r4 = COLOR and
; r5..r7 as work registers, so *r3 is replaced by the 50% blend of *r3 and r4.
; Modifies r5, r6, r7 and CR0; no stack frame is created.
143.globl _DRAWMETHOD_HALF_PPC
144.align 3
145_DRAWMETHOD_HALF_PPC:
146    DRAWMETHOD_HALF_MACRO	r3,r3,r4,r5,r6,r7
147    blr				;; return
148
149
150; **************************************************************************************
151; void DRAW_LINE_PPC(unsigned int *data, int x1, int y1, int x2, int y2, unsigned int col,
152; 			unsigned int screenx, unsigned int screeny)
153; **************************************************************************************
; _DRAW_LINE_PPC : exported placeholder. The body is empty: the routine returns
; immediately without reading its arguments or writing any memory, so callers
; get no line drawn.
154.globl _DRAW_LINE_PPC
155.align 3
156_DRAW_LINE_PPC:
157    ;; NOT IMPLEMENTED YET
158    blr				;; return
159
160
161; **************************************************************************************
162; void _ppc_brightness(Pixel * src, Pixel * dest, unsigned int size, unsigned int coeff)
163; **************************************************************************************
164
165
; vectorZERO : read-only, 16-byte aligned (2^4) table of five 16-byte vectors.
;   +0        : sixteen zero bytes (the brightness routines in this file skip it and
;               start loading at +16).
;   +16..+79  : four vperm control vectors, loaded into v10..v13 by the brightness
;               routines and used as "vperm vD,v1,v0,vC" with v0 = 0.
; In each control word 0x101010NN, the three 0x10 selectors pick byte 0 of the
; second vperm source (the zero vector) and NN picks byte NN of the first source,
; so every source byte NN is zero-extended into its own 32-bit word.
166.const
167.align 4
168vectorZERO:
169    .long 0,0,0,0
170    .long 0x10101000, 0x10101001, 0x10101002, 0x10101003
171    .long 0x10101004, 0x10101005, 0x10101006, 0x10101007
172    .long 0x10101008, 0x10101009, 0x1010100A, 0x1010100B
173    .long 0x1010100C, 0x1010100D, 0x1010100E, 0x1010100F
174
175
176.section __TEXT,__text,regular,pure_instructions
177
; --------------------------------------------------------------------------------------
; _ppc_brightness_G4 : AltiVec brightness scaling, 16 bytes (four 32-bit words) per pass.
;
; In  : r3 = src, r4 = dest, r5 = size (in 32-bit words), r6 = coeff
; Out : every byte b of the processed source words is written to dest as
;       min(255, (b * (coeff & 0xFFFF)) >> 8)   (vmulouh only multiplies the low
;       halfword of each coefficient word; the packs saturate).
;
; Notes
;   * Only size/4 whole vectors are processed; a trailing size%4 remainder is left
;     untouched. If size < 4 the routine returns at once: loading CTR with 0 would
;     make bdnz wrap around and run ~2^32 iterations.
;   * lvx/stvx ignore the low four address bits, so src and dest are expected to be
;     16-byte aligned.
;   * The bits for the vector registers used here are ORed into VRSAVE (not written
;     over it) so the caller's live-vector bits stay set while this routine runs;
;     the caller's value is restored before returning.
;   * coeff transits through the static vectortmpwork buffer, so the routine is not
;     reentrant.
;
; Clobbers : r0, r5, r6, r7, r9, r10, r11, r12, CTR, CR0, v0, v1, v4-v13.
;            LR is preserved (saved in r0 around the pc-relative base sequence).
; --------------------------------------------------------------------------------------
.globl _ppc_brightness_G4
.align 3
_ppc_brightness_G4:

;; iteration count = size / 4 ; nothing to do when it is zero
    srwi    r5,r5,2
    cmplwi  cr0,r5,0
    beqlr
    mtctr   r5

;; vrsave : keep the caller's bits, add v0,v1,v4-v13 (mask 0xCFFC0000)
    mfspr   r11,256
    oris    r12,r11,0xCFFC
    mtspr   256,r12

;; pc-relative base in r10 (LR saved in r0 and restored)
    mflr    r0
    bcl     20,31,"L00000000001$pb"
"L00000000001$pb":
    mflr    r10
    mtlr    r0

    addis   r9,r10,ha16(vectorZERO-"L00000000001$pb")
    addi    r9,r9,lo16(vectorZERO-"L00000000001$pb")

    vxor    v0,v0,v0 ;; V0 = NULL vector

;; v10..v13 = the four byte -> word permute controls stored after the zero vector
    addi    r9,r9,16
    lvx     v10,0,r9
    addi    r9,r9,16
    lvx     v11,0,r9
    addi    r9,r9,16
    lvx     v12,0,r9
    addi    r9,r9,16
    lvx     v13,0,r9

;; v8 = coeff in every word, v9 = shift count 8 in every word (through memory)
    addis   r9,r10,ha16(vectortmpwork-"L00000000001$pb")
    addi    r9,r9,lo16(vectortmpwork-"L00000000001$pb")
    stw     r6,0(r9)
    li      r6,8
    stw     r6,4(r9)
    lvx     v9,0,r9
    vspltw  v8,v9,0
    vspltw  v9,v9,1

;; r9 = byte offset into src/dest, r7 = control word for the dst prefetch hint
    li      r9,0
    lis     r7,0x0F01
    b       L7
.align 4
L7:
    lvx     v1,r9,r3            ;; v1 = 16 source bytes

    vperm   v4,v1,v0,v10        ;; bytes 0-3   zero-extended to words
    add     r10,r9,r3           ;; r10 = address of the current source vector
    vperm   v5,v1,v0,v11        ;; bytes 4-7
    vperm   v6,v1,v0,v12        ;; bytes 8-11
    vperm   v7,v1,v0,v13        ;; bytes 12-15

    vmulouh v4,v4,v8            ;; byte * low 16 bits of coeff
    dst     r10,r7,3            ;; data-stream touch on stream 3 from r10
    vmulouh v5,v5,v8
    vmulouh v6,v6,v8
    vmulouh v7,v7,v8
    vsrw    v4,v4,v9            ;; >> 8
    vsrw    v5,v5,v9
    vsrw    v6,v6,v9
    vsrw    v7,v7,v9

    vpkuwus v4,v4,v5            ;; words -> halfwords, unsigned saturate
    vpkuwus v6,v6,v7
    vpkuhus v1,v4,v6            ;; halfwords -> bytes, unsigned saturate

    stvx    v1,r9,r4            ;; store 16 result bytes
    addi    r9,r9,16

    bdnz    L7

    mtspr   256,r11             ;; restore the caller's VRSAVE
    blr
261
262
; --------------------------------------------------------------------------------------
; _ppc_brightness_G5 : AltiVec brightness scaling, 16 bytes (four 32-bit words) per pass.
; Same computation as the G4 flavour but without the dst prefetch hint in the loop.
;
; In  : r3 = src, r4 = dest, r5 = size (in 32-bit words), r6 = coeff
; Out : every byte b of the processed source words is written to dest as
;       min(255, (b * (coeff & 0xFFFF)) >> 8)   (vmulouh only multiplies the low
;       halfword of each coefficient word; the packs saturate).
;
; Notes
;   * Only size/4 whole vectors are processed; a trailing size%4 remainder is left
;     untouched. If size < 4 the routine returns at once: loading CTR with 0 would
;     make bdnz wrap around and run ~2^32 iterations.
;   * lvx/stvx ignore the low four address bits, so src and dest are expected to be
;     16-byte aligned.
;   * The bits for the vector registers used here are ORed into VRSAVE (not written
;     over it) so the caller's live-vector bits stay set while this routine runs;
;     the caller's value is restored before returning.
;   * coeff transits through the static vectortmpwork buffer, so the routine is not
;     reentrant.
;
; Clobbers : r0, r5, r6, r9, r10, r11, r12, CTR, CR0, v0, v1, v4-v13.
;            LR is preserved (saved in r0 around the pc-relative base sequence).
; --------------------------------------------------------------------------------------
.globl _ppc_brightness_G5
.align 3
_ppc_brightness_G5:

;; iteration count = size / 4 ; nothing to do when it is zero
    srwi    r5,r5,2
    cmplwi  cr0,r5,0
    beqlr
    mtctr   r5

;; vrsave : keep the caller's bits, add v0,v1,v4-v13 (mask 0xCFFC0000)
    mfspr   r11,256
    oris    r12,r11,0xCFFC
    mtspr   256,r12

;; pc-relative base in r10 (LR saved in r0 and restored)
    mflr    r0
    bcl     20,31,"L00000000002$pb"
"L00000000002$pb":
    mflr    r10
    mtlr    r0

    addis   r9,r10,ha16(vectorZERO-"L00000000002$pb")
    addi    r9,r9,lo16(vectorZERO-"L00000000002$pb")

    vxor    v0,v0,v0 ;; V0 = NULL vector

;; v10..v13 = the four byte -> word permute controls stored after the zero vector
    addi    r9,r9,16
    lvx     v10,0,r9
    addi    r9,r9,16
    lvx     v11,0,r9
    addi    r9,r9,16
    lvx     v12,0,r9
    addi    r9,r9,16
    lvx     v13,0,r9

;; v8 = coeff in every word, v9 = shift count 8 in every word (through memory)
    addis   r9,r10,ha16(vectortmpwork-"L00000000002$pb")
    addi    r9,r9,lo16(vectortmpwork-"L00000000002$pb")
    stw     r6,0(r9)
    li      r6,8
    stw     r6,4(r9)
    lvx     v9,0,r9
    vspltw  v8,v9,0
    vspltw  v9,v9,1

;; r9 = byte offset into src/dest
    li      r9,0
    b       L6
.align 4
L6:
    lvx     v1,r9,r3            ;; v1 = 16 source bytes

    vperm   v4,v1,v0,v10        ;; bytes 0-3   zero-extended to words
    vperm   v5,v1,v0,v11        ;; bytes 4-7
    vperm   v6,v1,v0,v12        ;; bytes 8-11
    vperm   v7,v1,v0,v13        ;; bytes 12-15

    vmulouh v4,v4,v8            ;; byte * low 16 bits of coeff
    vmulouh v5,v5,v8
    vmulouh v6,v6,v8
    vmulouh v7,v7,v8
    vsrw    v4,v4,v9            ;; >> 8
    vsrw    v5,v5,v9
    vsrw    v6,v6,v9
    vsrw    v7,v7,v9

    vpkuwus v4,v4,v5            ;; words -> halfwords, unsigned saturate
    vpkuwus v6,v6,v7
    vpkuhus v1,v4,v6            ;; halfwords -> bytes, unsigned saturate

    stvx    v1,r9,r4            ;; store 16 result bytes
    addi    r9,r9,16

    bdnz    L6

    mtspr   256,r11             ;; restore the caller's VRSAVE
    blr
342
343
; --------------------------------------------------------------------------------------
; _ppc_brightness_generic : scalar brightness scaling, one 32-bit word per iteration.
;
; In  : r3 = src, r4 = dest, r5 = size (number of 32-bit words), r6 = coeff
; Out : for each source word, the 0x00FF0000, 0x0000FF00 and 0x000000FF bytes are
;       each replaced by (byte * coeff) >> 8, forced to 0xFF when the shifted product
;       does not fit in 8 bits. The 0xFF000000 byte of every stored word is 0.
;
; Notes
;   * Returns at once when size is 0: loading CTR with 0 would make bdnz wrap around
;     and run ~2^32 iterations.
;   * src/dest are pre-decremented by 4 so the update forms lwzu/stwu can step
;     through the buffers.
;
; Clobbers : r3, r4, r7, r8, r9, r10, r11, CTR, CR0.
; --------------------------------------------------------------------------------------
.globl _ppc_brightness_generic
.align 3
_ppc_brightness_generic:
    cmplwi  cr0,r5,0
    beqlr                       ;; nothing to process
    subi    r3,r3,4             ;; bias pointers for lwzu/stwu
    subi    r4,r4,4
    mtctr   r5
    b       L1
.align 4
L1:
    lwzu    r7,4(r3)            ;; r7 = next source word

    rlwinm  r8,r7,16,24,31      ;; r8  = (r7 >> 16) & 0xFF
    rlwinm  r9,r7,24,24,31      ;; r9  = (r7 >> 8)  & 0xFF
    mullw   r8,r8,r6
    rlwinm  r10,r7,0,24,31      ;; r10 =  r7        & 0xFF
    mullw   r9,r9,r6
    srwi    r8,r8,8
    mullw   r10,r10,r6
    srwi    r9,r9,8

    rlwinm. r11,r8,0,0,23       ;; any bit above the low byte -> saturate
    beq     L2
    li      r8,0xFF
L2:
    srwi    r10,r10,8
    rlwinm. r11,r9,0,0,23
    beq     L3
    li      r9,0xFF
L3:
    rlwinm  r7,r8,16,8,15       ;; r7 = r8 << 16, every other bit cleared
    rlwinm. r11,r10,0,0,23
    beq     L4
    li      r10,0xFF
L4:
    rlwimi  r7,r9,8,16,23       ;; insert r9 as the 0x0000FF00 byte
    rlwimi  r7,r10,0,24,31      ;; insert r10 as the 0x000000FF byte

    stwu    r7,4(r4)            ;; store result, advance dest
    bdnz    L1

    blr
387
388
389
; vectortmpwork : writable, 16-byte aligned (2^4) scratch vector.
; The AltiVec brightness routines store two words here (the coefficient at +0 and
; the constant 8 at +4) and reload the 16 bytes with lvx to move them into a
; vector register. Being a single shared static buffer, it makes those routines
; non-reentrant.
390.static_data
391.align 4
392vectortmpwork:
393    .long 0,0,0,0
394
395