1; PowerPC optimized drawing methods for Goom
2; © 2003 Guillaume Borios
3; This Source Code is released under the terms of the General Public License
4
5; Change log :
6; 30 May 2003 : File creation
7
8; Section definition : We use a read only code section for the whole file
9.section __TEXT,__text,regular,pure_instructions
10
11
12; --------------------------------------------------------------------------------------
13; Single 32b pixel drawing macros
14; Usage :
15; 	DRAWMETHOD_XXXX_MACRO *pixelIN, *pixelOUT, COLOR, WR1, WR2, WR3, WR4
16;	Only the work registers (WR) can be touched by the macros
17;
18; Available methods :
;	DRAWMETHOD_DFLT_MACRO : Default drawing method (currently expands to PLUS)
;	DRAWMETHOD_PLUS_MACRO : RGB saturated per-channel addition (SLOWEST)
21;	DRAWMETHOD_HALF_MACRO : 50% Transparency color drawing
22;	DRAWMETHOD_OVRW_MACRO : Direct COLOR drawing (FASTEST)
23;	DRAWMETHOD_B_OR_MACRO : Bitwise OR
24;	DRAWMETHOD_BAND_MACRO : Bitwise AND
25;	DRAWMETHOD_BXOR_MACRO : Bitwise XOR
26;	DRAWMETHOD_BNOT_MACRO : Bitwise NOT
27; --------------------------------------------------------------------------------------
28
.macro DRAWMETHOD_OVRW_MACRO
    ;; Overwrite: the destination pixel becomes COLOR.
    ;; Argument 1 = pixelOUT address register, argument 2 = COLOR register.
    ;; pixelIN and the work registers are not used.
    stw     $2,0($1)            ;; *pixelOUT = COLOR
.endmacro
32
.macro DRAWMETHOD_B_OR_MACRO
    ;; Bitwise OR of the source pixel with COLOR; uses WR1 only.
    lwz     $3,0($0)            ;; WR1 = *pixelIN
    or      $3,$2,$3            ;; WR1 = COLOR | WR1
    stw     $3,0($1)            ;; *pixelOUT = WR1
.endmacro
38
.macro DRAWMETHOD_BAND_MACRO
    ;; Bitwise AND of the source pixel with COLOR; uses WR1 only.
    lwz     $3,0($0)            ;; WR1 = *pixelIN
    and     $3,$2,$3            ;; WR1 = COLOR & WR1
    stw     $3,0($1)            ;; *pixelOUT = WR1
.endmacro
44
.macro DRAWMETHOD_BXOR_MACRO
    ;; Bitwise XOR of the source pixel with COLOR; uses WR1 only.
    lwz     $3,0($0)            ;; WR1 = *pixelIN
    xor     $3,$2,$3            ;; WR1 = COLOR ^ WR1
    stw     $3,0($1)            ;; *pixelOUT = WR1
.endmacro
50
.macro DRAWMETHOD_BNOT_MACRO
    ;; Bitwise complement of the source pixel; COLOR is ignored, uses WR1 only.
    lwz     $3,0($0)            ;; WR1 = *pixelIN
    nor     $3,$3,$3            ;; WR1 = ~WR1  (x NOR x == NOT x)
    stw     $3,0($1)            ;; *pixelOUT = WR1
.endmacro
56
.macro DRAWMETHOD_PLUS_MACRO
    ;; Per-channel saturated addition of COLOR to the source pixel.
    ;;   args: pixelIN, pixelOUT, COLOR, WR1, WR2, WR3, WR4
    ;; Green (0x0000FF00) is summed on its own; red and blue (0x00FF00FF)
    ;; are summed together. Each 9th carry bit is stretched into a mask
    ;; that forces the overflowing channel to 0xFF.
    ;; The top byte of the stored word is not meaningful: it receives the
    ;; red carry / saturation mask.
    ;; The channel masks use rlwinm rather than andi. so that CR0 is left
    ;; alone, as the macro contract promises (srawi still updates XER[CA]).
    lwz     $4,0($0)            ;; WR2 = *pixelIN

    ;; ---- green channel -------------------------------------------------
    rlwinm  $3,$4,0,16,23       ;; WR1 = pixel & 0x0000FF00
    rlwinm  $5,$2,0,16,23       ;; WR3 = COLOR & 0x0000FF00
    add     $3,$3,$5            ;; WR1 = green sum, carry lands in 0x00010000
    rlwinm  $5,$3,15,0,0        ;; WR3 = carry moved up to the sign bit
    srawi   $5,$5,23            ;; WR3 = 0xFFFFFF00 on overflow, else 0
    or      $3,$3,$5            ;; WR1 = green saturated (upper bits are junk)

    ;; ---- red and blue channels -----------------------------------------
    lis     $5,0xFF             ;; WR3 = 0x00FF0000
    ori     $5,$5,0xFF          ;; WR3 = 0x00FF00FF (ori: safe even if WR3 is r0)
    and     $4,$4,$5            ;; WR2 = pixel & 0x00FF00FF
    and     $6,$2,$5            ;; WR4 = COLOR & 0x00FF00FF
    add     $4,$4,$6            ;; WR2 = sums, carries in 0x01000000 / 0x00000100
    rlwinm  $6,$4,7,0,0         ;; WR4 = red carry moved to the sign bit
    srawi   $6,$6,15            ;; WR4 = 0xFFFF0000 on red overflow, else 0
    rlwinm  $5,$4,23,0,0        ;; WR3 = blue carry moved to the sign bit
    srawi   $5,$5,31            ;; WR3 = 0xFFFFFFFF on blue overflow, else 0
    rlwimi  $6,$5,0,24,31       ;; WR4 low byte = blue saturation mask
    or      $4,$4,$6            ;; WR2 = red/blue saturated

    ;; ---- merge and store -----------------------------------------------
    rlwimi  $4,$3,0,16,23       ;; WR2 byte 0x0000FF00 = saturated green
    stw     $4,0($1)            ;; *pixelOUT = WR2
.endmacro
79
.macro	DRAWMETHOD_HALF_MACRO
    ;; 50% blend: each colour channel becomes (pixel + COLOR) / 2.
    ;;   args: pixelIN, pixelOUT, COLOR, WR1, WR2, WR3
    ;; Green is summed separately from red/blue so that the 9-bit channel
    ;; sums cannot run into each other; the halving shift then brings
    ;; every sum back to 8 bits.
    ;; The channel masks use rlwinm rather than andi. so that CR0 is left
    ;; alone, as the macro contract promises.
    lwz     $4,0($0)            ;; WR2 = *pixelIN

    ;; ---- green channel -------------------------------------------------
    rlwinm  $3,$4,0,16,23       ;; WR1 = pixel & 0x0000FF00
    rlwinm  $5,$2,0,16,23       ;; WR3 = COLOR & 0x0000FF00
    add     $3,$3,$5            ;; WR1 = 9-bit green sum (bits 0x0001FF00)

    ;; ---- red and blue channels -----------------------------------------
    lis     $5,0xFF             ;; WR3 = 0x00FF0000
    ori     $5,$5,0xFF          ;; WR3 = 0x00FF00FF (ori: safe even if WR3 is r0)
    and     $4,$4,$5            ;; WR2 = pixel & 0x00FF00FF
    and     $5,$2,$5            ;; WR3 = COLOR & 0x00FF00FF
    add     $4,$4,$5            ;; WR2 = 9-bit red and blue sums
    srwi    $4,$4,1             ;; WR2 = halved; red's low bit spills into the
                                ;;       green byte, which is overwritten below

    ;; ---- merge and store -----------------------------------------------
    rlwimi  $4,$3,31,16,23      ;; green byte of WR2 = (green sum >> 1)
    stw     $4,0($1)            ;; *pixelOUT = WR2
.endmacro
94
.macro DRAWMETHOD_DFLT_MACRO
    ;; Default drawing method: an alias for the saturated-add (PLUS) method.
    ;;   args: pixelIN, pixelOUT, COLOR, WR1, WR2, WR3, WR4
    ;; All seven arguments must be forwarded explicitly; invoking the PLUS
    ;; macro without them would expand to instructions with empty operands.
    DRAWMETHOD_PLUS_MACRO	$0,$1,$2,$3,$4,$5,$6
.endmacro
98
99; --------------------------------------------------------------------------------------
100
101
102
103; **************************************************************************************
104; void DRAWMETHOD_PLUS_PPC(unsigned int * buf, unsigned int _col);
105; void DRAWMETHOD_PLUS_2_PPC(unsigned * in, unsigned int * out, unsigned int _col);
106; **************************************************************************************
.align 3
.globl _DRAWMETHOD_PLUS_2_PPC
_DRAWMETHOD_PLUS_2_PPC:
    ;; Saturated add of one pixel, separate source and destination.
    ;;   r3 = in, r4 = out, r5 = _col
    ;;   scratch: r6, r7, r8, r9
    DRAWMETHOD_PLUS_MACRO	r3,r4,r5,r6,r7,r8,r9
    blr
112
.align 3
.globl _DRAWMETHOD_PLUS_PPC
_DRAWMETHOD_PLUS_PPC:
    ;; Saturated add of one pixel, in place.
    ;;   r3 = buf (read and written), r4 = _col
    ;;   scratch: r5, r6, r7, r8 (all volatile registers)
    DRAWMETHOD_PLUS_MACRO	r3,r3,r4,r5,r6,r7,r8
    blr
118
119
120; **************************************************************************************
121; void DRAWMETHOD_HALF_PPC(unsigned int * buf, unsigned int _col);
122; void DRAWMETHOD_HALF_2_PPC(unsigned * in, unsigned int * out, unsigned int _col);
123; **************************************************************************************
.align 3
.globl _DRAWMETHOD_HALF_2_PPC
_DRAWMETHOD_HALF_2_PPC:
    ;; 50% blend of one pixel, separate source and destination.
    ;;   r3 = in, r4 = out, r5 = _col
    ;;   scratch: r6, r7, r8
    DRAWMETHOD_HALF_MACRO	r3,r4,r5,r6,r7,r8
    blr
129
.align 3
.globl _DRAWMETHOD_HALF_PPC
_DRAWMETHOD_HALF_PPC:
    ;; 50% blend of one pixel, in place.
    ;;   r3 = buf (read and written), r4 = _col
    ;;   scratch: r5, r6, r7
    DRAWMETHOD_HALF_MACRO	r3,r3,r4,r5,r6,r7
    blr
135
136
137; **************************************************************************************
138; void DRAW_LINE_PPC(unsigned int *data, int x1, int y1, int x2, int y2, unsigned int col,
139; 			unsigned int screenx, unsigned int screeny)
140; **************************************************************************************
.align 3
.globl _DRAW_LINE_PPC
_DRAW_LINE_PPC:
    ;; Placeholder: line drawing is not implemented yet.
    ;; The routine returns immediately without reading its arguments
    ;; or writing to memory.
    blr
146
147
148; **************************************************************************************
; void ppc_brightness_G4 / ppc_brightness_G5 / ppc_brightness_generic
;		(Pixel * src, Pixel * dest, unsigned int size, unsigned int coeff)
150; **************************************************************************************
151
152
.const
.align 4
;; Constant table used by the AltiVec brightness routines.
;;   +0  : an all-zero vector (the routines in this file skip it and start
;;         loading at +16)
;;   +16 .. +79 : four vperm control vectors. In each 32-bit word the three
;;         0x10 selectors pick byte 0 of the second vperm source and the last
;;         selector 0x0N picks byte N of the first source. With a zero vector
;;         as second source, each word becomes one zero-extended source byte.
vectorZERO:
    .long 0x00000000, 0x00000000, 0x00000000, 0x00000000
    .long 0x10101000, 0x10101001, 0x10101002, 0x10101003	;; source bytes 0..3
    .long 0x10101004, 0x10101005, 0x10101006, 0x10101007	;; source bytes 4..7
    .long 0x10101008, 0x10101009, 0x1010100A, 0x1010100B	;; source bytes 8..11
    .long 0x1010100C, 0x1010100D, 0x1010100E, 0x1010100F	;; source bytes 12..15
161
162
163.section __TEXT,__text,regular,pure_instructions
164
.globl _ppc_brightness_G4
.align 3
;; --------------------------------------------------------------------------
;; AltiVec brightness scaling, with a data-stream prefetch hint (dst).
;;   r3 = src, r4 = dest, r5 = size (32-bit pixels), r6 = coeff
;; Every byte of every pixel becomes min(255, (byte * coeff) >> 8); only the
;; low 16 bits of coeff take part in the multiply (vmulouh).
;; Four pixels (one 16-byte vector) are handled per iteration:
;;   - size is rounded down to a multiple of 4; leftover pixels are untouched
;;   - lvx/stvx ignore the low 4 address bits, so src and dest are expected
;;     to be 16-byte aligned
;; Clobbers: r0, r5-r7, r9-r12, CR0, CTR, v0, v1, v4-v13
;; NOTE(review): coeff travels through the static buffer vectortmpwork, so
;; the routine is not reentrant -- confirm callers are single-threaded.
;; --------------------------------------------------------------------------
_ppc_brightness_G4:
    srwi.   r5,r5,2                 ;; r5 = number of 4-pixel vectors
    beqlr                           ;; none: return (bdnz with CTR = 0 would
                                    ;; otherwise run 2^32 iterations)
    mtctr   r5

    ;; VRSAVE: add v0, v1, v4-v13 to the caller's mask instead of replacing
    ;; it, so the caller's live vector registers stay marked as in use.
    mfspr   r11,256                 ;; r11 = caller's VRSAVE (restored on exit)
    oris    r12,r11,0xCFFC
    mtspr   256,r12

    ;; Position-independent base address in r10; LR is preserved through r0.
    mflr    r0
    bcl     20,31,"L00000000001$pb"
"L00000000001$pb":
    mflr    r10
    mtlr    r0

    addis   r9,r10,ha16(vectorZERO-"L00000000001$pb")
    addi    r9,r9,lo16(vectorZERO-"L00000000001$pb")

    vxor    v0,v0,v0                ;; v0 = zero vector

    ;; v10..v13 = the four byte-to-word permute controls (table + 16 .. + 64)
    addi    r9,r9,16
    lvx     v10,0,r9
    addi    r9,r9,16
    lvx     v11,0,r9
    addi    r9,r9,16
    lvx     v12,0,r9
    addi    r9,r9,16
    lvx     v13,0,r9

    ;; Move coeff and the shift count 8 from GPRs to vector registers by
    ;; going through memory.
    addis   r9,r10,ha16(vectortmpwork-"L00000000001$pb")
    addi    r9,r9,lo16(vectortmpwork-"L00000000001$pb")
    stw     r6,0(r9)                ;; word 0 = coeff
    li      r6,8
    stw     r6,4(r9)                ;; word 1 = 8
    lvx     v9,0,r9
    vspltw  v8,v9,0                 ;; v8 = coeff in every word
    vspltw  v9,v9,1                 ;; v9 = 8 in every word

    li      r9,0                    ;; r9 = byte offset into src / dest
    lis     r7,0x0F01               ;; dst control word 0x0F010000
    b       L7
.align 4
L7:
    lvx     v1,r9,r3                ;; v1 = 4 source pixels

    vperm   v4,v1,v0,v10            ;; v4..v7 = the 16 bytes of v1, each
    add     r10,r9,r3               ;;   (r10 = current source address)
    vperm   v5,v1,v0,v11            ;;   zero-extended to a 32-bit word
    vperm   v6,v1,v0,v12
    vperm   v7,v1,v0,v13

    vmulouh v4,v4,v8                ;; word = byte * (coeff & 0xFFFF)
    dst     r10,r7,3                ;; prefetch hint, stream 3
    vmulouh v5,v5,v8
    vmulouh v6,v6,v8
    vmulouh v7,v7,v8
    vsrw    v4,v4,v9                ;; word >>= 8
    vsrw    v5,v5,v9
    vsrw    v6,v6,v9
    vsrw    v7,v7,v9

    vpkuwus v4,v4,v5                ;; words -> halfwords, unsigned saturate
    vpkuwus v6,v6,v7
    vpkuhus v1,v4,v6                ;; halfwords -> bytes, unsigned saturate

    stvx    v1,r9,r4                ;; store 4 result pixels
    addi    r9,r9,16

    bdnz    L7

    mtspr   256,r11                 ;; restore caller's VRSAVE
    blr
248
249
.globl _ppc_brightness_G5
.align 3
;; --------------------------------------------------------------------------
;; AltiVec brightness scaling, variant without the dst prefetch hint.
;;   r3 = src, r4 = dest, r5 = size (32-bit pixels), r6 = coeff
;; Every byte of every pixel becomes min(255, (byte * coeff) >> 8); only the
;; low 16 bits of coeff take part in the multiply (vmulouh).
;; Four pixels (one 16-byte vector) are handled per iteration:
;;   - size is rounded down to a multiple of 4; leftover pixels are untouched
;;   - lvx/stvx ignore the low 4 address bits, so src and dest are expected
;;     to be 16-byte aligned
;; Clobbers: r0, r5, r6, r9-r12, CR0, CTR, v0, v1, v4-v13
;; NOTE(review): coeff travels through the static buffer vectortmpwork, so
;; the routine is not reentrant -- confirm callers are single-threaded.
;; --------------------------------------------------------------------------
_ppc_brightness_G5:
    srwi.   r5,r5,2                 ;; r5 = number of 4-pixel vectors
    beqlr                           ;; none: return (bdnz with CTR = 0 would
                                    ;; otherwise run 2^32 iterations)
    mtctr   r5

    ;; VRSAVE: add v0, v1, v4-v13 to the caller's mask instead of replacing
    ;; it, so the caller's live vector registers stay marked as in use.
    mfspr   r11,256                 ;; r11 = caller's VRSAVE (restored on exit)
    oris    r12,r11,0xCFFC
    mtspr   256,r12

    ;; Position-independent base address in r10; LR is preserved through r0.
    mflr    r0
    bcl     20,31,"L00000000002$pb"
"L00000000002$pb":
    mflr    r10
    mtlr    r0

    addis   r9,r10,ha16(vectorZERO-"L00000000002$pb")
    addi    r9,r9,lo16(vectorZERO-"L00000000002$pb")

    vxor    v0,v0,v0                ;; v0 = zero vector

    ;; v10..v13 = the four byte-to-word permute controls (table + 16 .. + 64)
    addi    r9,r9,16
    lvx     v10,0,r9
    addi    r9,r9,16
    lvx     v11,0,r9
    addi    r9,r9,16
    lvx     v12,0,r9
    addi    r9,r9,16
    lvx     v13,0,r9

    ;; Move coeff and the shift count 8 from GPRs to vector registers by
    ;; going through memory.
    addis   r9,r10,ha16(vectortmpwork-"L00000000002$pb")
    addi    r9,r9,lo16(vectortmpwork-"L00000000002$pb")
    stw     r6,0(r9)                ;; word 0 = coeff
    li      r6,8
    stw     r6,4(r9)                ;; word 1 = 8
    lvx     v9,0,r9
    vspltw  v8,v9,0                 ;; v8 = coeff in every word
    vspltw  v9,v9,1                 ;; v9 = 8 in every word

    li      r9,0                    ;; r9 = byte offset into src / dest
    b       L6
.align 4
L6:
    lvx     v1,r9,r3                ;; v1 = 4 source pixels

    vperm   v4,v1,v0,v10            ;; v4..v7 = the 16 bytes of v1, each
    vperm   v5,v1,v0,v11            ;;   zero-extended to a 32-bit word
    vperm   v6,v1,v0,v12
    vperm   v7,v1,v0,v13

    vmulouh v4,v4,v8                ;; word = byte * (coeff & 0xFFFF)
    vmulouh v5,v5,v8
    vmulouh v6,v6,v8
    vmulouh v7,v7,v8
    vsrw    v4,v4,v9                ;; word >>= 8
    vsrw    v5,v5,v9
    vsrw    v6,v6,v9
    vsrw    v7,v7,v9

    vpkuwus v4,v4,v5                ;; words -> halfwords, unsigned saturate
    vpkuwus v6,v6,v7
    vpkuhus v1,v4,v6                ;; halfwords -> bytes, unsigned saturate

    stvx    v1,r9,r4                ;; store 4 result pixels
    addi    r9,r9,16

    bdnz    L6

    mtspr   256,r11                 ;; restore caller's VRSAVE
    blr
329
330
.globl _ppc_brightness_generic
.align 3
;; --------------------------------------------------------------------------
;; Scalar brightness scaling, one 32-bit pixel per iteration.
;;   r3 = src, r4 = dest, r5 = size (pixels), r6 = coeff
;; For the three low bytes of each pixel:
;;   out = (byte * coeff) >> 8, forced to 0xFF when the result exceeds 8 bits.
;; The top byte of every output pixel is written as 0.
;; Clobbers: r3, r4, r7-r11, CR0, CTR
;; --------------------------------------------------------------------------
_ppc_brightness_generic:
    cmplwi  cr0,r5,0
    beqlr                           ;; size == 0: nothing to do (bdnz with
                                    ;; CTR = 0 would run 2^32 iterations)
    subi    r3,r3,4                 ;; pre-bias both pointers for the
    subi    r4,r4,4                 ;; update-form lwzu / stwu below
    mtctr   r5
    b       L1
.align 4
L1:
    lwzu    r7,4(r3)                ;; r7 = next source pixel

    rlwinm  r8,r7,16,24,31          ;; r8  = byte 0x00FF0000 of the pixel
    rlwinm  r9,r7,24,24,31          ;; r9  = byte 0x0000FF00
    mullw   r8,r8,r6
    rlwinm  r10,r7,0,24,31          ;; r10 = byte 0x000000FF
    mullw   r9,r9,r6
    srwi    r8,r8,8
    mullw   r10,r10,r6
    srwi    r9,r9,8

    rlwinm. r11,r8,0,0,23           ;; any bit above the low 8?
    beq     L2
    li      r8,0xFF                 ;; yes: saturate
L2:
    srwi    r10,r10,8
    rlwinm. r11,r9,0,0,23
    beq     L3
    li      r9,0xFF
L3:
    rlwinm  r7,r8,16,8,15           ;; r7 = first channel back at 0x00FF0000,
                                    ;;      every other bit cleared
    rlwinm. r11,r10,0,0,23
    beq     L4
    li      r10,0xFF
L4:
    rlwimi  r7,r9,8,16,23           ;; insert second channel at 0x0000FF00
    rlwimi  r7,r10,0,24,31          ;; insert third channel at 0x000000FF

    stwu    r7,4(r4)                ;; store result pixel, advance dest
    bdnz    L1

    blr
374
375
376
.static_data
.align 4
;; 16-byte aligned, zero-initialised scratch vector. The AltiVec brightness
;; routines write two words here and read them back with lvx in order to move
;; values from general-purpose registers to vector registers.
vectortmpwork:
    .space 16
381
382