1; PowerPC optimized zoom for Goom
2; © 2001-2003 Guillaume Borios
3; This library is free software; you can redistribute it and/or
4; modify it under the terms of the GNU Library General Public
5; License as published by the Free Software Foundation; either
6; version 2 of the License, or (at your option) any later version.
7;
8; This library is distributed in the hope that it will be useful,
9; but WITHOUT ANY WARRANTY; without even the implied warranty of
10; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11; Library General Public License for more details.
12;
13; You should have received a copy of the GNU Library General Public
14; License along with this library; if not, write to the
15; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
16; Boston, MA 02110-1301, USA.
17
18; Change log :
19; 21 Dec 2003 : Use of altivec is now determined with a parameter
20
21; Section definition : We use a read only section
22.text
23
; names of the functions callable from a C program : ppc_zoom_generic and ppc_zoom_G4
; We declare these labels as global to extend their scope outside this file
26.globl _ppc_zoom_generic
27.globl _ppc_zoom_G4
28
29; Description :
30; This routine dynamically computes and applies a zoom filter
31
32; parameters :
33; r3  <=> unsigned int sizeX (in pixels)
34; r4  <=> unsigned int sizeY (in pixels)
35; r5  <=> unsigned int * frompixmap
36; r6  <=> unsigned int * topixmap
37; r7  <=> unsigned int * brutS
38; r8  <=> unsigned int * brutD
39; r9  <=> unsigned int buffratio
40; r10 <=> int [16][16] precalccoeffs
41
; globals after init
; r3  <=> ax = (sizeX - 1) << 4 : x max in 16th of pixels (replaces old r3)
; r4  <=> ay = (sizeY - 1) << 4 : y max in 16th of pixels (replaces old r4)
; r5  <=> frompixmap (unchanged)
; r6  <=> topixmap - 4 bytes : the loop advances r6 by 4 before each store (replaces r6)
; r11 <=> precalccoeffs (copy of r10, which is reused as a scratch register)
; r12 <=> 0x00FF00FF (mask for parallel 32 bits pixs computing)
; r18 <=> 0 : value stored for out of range pixels (black)
; r19 <=> row size in pixels
; r20 <=> row size in bytes
; r30 <=> read pointer into brutD (copy of r8), advanced by 8 bytes per pixel
; r31 <=> read pointer into brutS (copy of r7), advanced by 8 bytes per pixel
51
; ABI notes :
; r1 is the Stack Pointer (SP) => Do not use
; r13..r31 are non-volatiles => Save them before use and restore them before returning
55
; ----------------------------------------------------------------------
; _ppc_zoom_generic : zoom filter using integer instructions only.
;
; For every destination pixel, one (x, y) pair of 32-bit words is read
; from brutS and one from brutD. The source position is interpolated as
;       p = s + (((d - s) * buffratio) >> 16)
; for x and for y, in 16th of pixels. The integer part of p selects a
; 2x2 block of source pixels, the low 4 bits of px and py select one
; word of precalccoeffs holding four 8-bit weights, and the weighted sum
; of the four pixels (>> 8) is stored. Positions with px >= ax or
; py >= ay (unsigned compares, so negative values are rejected as well)
; produce a black pixel (0).
;
; In  : r3 = sizeX, r4 = sizeY, r5 = frompixmap, r6 = topixmap,
;       r7 = brutS, r8 = brutD, r9 = buffratio, r10 = precalccoeffs
; Out : sizeX * sizeY words written to topixmap, nothing returned in
;       registers
; Clobbers : r0-free volatile set r2..r12, ctr, cr0 (andi.), cr6, cr7.
;       r18..r31 are used but saved/restored below the stack pointer.
;       Only volatile CR fields are touched : cr2..cr4 are callee-saved
;       and this routine has no code to save them.
;
; NOTE(review): the reload at the bottom of the last iteration reads one
; (x, y) pair past the last one consumed, in both brutS and brutD; the
; caller has to make those 8 extra bytes readable.
; NOTE(review): the channel 1 & 3 sum is shifted with srawi, so the top
; byte of the stored pixel is a sign extension of that sum rather than a
; blended alpha value.
; ----------------------------------------------------------------------
_ppc_zoom_generic:

; Saves the used non volatile registers (r18..r31 = 14 words) in the
; Mach-O stack's Red-Zone, without moving the stack pointer
        stmw    r18,-56(r1)

; init
        li      r18,0           ; Default value if out of range : 0 (Black)
        mr      r11,r10         ; r11 = precalccoeffs (r10 becomes scratch)
        lis     r12,0xFF        ; r12 = 0x00FF0000 ...
        mullw   r2,r3,r4        ; Number of pixels to compute
        cmplwi  cr6,r2,0        ; Nothing to draw ? A zero count in CTR would
        beq-    cr6,L3          ; make bdnz wrap around and loop 2^32 times
        mr      r30,r8          ; r30 = brutD read pointer
        slwi    r20,r3,2        ; r20 = row size in bytes
        srawi   r19,r20,2       ; r19 = row size in pixels
        ori     r12,r12,0xFF    ; ... r12 = 0x00FF00FF
        subi    r3,r3,1
        subi    r4,r4,1
        mtspr   ctr,r2          ; Init the loop count (one loop per pixel computed)
        mr      r31,r7          ; r31 = brutS read pointer
        subi    r6,r6,4         ; stwu below pre-increments r6 by 4
        slwi    r3,r3,4         ; r3 = ax = (sizeX - 1) * 16
        slwi    r4,r4,4         ; r4 = ay = (sizeY - 1) * 16

; pre init for loop : operands of the first pixel
        lwz     r2,0(r31)       ; px
        lwz     r29,4(r31)      ; py
        lwz     r8,0(r30)       ; px2
        lwz     r10,4(r30)      ; py2

        b       L1              ; Skips the alignment padding
.align  5
L1:

; computes dynamically the position to fetch :
; p = s + (((d - s) * buffratio) >> 16), for x and y
        sub     r8,r8,r2
        sub     r10,r10,r29
        mullw   r8,r8,r9
        addi    r31,r31,8       ; Next brutS pair
        mullw   r10,r10,r9
        addi    r30,r30,8       ; Next brutD pair

        srawi   r8,r8,16
        srawi   r10,r10,16
        add     r2,r2,r8        ; px
        add     r29,r29,r10     ; py

; if px>=ax or py>=ay goto outofrange
; computes the attenuation coeffs and the original point address
        rlwinm  r10,r2,6,28-6,31-6      ; r10 <- (px << 6) & 0x3C0 = (px%16)*16*4
        cmpl    cr6,0,r2,r3             ; px vs ax, unsigned (cr6 is volatile)
        rlwimi  r10,r29,2,28-2,31-2     ; r10 <- r10 | ((py << 2) & 0x3C) = byte offset of coeffs[px%16][py%16]
        cmpl    cr7,0,r29,r4            ; py vs ay, unsigned
        srawi   r29,r29,4               ; pos computing : integer part of py
        bge-    cr6,L4
        srawi   r2,r2,4                 ; pos computing : integer part of px
        mullw   r29,r29,r19             ; pos computing : row * pixels per row
        bge-    cr7,L4

; Channels notation : 00112233 (AARRVVBB)

        add     r2,r2,r29               ; pos computing : pixel index
        lwzx    r10,r11,r10             ; Loads coefs
        slwi    r2,r2,2                 ; pos computing : byte offset
        add     r2,r2,r5                ; pos computing : address of col1
        rlwinm  r21,r10,0,24,31         ; Isolates coef1 (??????11 -> 00000011)
        lwz     r25,0(r2)               ; Loads col1 -> r25
        lwz     r26,4(r2)               ; Loads col2 -> r26
        rlwinm  r22,r10,24,24,31        ; Isolates coef2 (????22?? -> 00000022)
        rlwinm  r23,r10,16,24,31        ; Isolates coef3 (??33???? -> 00000033)
        add     r2,r2,r20               ; Adds one line for future load of col3 and col4
        and     r8,r25,r12              ; Masks col1 channels 1 & 3 : 0x00XX00XX
        rlwinm  r24,r10,8,24,31         ; Isolates coef4 (44?????? -> 00000044)
        andi.   r25,r25,0xFF00          ; Masks col1 channel 2 : 0x0000XX00
        mullw   r8,r8,r21               ; Applies coef1 on col1 channels 1 & 3

; computes final pixel color
        and     r10,r26,r12             ; Masks col2 channels 1 & 3 : 0x00XX00XX
        lwz     r27,0(r2)               ; Loads col3 -> r27
        mullw   r10,r10,r22             ; Applies coef2 on col2 channels 1 & 3
        mullw   r25,r25,r21             ; Applies coef1 on col1 channel 2
        andi.   r29,r26,0xFF00          ; Masks col2 channel 2 : 0x0000XX00
        mullw   r29,r29,r22             ; Applies coef2 on col2 channel 2
        lwz     r28,4(r2)               ; Loads col4 -> r28
        add     r8,r8,r10               ; Adds col1 & col2 channels 1 & 3
        and     r10,r27,r12             ; Masks col3 channels 1 & 3 : 0x00XX00XX
        add     r25,r25,r29             ; Adds col1 & col2 channel 2
        mullw   r10,r10,r23             ; Applies coef3 on col3 channels 1 & 3
        andi.   r29,r27,0xFF00          ; Masks col3 channel 2 : 0x0000XX00
        mullw   r29,r29,r23             ; Applies coef3 on col3 channel 2
        lwz     r2,0(r31)               ; px of the next pixel
        add     r7,r8,r10               ; Adds col3 to (col1 + col2) channels 1 & 3
        and     r10,r28,r12             ; Masks col4 channels 1 & 3 : 0x00XX00XX
        mullw   r10,r10,r24             ; Applies coef4 on col4 channels 1 & 3
        add     r25,r25,r29             ; Adds col3 to (col1 + col2) channel 2
        lwz     r8,0(r30)               ; px2 of the next pixel
        andi.   r28,r28,0xFF00          ; Masks col4 channel 2 : 0x0000XX00
        add     r7,r7,r10               ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
        lwz     r10,4(r30)              ; py2 of the next pixel
        mullw   r28,r28,r24             ; Applies coef4 on col4 channel 2
        srawi   r7,r7,8                 ; (sum of channels 1 & 3) >> 8
        lwz     r29,4(r31)              ; py of the next pixel
        add     r25,r25,r28             ; Adds col4 to (col1 + col2 + col3) channel 2
        rlwimi  r7,r25,24,16,23         ; (((sum of channels 2) >> 8) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
        stwu    r7,4(r6)                ; Stores the computed pixel
        bdnz    L1                      ; Iterate again if needed
        b       L3                      ; If not, returns from the function


; if out of range : stores black and reloads the operands of the next pixel
L4:
        stwu    r18,4(r6)
        lwz     r8,0(r30)       ; px2
        lwz     r10,4(r30)      ; py2
        lwz     r2,0(r31)       ; px
        lwz     r29,4(r31)      ; py
        bdnz    L1


L3:

; Restore saved registers and return
        lmw     r18,-56(r1)
        blr
179
180
181
182
183
184
185
186
; ----------------------------------------------------------------------
; _ppc_zoom_G4 : same zoom filter as _ppc_zoom_generic, with cache hints
; (dcbz on the destination, AltiVec dst prefetch streams on the inputs).
;
; For every destination pixel, one (x, y) pair of 32-bit words is read
; from brutS and one from brutD. The source position is interpolated as
;       p = s + (((d - s) * buffratio) >> 16)
; for x and for y, in 16th of pixels. The integer part of p selects a
; 2x2 block of source pixels, the low 4 bits of px and py select one
; word of precalccoeffs holding four 8-bit weights, and the weighted sum
; of the four pixels (>> 8) is stored. Positions with px >= ax or
; py >= ay (unsigned compares) produce a black pixel (0).
;
; In  : r3 = sizeX, r4 = sizeY, r5 = frompixmap, r6 = topixmap,
;       r7 = brutS, r8 = brutD, r9 = buffratio, r10 = precalccoeffs
; Out : sizeX * sizeY words written to topixmap, nothing returned in
;       registers
; Clobbers : r0, r2..r12, ctr, cr0 (rlwinm.), cr6, cr7. r17..r31 are used
;       but saved/restored below the stack pointer. Only volatile CR
;       fields are touched : cr2..cr4 are callee-saved and this routine
;       has no code to save them.
;
; NOTE(review): the dcbz test assumes 32-byte cache lines (low 5 address
; bits); on a CPU whose dcbz clears a larger block the guard below is not
; sufficient - confirm the target before reusing this routine elsewhere.
; NOTE(review): the reload at the bottom of the last iteration reads one
; (x, y) pair past the last one consumed, in both brutS and brutD.
; ----------------------------------------------------------------------
_ppc_zoom_G4:

; Saves the used non volatile registers (r17..r31 = 15 words) in the
; Mach-O stack's Red-Zone, without moving the stack pointer
        stmw    r17,-60(r1)

; init
        li      r18,0           ; Default value if out of range : 0 (Black)
        mr      r11,r10         ; r11 = precalccoeffs (r10 becomes scratch)
        lis     r12,0xFF        ; r12 = 0x00FF0000 ...
        mullw   r2,r3,r4        ; Number of pixels to compute
        cmplwi  cr6,r2,0        ; Nothing to draw ? A zero count in CTR would
        beq-    cr6,L300        ; make bdnz wrap around and loop 2^32 times
        mr      r30,r8          ; r30 = brutD read pointer
        slwi    r20,r3,2        ; r20 = row size in bytes
        srawi   r19,r20,2       ; r19 = row size in pixels
        ori     r12,r12,0xFF    ; ... r12 = 0x00FF00FF
        subi    r3,r3,1
        subi    r4,r4,1
        mtspr   ctr,r2          ; Init the loop count (one loop per pixel computed)
        mr      r31,r7          ; r31 = brutS read pointer
        subi    r6,r6,4         ; The loop adds 4 to r6 before each store
        slwi    r3,r3,4         ; r3 = ax = (sizeX - 1) * 16
        slwi    r4,r4,4         ; r4 = ay = (sizeY - 1) * 16

; pre init for loop : operands of the first pixel
        lwz     r2,0(r31)       ; px
        lwz     r29,4(r31)      ; py
        lwz     r8,0(r30)       ; px2
        lwz     r10,4(r30)      ; py2

; Control word shared by the three dst prefetch streams
; (0x0F010000 : block size 0x0F, block count 1, stride 0)
        lis     r17,0x0F01

        b       L100            ; Skips the alignment padding
.align  5
L100:

        addi    r6,r6,4         ; Address of the pixel to store

; Optimization to ensure the destination buffer
; won't be loaded into the data cache : when r6 starts a 32-byte line,
; the line is established zeroed instead of being fetched from memory.
; dcbz clears the WHOLE line, so it is only issued while at least
; 8 pixels (32 bytes) are still to be written; otherwise it would zero
; memory located after the end of the destination buffer.
        rlwinm. r0,r6,0,27,31   ; r0 = r6 & 31
        bne+    L500
        mfctr   r0              ; r0 = pixels left, current one included
        cmplwi  cr6,r0,8
        blt-    cr6,L500
        dcbz    0,r6
L500:

; computes dynamically the position to fetch :
; p = s + (((d - s) * buffratio) >> 16), for x and y
        sub     r8,r8,r2
        sub     r10,r10,r29
        mullw   r8,r8,r9
        addi    r31,r31,8       ; Next brutS pair
        mullw   r10,r10,r9
        addi    r30,r30,8       ; Next brutD pair

        dst     r30,r17,0       ; Prefetch stream 0 : upcoming brutD pairs

        srawi   r8,r8,16
        srawi   r10,r10,16
        add     r2,r2,r8        ; px
        add     r29,r29,r10     ; py

        dst     r31,r17,1       ; Prefetch stream 1 : upcoming brutS pairs

; if px>=ax or py>=ay goto outofrange
; computes the attenuation coeffs and the original point address
        rlwinm  r10,r2,6,28-6,31-6      ; r10 <- (px << 6) & 0x3C0 = (px%16)*16*4
        cmpl    cr6,0,r2,r3             ; px vs ax, unsigned (cr6 is volatile)
        rlwimi  r10,r29,2,28-2,31-2     ; r10 <- r10 | ((py << 2) & 0x3C) = byte offset of coeffs[px%16][py%16]
        cmpl    cr7,0,r29,r4            ; py vs ay, unsigned
        srawi   r29,r29,4               ; pos computing : integer part of py
        bge-    cr6,L400
        srawi   r2,r2,4                 ; pos computing : integer part of px
        mullw   r29,r29,r19             ; pos computing : row * pixels per row
        bge-    cr7,L400

; Channels notation : 00112233 (AARRVVBB)
; Channel 2 is masked with rlwinm rather than andi. : same result,
; without the record-form write to cr0.

        add     r2,r2,r29               ; pos computing : pixel index
        lwzx    r10,r11,r10             ; Loads coefs
        slwi    r2,r2,2                 ; pos computing : byte offset
        add     r2,r2,r5                ; pos computing : address of col1
        rlwinm  r21,r10,0,24,31         ; Isolates coef1 (??????11 -> 00000011)
        lwz     r25,0(r2)               ; Loads col1 -> r25
        lwz     r26,4(r2)               ; Loads col2 -> r26
        rlwinm  r22,r10,24,24,31        ; Isolates coef2 (????22?? -> 00000022)
        rlwinm  r23,r10,16,24,31        ; Isolates coef3 (??33???? -> 00000033)
        add     r2,r2,r20               ; Adds one line for future load of col3 and col4
        and     r8,r25,r12              ; Masks col1 channels 1 & 3 : 0x00XX00XX
        rlwinm  r24,r10,8,24,31         ; Isolates coef4 (44?????? -> 00000044)
        dst     r2,r17,2                ; Prefetch stream 2 : second source row
        rlwinm  r25,r25,0,16,23         ; Masks col1 channel 2 : 0x0000XX00
        mullw   r8,r8,r21               ; Applies coef1 on col1 channels 1 & 3

; computes final pixel color
        and     r10,r26,r12             ; Masks col2 channels 1 & 3 : 0x00XX00XX
        lwz     r27,0(r2)               ; Loads col3 -> r27
        mullw   r10,r10,r22             ; Applies coef2 on col2 channels 1 & 3
        mullw   r25,r25,r21             ; Applies coef1 on col1 channel 2
        rlwinm  r29,r26,0,16,23         ; Masks col2 channel 2 : 0x0000XX00
        mullw   r29,r29,r22             ; Applies coef2 on col2 channel 2
        lwz     r28,4(r2)               ; Loads col4 -> r28
        add     r8,r8,r10               ; Adds col1 & col2 channels 1 & 3
        and     r10,r27,r12             ; Masks col3 channels 1 & 3 : 0x00XX00XX
        add     r25,r25,r29             ; Adds col1 & col2 channel 2
        mullw   r10,r10,r23             ; Applies coef3 on col3 channels 1 & 3
        rlwinm  r29,r27,0,16,23         ; Masks col3 channel 2 : 0x0000XX00
        mullw   r29,r29,r23             ; Applies coef3 on col3 channel 2
        lwz     r2,0(r31)               ; px of the next pixel
        add     r7,r8,r10               ; Adds col3 to (col1 + col2) channels 1 & 3
        and     r10,r28,r12             ; Masks col4 channels 1 & 3 : 0x00XX00XX
        mullw   r10,r10,r24             ; Applies coef4 on col4 channels 1 & 3
        add     r25,r25,r29             ; Adds col3 to (col1 + col2) channel 2
        lwz     r8,0(r30)               ; px2 of the next pixel
        rlwinm  r28,r28,0,16,23         ; Masks col4 channel 2 : 0x0000XX00
        add     r7,r7,r10               ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
        lwz     r10,4(r30)              ; py2 of the next pixel
        mullw   r28,r28,r24             ; Applies coef4 on col4 channel 2
        srawi   r7,r7,8                 ; (sum of channels 1 & 3) >> 8
        lwz     r29,4(r31)              ; py of the next pixel
        add     r25,r25,r28             ; Adds col4 to (col1 + col2 + col3) channel 2
        rlwimi  r7,r25,24,16,23         ; (((sum of channels 2) >> 8) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
        stw     r7,0(r6)                ; Stores the computed pixel
        bdnz    L100                    ; Iterate again if needed
        b       L300                    ; If not, returns from the function


; if out of range : stores black and reloads the operands of the next pixel
L400:
        stw     r18,0(r6)
        lwz     r8,0(r30)       ; px2
        lwz     r10,4(r30)      ; py2
        lwz     r2,0(r31)       ; px
        lwz     r29,4(r31)      ; py
        bdnz    L100


L300:

; Restore saved registers and return
        lmw     r17,-60(r1)
        blr
337