{
    x86 surface clear routines for HERMES
    Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version
    with the following modification:

    As a special exception, the copyright holders of this library give you
    permission to link this library with independent modules to produce an
    executable, regardless of the license terms of these independent modules, and
    to copy and distribute the resulting executable under terms of your choice,
    provided that you also meet, for each linked independent module, the terms
    and conditions of the license of that module. An independent module is a
    module which is not derived from or based on this library. If you modify
    this library, you may extend this exception to your version of the library,
    but you are not obligated to do so. If you do not wish to do so, delete this
    exception statement from your version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA

    (04/10/99)    Modified ClearX86_8             <Mikko.Tiihonen@hut.fi>
}

{$ASMMODE att}

{
 --------------------------------------------------------------------------
 HermesClearInterface (ebp+..)
   0: char8 *dest
   4: int32 value
   8: unsigned int width (already checked to be >0!)
  12: unsigned int height (already checked to be >0!)
  16: int add
 --------------------------------------------------------------------------
}

{
 ClearX86_32
   Stores the 32 bit value from hci into every pixel of a surface, row by
   row: "width" dwords are written per row, then "add" is added to the
   destination pointer, for "height" rows.

   In:       hci - pointer to the clear interface record laid out above
   Clobbers: eax, ecx, edx, flags
   Saved:    ebp, edi (edi is callee-saved under cdecl and is the implicit
             destination of stosl, so it is pushed and popped here)

   NOTE(review): the routine is not declared nostackframe and reads the
   argument through 8(%ebp) without setting ebp up itself, so it relies on
   ebp already being the frame pointer when the asm body starts.
   The string stores assume the direction flag is clear.
}
procedure ClearX86_32(hci: PHermesClearInterface); cdecl; assembler;
asm
        pushl %ebp              // ebp is reused below as the hci pointer
        pushl %edi              // callee-saved, overwritten by the fill

        movl 8(%ebp),%ebp       // ebp = hci; all later accesses go through it

        movl (%ebp),%edi        // destination
        movl 4(%ebp),%eax       // pixel value

        movl 12(%ebp),%edx      // height (row counter)
.balign 4
.L_y:
        movl 8(%ebp),%ecx       // width = number of dwords in this row
        rep
 stosl

        addl 16(%ebp),%edi      // advance the destination by hci^.add

        decl %edx
        jnz .L_y

        popl %edi
        popl %ebp
end;

{
 ClearX86_24
   Entry point for clearing 24 bit surfaces. The assembler body contains
   no instructions, so calling it leaves the surface described by hci
   untouched.
}
procedure ClearX86_24(hci: PHermesClearInterface); cdecl; assembler;
asm
        // no 24 bit clear is implemented here
end;

{
 ClearX86_16
   Fills a surface of 16 bit pixels with the low word of hci^.value.
   The pixel is duplicated into both halves of eax so that two pixels are
   written per stosl. Each row is handled as:
     - one leading word store if the destination is not dword aligned,
     - width/2 dword stores,
     - one trailing word store if an odd pixel is left over.

   In:       hci - pointer to the clear interface record
   Clobbers: eax, ecx, edx, flags
   Saved:    ebp, ebx, edi (callee-saved under cdecl; ebx is used as a
             scratch register and edi as the store destination)

   NOTE(review): like ClearX86_32 this routine is not nostackframe and
   relies on ebp being the frame pointer on entry so that 8(%ebp) is hci.
   The alignment test only distinguishes "dword aligned" from "not"; it
   presumably expects the destination to be at least word aligned.
}
procedure ClearX86_16(hci: PHermesClearInterface); cdecl; assembler;
asm
        pushl %ebp              // ebp is reused below as the hci pointer
        pushl %ebx              // callee-saved, used as scratch
        pushl %edi              // callee-saved, overwritten by the fill

        movl 8(%ebp),%ebp       // ebp = hci

        movl (%ebp),%edi        // destination
        movl 4(%ebp),%eax       // pixel value

        movl 12(%ebp),%edx      // height (row counter)
        movl %eax,%ebx

        shll $16,%eax           // Duplicate pixel value
        andl $0x0ffff,%ebx

        orl %ebx,%eax           // eax = pixel:pixel
.L_y:
        movl 8(%ebp),%ecx       // width in pixels

        testl $3,%edi           // Check if destination is aligned mod 4
        jz .L_aligned

        movw %ax,(%edi)         // otherwise write one pixel
        addl $2,%edi

        decl %ecx
        jz .L_endline           // the row was a single pixel

.L_aligned:
        shrl $1,%ecx            // ecx = dword count, CF = odd pixel left

rep
 stosl                          // leaves the flags alone, CF is still valid

        jnc .L_endline

        movw %ax,(%edi)         // trailing odd pixel
        addl $2,%edi

.L_endline:
        addl 16(%ebp),%edi      // advance the destination by hci^.add

        decl %edx
        jnz .L_y

        popl %edi
        popl %ebx
        popl %ebp
end;

{
 ClearX86_8
   Fills a surface of 8 bit pixels with the low byte of hci^.value.
   The byte is replicated into all four bytes of eax. Rows wider than
   5 pixels are written as: leading bytes up to dword alignment, then
   dwords, then the remaining 0..3 bytes. Rows of 5 pixels or fewer use a
   plain byte fill, which removes the need for length checks in the main
   loop.

   In:       hci - pointer to the clear interface record
   Clobbers: eax, ecx, edx, flags
   Saved:    ebp, ebx, edi

   The routine is nostackframe, so no prologue or epilogue code is
   generated around it; the callee-saved registers it overwrites
   (ebx as the per-row byte counter, edi as the store destination) are
   therefore pushed and popped explicitly. Both loops leave through the
   single epilogue at .L_done.
   The string stores assume the direction flag is clear.
}
procedure ClearX86_8(hci: PHermesClearInterface); cdecl; nostackframe; assembler;
asm
        pushl %ebp
        movl %esp,%ebp

        movl 8(%ebp),%ebp       // ebp = hci; esp may move freely from here

        pushl %ebx              // callee-saved, per-row byte counter
        pushl %edi              // callee-saved, overwritten by the fill

        movl 4(%ebp),%eax       // pixel value
        movl 12(%ebp),%edx      // height

        movb %al,%ah
        movl (%ebp),%edi        // destination

        movl %eax,%ecx

        shll $16,%eax           // Put the byte pixel value in all four bytes
        andl $0x0ffff,%ecx      // of eax

        movl 8(%ebp),%ebx       // width
        orl %ecx,%eax

        cmpl $5,%ebx            // removes need for extra checks later
        jbe .L_short_y

.balign 4
.L_y:
        testl $3,%edi
        jz .L_aligned

        movl %edi,%ecx
        negl %ecx
        andl $3,%ecx            // ecx = bytes needed to reach dword alignment

        subl %ecx,%ebx          // width > 5, so ebx stays positive

        rep
 stosb

.L_aligned:
        movl %ebx,%ecx

        shrl $2,%ecx            // whole dwords
        andl $3,%ebx            // trailing bytes

        rep
 stosl

        movl %ebx,%ecx
        rep
 stosb

        addl 16(%ebp),%edi      // advance the destination by hci^.add

        decl %edx
        movl 8(%ebp),%ebx       // reload width; mov keeps the flags of decl
        jnz .L_y

        jmp .L_done

// Short loop
.balign 4
.L_short_y:
        movl %ebx,%ecx

        rep
 stosb
        addl 16(%ebp),%edi

        decl %edx
        jnz .L_short_y

.L_done:
        popl %edi
        popl %ebx
        popl %ebp
end;

{
 ClearX86_8 version 2,
 I'm not sure whether this is faster or not...
 too many jumps could confuse the CPU branch guessing

 Same contract as ClearX86_8, but the leading (1..3) and trailing (1..3)
 bytes of a wide row are written with unrolled byte stores instead of
 rep stosb. Rows of 5 pixels or fewer still use a plain byte fill.

   In:       hci - pointer to the clear interface record
   Clobbers: eax, ecx, edx, flags
   Saved:    ebp, ebx, edi

 The routine is nostackframe, so the callee-saved registers it overwrites
 (ebx, edi) are pushed and popped explicitly. Both loops leave through
 the single epilogue at .L_done.
 The string stores assume the direction flag is clear.
}
procedure ClearX86_8_2(hci: PHermesClearInterface); cdecl; nostackframe; assembler;
asm
        pushl %ebp
        movl %esp,%ebp

        movl 8(%ebp),%ebp       // ebp = hci; esp may move freely from here

        pushl %ebx              // callee-saved, per-row byte counter
        pushl %edi              // callee-saved, overwritten by the fill

        movl 4(%ebp),%eax       // pixel value
        movl 12(%ebp),%edx      // height

        movb %al,%ah
        movl (%ebp),%edi        // destination

        movl %eax,%ecx

        shll $16,%eax           // Put the byte pixel value in all four bytes
        andl $0x0ffff,%ecx      // of eax

        movl 8(%ebp),%ebx       // width
        orl %ecx,%eax

        cmpl $5,%ebx            // removes need for extra checks in main loop
        jbe .L_short_y


.balign 4
.L_y:
        testl $3,%edi
        jz .L_aligned

        movl %edi,%ecx
        negl %ecx
        andl $3,%ecx            // ecx = 1..3 bytes to reach dword alignment

        movb %al,(%edi)         // first leading byte
        subl %ecx,%ebx

        incl %edi

        decl %ecx
        jz .L_aligned

        movb %al,(%edi)         // second leading byte
        incl %edi
        decl %ecx
        jz .L_aligned

        movb %al,(%edi)         // third leading byte
        incl %edi

.L_aligned:
        movl %ebx,%ecx

        shrl $2,%ecx            // whole dwords
        andl $3,%ebx            // trailing bytes; sets ZF when there are none

        rep
 stosl                          // leaves the flags alone, ZF is still valid

        jz .L_endline

        movb %al,(%edi)         // Write remaining (1,2 or 3) pixels
        incl %edi
        decl %ebx
        jz .L_endline

        movb %al,(%edi)
        incl %edi
        decl %ebx
        jz .L_endline

        movb %al,(%edi)         // ebx was at most 3, so this is the last one
        incl %edi

.L_endline:
        addl 16(%ebp),%edi      // advance the destination by hci^.add

        decl %edx
        movl 8(%ebp),%ebx       // reload width; mov keeps the flags of decl
        jnz .L_y

        jmp .L_done

// Short loop
.balign 4
.L_short_y:
        movl %ebx,%ecx

        rep
 stosb
        addl 16(%ebp),%edi

        decl %edx
        jnz .L_short_y

.L_done:
        popl %edi
        popl %ebx
        popl %ebp
end;
