#ifndef X86_BITS_STRING_H
#define X86_BITS_STRING_H

/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

/** @file
 *
 * Optimised string operations
 *
 */
34
extern void * __memcpy ( void *dest, const void *src, size_t len );
extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );

/**
 * Copy memory area (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Open-codes small constant-length copies so that the compiler can
 * optimise them together with nearby code; longer copies fall through
 * to a chain of discrete string operations, or to __memcpy() beyond
 * the cutoff length.  The "N bytes" comments on each case record the
 * expected size of the generated machine code for that case.
 */
static inline __attribute__ (( always_inline )) void *
__constant_memcpy ( void *dest, const void *src, size_t len ) {
	/* may_alias unions provide aliasing-safe byte/word/dword
	 * views of the source and destination for the open-coded
	 * cases below.
	 */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	const union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *src_u = src;
	const void *esi;
	void *edi;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/*
	 * Single-register moves; these are always better than a
	 * string operation.  We can clobber an arbitrary two
	 * registers (data, source, dest can re-use source register)
	 * instead of being restricted to esi and edi.  There's also a
	 * much greater potential for optimising with nearby code.
	 *
	 */
	case 1 : /* 4 bytes */
		dest_u->u8[0] = src_u->u8[0];
		return dest;
	case 2 : /* 6 bytes */
		dest_u->u16[0] = src_u->u16[0];
		return dest;
	case 4 : /* 4 bytes */
		dest_u->u32[0] = src_u->u32[0];
		return dest;
	/*
	 * Double-register moves; these are probably still a win.
	 *
	 */
	case 3 : /* 12 bytes */
		dest_u->u16[0] = src_u->u16[0];
		dest_u->u8[2] = src_u->u8[2];
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u8[4] = src_u->u8[4];
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u16[2] = src_u->u16[2];
		return dest;
	case 8 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u32[1] = src_u->u32[1];
		return dest;
	}

	/* Even if we have to load up esi and edi ready for a string
	 * operation, we can sometimes save space by using multiple
	 * single-byte "movs" operations instead of loading up ecx and
	 * using "rep movsb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
	 * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
	 * but "movsl" moves twice as much data, so it balances out).
	 *
	 * The cutoff point therefore occurs around 26 bytes; the byte
	 * requirements for each method are:
	 *
	 * len		   16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
	 * #bytes (ecx)	    8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
	 * #bytes (no ecx)  4  5  6  7  5  6  7  8  6  7  8  9  7  8  9 10
	 */

	esi = src;
	edi = dest;

	if ( len >= 26 )
		return __memcpy ( dest, src, len );

	/* len is a compile-time constant, so only the statements
	 * whose conditions hold survive into the generated code.
	 * Each "movsl" advances both pointers by four bytes; the
	 * "0"/"1" matching constraints chain the updated edi/esi
	 * values from one asm statement into the next.
	 */
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	/* Tail: one "movsw" for a leftover word, one "movsb" for a
	 * leftover byte.
	 */
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );

	return dest;
}
156
/**
 * Copy memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Lengths known at compile time are dispatched to the open-coded
 * __constant_memcpy(); all other lengths use the out-of-line
 * __memcpy().
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {
	return ( __builtin_constant_p ( len ) ?
		 __constant_memcpy ( dest, src, len ) :
		 __memcpy ( dest, src, len ) );
}
173
extern void * __memmove ( void *dest, const void *src, size_t len );

/**
 * Copy (possibly overlapping) memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 *
 * When the relative position of the two areas is known at compile
 * time, the copy direction can be chosen statically; otherwise the
 * out-of-line __memmove() decides at run time.  (Arithmetic on void
 * pointers is a GCC extension.)
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t offset = ( dest - src );

	/* Direction not known at compile time: defer to the
	 * out-of-line implementation.
	 */
	if ( ! __builtin_constant_p ( offset ) )
		return __memmove ( dest, src, len );

	/* Copying to a higher address must proceed backwards to be
	 * overlap-safe; copying downwards (or in place) is safe with
	 * an ordinary forward copy.
	 */
	if ( offset > 0 )
		return __memcpy_reverse ( dest, src, len );
	return memcpy ( dest, src, len );
}
198
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 *
 * Implemented as a single "rep stosb"; stosb stores AL, so only the
 * low 8 bits of the fill value are used.
 */
static inline __attribute__ (( always_inline )) void *
__memset ( void *dest, int fill, size_t len ) {
	void *discard_D;
	void *discard_c;

	/* "rep stosb" consumes edi and ecx; the dummy outputs tell
	 * the compiler those registers are modified.
	 */
	__asm__ __volatile__ ( "rep stosb"
			       : "=&D" ( discard_D ), "=&c" ( discard_c )
			       : "0" ( dest ), "1" ( len ), "a" ( fill )
			       : "memory" );
	return dest;
}
218
/**
 * Fill memory region with zero (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v len		Length
 * @ret dest		Destination address
 *
 * Zero-only counterpart of __constant_memcpy(): small constant
 * lengths are open-coded, longer fills use a chain of discrete string
 * operations or fall back to __memset() beyond the cutoff length.
 * The "N bytes" comments on each case record the expected size of the
 * generated machine code for that case.
 */
static inline __attribute__ (( always_inline )) void *
__constant_memset_zero ( void *dest, size_t len ) {
	/* may_alias union provides an aliasing-safe byte/word/dword
	 * view of the destination for the open-coded cases below.
	 */
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	void *edi;
	uint32_t eax;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;

	/* Single-register moves.  Almost certainly better than a
	 * string operation.  We can avoid clobbering any registers,
	 * we can reuse a zero that happens to already be in a
	 * register, and we can optimise away the code entirely if the
	 * memset() is used to clear a region which then gets
	 * immediately overwritten.
	 */
	case 1 : /* 3 bytes */
		dest_u->u8[0] = 0;
		return dest;
	case 2: /* 5 bytes */
		dest_u->u16[0] = 0;
		return dest;
	case 4: /* 6 bytes */
		dest_u->u32[0] = 0;
		return dest;

	/* Double-register moves.  Very probably better than a string
	 * operation.
	 */
	case 3 : /* 9 bytes */
		dest_u->u16[0] = 0;
		dest_u->u8[2] = 0;
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = 0;
		dest_u->u8[4] = 0;
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = 0;
		dest_u->u16[2] = 0;
		return dest;
	case 8 : /* 13 bytes */
		dest_u->u32[0] = 0;
		dest_u->u32[1] = 0;
		return dest;
	}

	/* As with memcpy(), we can potentially save space by using
	 * multiple single-byte "stos" instructions instead of loading
	 * up ecx and using "rep stosb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
	 *
	 * The calculations are therefore the same as for memcpy(),
	 * giving a cutoff point of around 26 bytes.
	 */

	edi = dest;
	eax = 0;

	if ( len >= 26 )
		return __memset ( dest, 0, len );

	/* len is a compile-time constant, so only the statements
	 * whose conditions hold survive into the generated code.
	 * Each "stosl" stores eax (zero) and advances edi by four
	 * bytes; the "0"/"1" matching constraints chain edi/eax from
	 * one asm statement into the next.
	 */
	if ( len >= 6*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	/* Tail: one "stosw" for a leftover word, one "stosb" for a
	 * leftover byte.
	 */
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );

	return dest;
}
324
/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 *
 * A zero fill of compile-time-constant length can be open-coded via
 * __constant_memset_zero(); anything else uses the generic
 * __memset().
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int fill, size_t len ) {

	/* Both __builtin_constant_p() tests are side-effect-free, so
	 * their evaluation order is immaterial.
	 */
	if ( __builtin_constant_p ( len ) &&
	     __builtin_constant_p ( fill ) && ( fill == 0 ) ) {
		return __constant_memset_zero ( dest, len );
	}
	return __memset ( dest, fill, len );
}
343
#endif /* X86_BITS_STRING_H */