#ifndef X86_BITS_STRING_H
#define X86_BITS_STRING_H

/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

/** @file
 *
 * Optimised string operations
 *
 */

extern void * __memcpy ( void *dest, const void *src, size_t len );
extern void * __memcpy_reverse ( void *dest, const void *src, size_t len );

/**
 * Copy memory area (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__constant_memcpy ( void *dest, const void *src, size_t len ) {
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	const union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *src_u = src;
	const void *esi;
	void *edi;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;
	/*
	 * Single-register moves; these are always better than a
	 * string operation.  We can clobber an arbitrary two
	 * registers (data, source, dest can re-use source register)
	 * instead of being restricted to esi and edi.  There's also a
	 * much greater potential for optimising with nearby code.
	 *
	 */
	case 1 : /* 4 bytes */
		dest_u->u8[0]  = src_u->u8[0];
		return dest;
	case 2 : /* 6 bytes */
		dest_u->u16[0] = src_u->u16[0];
		return dest;
	case 4 : /* 4 bytes */
		dest_u->u32[0] = src_u->u32[0];
		return dest;
	/*
	 * Double-register moves; these are probably still a win.
	 *
	 */
	case 3 : /* 12 bytes */
		dest_u->u16[0] = src_u->u16[0];
		dest_u->u8[2]  = src_u->u8[2];
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u8[4]  = src_u->u8[4];
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u16[2] = src_u->u16[2];
		return dest;
	case 8 : /* 10 bytes */
		dest_u->u32[0] = src_u->u32[0];
		dest_u->u32[1] = src_u->u32[1];
		return dest;
	}

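	/* As an illustration of the cases above (a sketch; the exact
	 * code depends on the compiler and optimisation flags), a
	 * six-byte copy such as
	 *
	 *	memcpy ( dest, src, 6 );
	 *
	 * hits case 6 and can compile down to two load/store pairs,
	 * roughly:
	 *
	 *	movl (%esi),%eax	movw 4(%esi),%dx
	 *	movl %eax,(%edi)	movw %dx,4(%edi)
	 *
	 * with no loop, no ecx setup, and no requirement to use
	 * esi/edi at all; the register names here are purely
	 * illustrative.
	 */
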
	/* Even if we have to load up esi and edi ready for a string
	 * operation, we can sometimes save space by using multiple
	 * single-byte "movs" operations instead of loading up ecx and
	 * using "rep movsb".
	 *
	 * "load ecx, rep movsb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "movsl" and "movsb" are 1 byte each, "movsw" is two bytes.
	 * (In 16-bit mode, "movsl" is 2 bytes and "movsw" is 1 byte,
	 * but "movsl" moves twice as much data, so it balances out).
	 *
	 * The cutoff point therefore occurs around 26 bytes; the byte
	 * requirements for each method are:
	 *
	 * len		   16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
	 * #bytes (ecx)	    8  8  8  8  8  8  8  8  8  8  8  8  8  8  8  8
	 * #bytes (no ecx)  4  5  6  7  5  6  7  8  6  7  8  9  7  8  9 10
	 */
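
	/* Worked example for the table above: for len == 23 the
	 * unrolled path below emits five "movsl", one "movsw" and one
	 * "movsb", i.e. 5+2+1 = 8 bytes of code, matching the nominal
	 * 8 bytes for the "load ecx, rep movsb" form; from len == 26
	 * upwards the unrolled form is no better on average, so the
	 * copy is handed off to __memcpy() instead.
	 */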

	esi = src;
	edi = dest;

	if ( len >= 26 )
		return __memcpy ( dest, src, len );

	if ( len >= 6*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "movsl" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "movsw" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "movsb" : "=&D" ( edi ), "=&S" ( esi )
				       : "0" ( edi ), "1" ( esi ) : "memory" );

	return dest;
}

/**
 * Copy memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memcpy ( void *dest, const void *src, size_t len ) {
	if ( __builtin_constant_p ( len ) ) {
		return __constant_memcpy ( dest, src, len );
	} else {
		return __memcpy ( dest, src, len );
	}
}
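
/* A sketch of how the dispatch above plays out at a call site
 * (illustrative only; "packet" and "rx_len" are hypothetical):
 *
 *	uint8_t hdr[6];
 *	memcpy ( hdr, packet, sizeof ( hdr ) );	// constant len == 6:
 *						// inlined as two moves
 *	memcpy ( hdr, packet, rx_len );		// variable len: calls
 *						// __memcpy()
 *
 * __builtin_constant_p ( len ) is true only when the compiler can
 * prove the value at compile time, so variable-length copies pay
 * nothing for the constant-length fast path.
 */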

extern void * __memmove ( void *dest, const void *src, size_t len );

/**
 * Copy (possibly overlapping) memory area
 *
 * @v dest		Destination address
 * @v src		Source address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memmove ( void *dest, const void *src, size_t len ) {
	ssize_t offset = ( dest - src );

	if ( __builtin_constant_p ( offset ) ) {
		if ( offset <= 0 ) {
			return memcpy ( dest, src, len );
		} else {
			return __memcpy_reverse ( dest, src, len );
		}
	} else {
		return __memmove ( dest, src, len );
	}
}
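
/* For example (illustrative only), shifting data within a buffer by
 * a constant amount
 *
 *	memmove ( buf, buf + 4, len );		// offset == -4
 *	memmove ( buf + 4, buf, len );		// offset == +4
 *
 * resolves at compile time to memcpy() and __memcpy_reverse()
 * respectively: a forward copy is safe whenever the destination does
 * not lie ahead of the source within the overlap, and a reverse copy
 * is safe in the opposite case.  When the offset is not a
 * compile-time constant, the generic __memmove() makes this choice
 * at run time.
 */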

/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__memset ( void *dest, int fill, size_t len ) {
	void *discard_D;
	size_t discard_c;

	__asm__ __volatile__ ( "rep stosb"
			       : "=&D" ( discard_D ), "=&c" ( discard_c )
			       : "0" ( dest ), "1" ( len ), "a" ( fill )
			       : "memory" );
	return dest;
}
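
/* The constraints above place the operands where "rep stosb" needs
 * them: dest in edi ("D"), len in ecx ("c") and fill in eax ("a", of
 * which "stosb" stores only the low byte).  A rough C equivalent
 * (sketch only) is:
 *
 *	uint8_t *p = dest;
 *	while ( len-- )
 *		*p++ = fill;
 *
 * The dummy "=&D" and "=&c" outputs tell the compiler that edi and
 * ecx are modified, since a register used as an operand may not also
 * appear in the clobber list.
 */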

/**
 * Fill memory region with zero (where length is a compile-time constant)
 *
 * @v dest		Destination address
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
__constant_memset_zero ( void *dest, size_t len ) {
	union {
		uint32_t u32[2];
		uint16_t u16[4];
		uint8_t  u8[8];
	} __attribute__ (( __may_alias__ )) *dest_u = dest;
	void *edi;
	uint32_t eax;

	switch ( len ) {
	case 0 : /* 0 bytes */
		return dest;

	/* Single-register moves.  Almost certainly better than a
	 * string operation.  We can avoid clobbering any registers,
	 * we can reuse a zero that happens to already be in a
	 * register, and we can optimise away the code entirely if the
	 * memset() is used to clear a region which then gets
	 * immediately overwritten.
	 */
	case 1 : /* 3 bytes */
		dest_u->u8[0] = 0;
		return dest;
	case 2 : /* 5 bytes */
		dest_u->u16[0] = 0;
		return dest;
	case 4 : /* 6 bytes */
		dest_u->u32[0] = 0;
		return dest;

	/* Double-register moves.  Very probably better than a string
	 * operation.
	 */
	case 3 : /* 9 bytes */
		dest_u->u16[0] = 0;
		dest_u->u8[2]  = 0;
		return dest;
	case 5 : /* 10 bytes */
		dest_u->u32[0] = 0;
		dest_u->u8[4]  = 0;
		return dest;
	case 6 : /* 12 bytes */
		dest_u->u32[0] = 0;
		dest_u->u16[2] = 0;
		return dest;
	case 8 : /* 13 bytes */
		dest_u->u32[0] = 0;
		dest_u->u32[1] = 0;
		return dest;
	}
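
	/* As an illustration (hypothetical caller-side code): zeroing
	 * an eight-byte structure
	 *
	 *	struct { uint32_t lo; uint32_t hi; } id;
	 *	memset ( &id, 0, sizeof ( id ) );
	 *
	 * hits case 8 above and becomes two plain 32-bit stores.
	 * Because these are ordinary C stores rather than string
	 * instructions, the compiler may reuse a register that
	 * already holds zero, and may drop either store entirely if
	 * the corresponding field is overwritten before being read.
	 */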

	/* As with memcpy(), we can potentially save space by using
	 * multiple single-byte "stos" instructions instead of loading
	 * up ecx and using "rep stosb".
	 *
	 * "load ecx, rep stosb" is 7 bytes, plus an average of 1 byte
	 * to allow for saving/restoring ecx 50% of the time.
	 *
	 * "stosl" and "stosb" are 1 byte each, "stosw" is two bytes.
	 *
	 * The calculations are therefore the same as for memcpy(),
	 * giving a cutoff point of around 26 bytes.
	 */
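
	/* Worked example (counting code bytes, as in the memcpy()
	 * analysis): zeroing 20 bytes emits five "stosl"
	 * instructions, i.e. 5 bytes of code, against a nominal 8
	 * bytes for "load ecx, rep stosb"; from 26 bytes upwards the
	 * "rep" form is no worse on average, so __memset() is used
	 * instead.
	 */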

	edi = dest;
	eax = 0;

	if ( len >= 26 )
		return __memset ( dest, 0, len );

	if ( len >= 6*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 5*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 4*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 3*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 2*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( len >= 1*4 )
		__asm__ __volatile__ ( "stosl" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 4 ) >= 2 )
		__asm__ __volatile__ ( "stosw" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );
	if ( ( len % 2 ) >= 1 )
		__asm__ __volatile__ ( "stosb" : "=&D" ( edi ), "=&a" ( eax )
				       : "0" ( edi ), "1" ( eax ) : "memory" );

	return dest;
}

/**
 * Fill memory region
 *
 * @v dest		Destination address
 * @v fill		Fill pattern
 * @v len		Length
 * @ret dest		Destination address
 */
static inline __attribute__ (( always_inline )) void *
memset ( void *dest, int fill, size_t len ) {

	if ( __builtin_constant_p ( fill ) && ( fill == 0 ) &&
	     __builtin_constant_p ( len ) ) {
		return __constant_memset_zero ( dest, len );
	} else {
		return __memset ( dest, fill, len );
	}
}
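
/* Sketch of the resulting dispatch (illustrative only):
 *
 *	memset ( buf, 0, 8 );		// constant zero fill:
 *					// inlined as two stores
 *	memset ( buf, 0xff, 8 );	// non-zero fill: "rep stosb"
 *					// via __memset()
 *	memset ( buf, 0, len );		// variable length: "rep stosb"
 *					// via __memset()
 *
 * Only the constant-zero, constant-length combination takes the
 * store-based path; everything else falls through to the string
 * operation.
 */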

#endif /* X86_BITS_STRING_H */