1 /*
2  * This file is part of the Scale2x project.
3  *
4  * Copyright (C) 2001, 2002, 2003, 2004 Andrea Mazzoleni
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19  */
20 
/*
 * This file contains a C and MMX implementation of the Scale2x effect.
 *
 * You can find a high-level description of the effect at:
 *
 * http://scale2x.sourceforge.net/
 *
 * Alternatively at the previous license terms, you are allowed to use this
 * code in your program with these conditions:
 * - the program is not used in commercial activities.
 * - the whole source code of the program is released with the binary.
 * - derivative works of the program are allowed.
 */
34 
35 #if HAVE_CONFIG_H
36 #include <config.h>
37 #endif
38 
39 #include "scale2x.h"
40 
41 #include <assert.h>
42 
43 /***************************************************************************/
44 /* Scale2x C implementation */
45 
46 /**
47  * Define the macro USE_SCALE_RANDOMWRITE to enable
48  * an optimized version which writes memory in random order.
49  * This version is a little faster if you write in system memory.
50  * But it's a lot slower if you write in video memory.
51  * So, enable it only if you are sure to never write directly in video memory.
52  */
53 /* #define USE_SCALE_RANDOMWRITE */
54 
/**
 * Compute both destination rows of the Scale2x effect for a row of 8 bit
 * pixels in a single pass.
 * Every pixel of src1 yields two pixels in dst0 and two pixels in dst1.
 * An output pixel takes the color of the vertical neighbor of its row
 * (src0 for dst0, src2 for dst1) when the horizontal neighbor on its side
 * has that color, provided that src0 differs from src2 and the two
 * horizontal neighbors differ from each other. Otherwise the source pixel
 * is copied.
 * At both ends of the row the edge pixel stands in for the missing neighbor.
 * \param dst0 Destination row on the src0 side, 2 * count pixels are written.
 * \param dst1 Destination row on the src2 side, 2 * count pixels are written.
 * \param src0 Row before the current one.
 * \param src1 Current row.
 * \param src2 Row after the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_8_def_whole(scale2x_uint8* restrict dst0, scale2x_uint8* restrict dst1, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	unsigned remaining;

	assert(count >= 2);

	/* left edge: the pixel is its own left neighbor */
	if (src0[0] == src2[0] || src1[0] == src1[1]) {
		dst0[0] = src1[0];
		dst0[1] = src1[0];
		dst1[0] = src1[0];
		dst1[1] = src1[0];
	} else {
		const scale2x_uint8 up = src0[0];
		const scale2x_uint8 down = src2[0];
		const scale2x_uint8 mid = src1[0];

		/* a neighbor chosen only when it equals mid is just mid */
		dst0[0] = mid;
		dst0[1] = src1[1] == up ? up : mid;
		dst1[0] = mid;
		dst1[1] = src1[1] == down ? down : mid;
	}
	++src0;
	++src1;
	++src2;
	dst0 += 2;
	dst1 += 2;

	/* interior pixels: a real neighbor exists on both sides */
	for (remaining = count - 2; remaining != 0; --remaining) {
		if (src0[0] == src2[0] || src1[-1] == src1[1]) {
			dst0[0] = src1[0];
			dst0[1] = src1[0];
			dst1[0] = src1[0];
			dst1[1] = src1[0];
		} else {
			const scale2x_uint8 up = src0[0];
			const scale2x_uint8 down = src2[0];
			const scale2x_uint8 mid = src1[0];

			dst0[0] = src1[-1] == up ? up : mid;
			dst0[1] = src1[1] == up ? up : mid;
			dst1[0] = src1[-1] == down ? down : mid;
			dst1[1] = src1[1] == down ? down : mid;
		}

		++src0;
		++src1;
		++src2;
		dst0 += 2;
		dst1 += 2;
	}

	/* right edge: the pixel is its own right neighbor */
	if (src0[0] == src2[0] || src1[-1] == src1[0]) {
		dst0[0] = src1[0];
		dst0[1] = src1[0];
		dst1[0] = src1[0];
		dst1[1] = src1[0];
	} else {
		const scale2x_uint8 up = src0[0];
		const scale2x_uint8 down = src2[0];
		const scale2x_uint8 mid = src1[0];

		dst0[0] = src1[-1] == up ? up : mid;
		dst0[1] = mid;
		dst1[0] = src1[-1] == down ? down : mid;
		dst1[1] = mid;
	}
}
113 
/**
 * Compute one outer destination row of the Scale2x effect for 8 bit pixels.
 * Every pixel of src1 yields two pixels of dst. An output pixel takes the
 * color of src0 when the horizontal neighbor on its side has that color,
 * provided that src0 differs from src2 and the two horizontal neighbors
 * differ from each other. Otherwise the source pixel is copied.
 * At both ends of the row the edge pixel stands in for the missing neighbor.
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row on the side of src1 that dst faces.
 * \param src1 Row being scaled.
 * \param src2 Row on the opposite side of src1.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_8_def_border(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	unsigned remaining;

	assert(count >= 2);

	/* left edge: the pixel is its own left neighbor */
	if (src0[0] == src2[0] || src1[0] == src1[1]) {
		dst[0] = src1[0];
		dst[1] = src1[0];
	} else {
		const scale2x_uint8 up = src0[0];
		const scale2x_uint8 mid = src1[0];

		/* a neighbor chosen only when it equals mid is just mid */
		dst[0] = mid;
		dst[1] = src1[1] == up ? up : mid;
	}
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* interior pixels: a real neighbor exists on both sides */
	for (remaining = count - 2; remaining != 0; --remaining) {
		if (src0[0] == src2[0] || src1[-1] == src1[1]) {
			dst[0] = src1[0];
			dst[1] = src1[0];
		} else {
			const scale2x_uint8 up = src0[0];
			const scale2x_uint8 mid = src1[0];

			dst[0] = src1[-1] == up ? up : mid;
			dst[1] = src1[1] == up ? up : mid;
		}

		++src0;
		++src1;
		++src2;
		dst += 2;
	}

	/* right edge: the pixel is its own right neighbor */
	if (src0[0] == src2[0] || src1[-1] == src1[0]) {
		dst[0] = src1[0];
		dst[1] = src1[0];
	} else {
		const scale2x_uint8 up = src0[0];
		const scale2x_uint8 mid = src1[0];

		dst[0] = src1[-1] == up ? up : mid;
		dst[1] = mid;
	}
}
158 
/**
 * Compute a middle destination row (as used by the 2x3 and 2x4 scalers)
 * for a row of 8 bit pixels.
 * Every pixel of src1 yields two pixels of dst. When src0 differs from src2
 * and the two horizontal neighbors differ from each other, an output pixel
 * takes the color of the horizontal neighbor on its side if that neighbor
 * equals src0 while the pixel differs from the src2 pixel on that side, or
 * equals src2 while the pixel differs from the src0 pixel on that side.
 * In every other case the source pixel is copied.
 * At both ends of the row the output on the outer side is always the source
 * pixel.
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row before the current one.
 * \param src1 Current row.
 * \param src2 Row after the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_8_def_center(scale2x_uint8* restrict dst, const scale2x_uint8* restrict src0, const scale2x_uint8* restrict src1, const scale2x_uint8* restrict src2, unsigned count)
{
	unsigned remaining;
	scale2x_uint8 mid;
	scale2x_uint8 out_left;
	scale2x_uint8 out_right;

	assert(count >= 2);

	/* left edge: only the right output can change */
	mid = src1[0];
	out_right = mid;
	if (src0[0] != src2[0] && mid != src1[1]) {
		const scale2x_uint8 right = src1[1];

		if ((right == src0[0] && mid != src2[1]) || (right == src2[0] && mid != src0[1])) {
			out_right = right;
		}
	}
	dst[0] = mid;
	dst[1] = out_right;
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* interior pixels: a real neighbor exists on both sides */
	for (remaining = count - 2; remaining != 0; --remaining) {
		mid = src1[0];
		out_left = mid;
		out_right = mid;
		if (src0[0] != src2[0] && src1[-1] != src1[1]) {
			const scale2x_uint8 up = src0[0];
			const scale2x_uint8 down = src2[0];
			const scale2x_uint8 left = src1[-1];
			const scale2x_uint8 right = src1[1];

			if ((left == up && mid != src2[-1]) || (left == down && mid != src0[-1])) {
				out_left = left;
			}
			if ((right == up && mid != src2[1]) || (right == down && mid != src0[1])) {
				out_right = right;
			}
		}
		dst[0] = out_left;
		dst[1] = out_right;

		++src0;
		++src1;
		++src2;
		dst += 2;
	}

	/* right edge: only the left output can change */
	mid = src1[0];
	out_left = mid;
	if (src0[0] != src2[0] && src1[-1] != mid) {
		const scale2x_uint8 left = src1[-1];

		if ((left == src0[0] && mid != src2[-1]) || (left == src2[0] && mid != src0[-1])) {
			out_left = left;
		}
	}
	dst[0] = out_left;
	dst[1] = mid;
}
203 
/**
 * Compute both destination rows of the Scale2x effect for a row of 16 bit
 * pixels in a single pass.
 * Every pixel of src1 yields two pixels in dst0 and two pixels in dst1.
 * An output pixel takes the color of the vertical neighbor of its row
 * (src0 for dst0, src2 for dst1) when the horizontal neighbor on its side
 * has that color, provided that src0 differs from src2 and the two
 * horizontal neighbors differ from each other. Otherwise the source pixel
 * is copied.
 * At both ends of the row the edge pixel stands in for the missing neighbor.
 * \param dst0 Destination row on the src0 side, 2 * count pixels are written.
 * \param dst1 Destination row on the src2 side, 2 * count pixels are written.
 * \param src0 Row before the current one.
 * \param src1 Current row.
 * \param src2 Row after the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_16_def_whole(scale2x_uint16* restrict dst0, scale2x_uint16* restrict dst1, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	unsigned remaining;

	assert(count >= 2);

	/* left edge: the pixel is its own left neighbor */
	if (src0[0] == src2[0] || src1[0] == src1[1]) {
		dst0[0] = src1[0];
		dst0[1] = src1[0];
		dst1[0] = src1[0];
		dst1[1] = src1[0];
	} else {
		const scale2x_uint16 up = src0[0];
		const scale2x_uint16 down = src2[0];
		const scale2x_uint16 mid = src1[0];

		/* a neighbor chosen only when it equals mid is just mid */
		dst0[0] = mid;
		dst0[1] = src1[1] == up ? up : mid;
		dst1[0] = mid;
		dst1[1] = src1[1] == down ? down : mid;
	}
	++src0;
	++src1;
	++src2;
	dst0 += 2;
	dst1 += 2;

	/* interior pixels: a real neighbor exists on both sides */
	for (remaining = count - 2; remaining != 0; --remaining) {
		if (src0[0] == src2[0] || src1[-1] == src1[1]) {
			dst0[0] = src1[0];
			dst0[1] = src1[0];
			dst1[0] = src1[0];
			dst1[1] = src1[0];
		} else {
			const scale2x_uint16 up = src0[0];
			const scale2x_uint16 down = src2[0];
			const scale2x_uint16 mid = src1[0];

			dst0[0] = src1[-1] == up ? up : mid;
			dst0[1] = src1[1] == up ? up : mid;
			dst1[0] = src1[-1] == down ? down : mid;
			dst1[1] = src1[1] == down ? down : mid;
		}

		++src0;
		++src1;
		++src2;
		dst0 += 2;
		dst1 += 2;
	}

	/* right edge: the pixel is its own right neighbor */
	if (src0[0] == src2[0] || src1[-1] == src1[0]) {
		dst0[0] = src1[0];
		dst0[1] = src1[0];
		dst1[0] = src1[0];
		dst1[1] = src1[0];
	} else {
		const scale2x_uint16 up = src0[0];
		const scale2x_uint16 down = src2[0];
		const scale2x_uint16 mid = src1[0];

		dst0[0] = src1[-1] == up ? up : mid;
		dst0[1] = mid;
		dst1[0] = src1[-1] == down ? down : mid;
		dst1[1] = mid;
	}
}
262 
/**
 * Compute one outer destination row of the Scale2x effect for 16 bit pixels.
 * Every pixel of src1 yields two pixels of dst. An output pixel takes the
 * color of src0 when the horizontal neighbor on its side has that color,
 * provided that src0 differs from src2 and the two horizontal neighbors
 * differ from each other. Otherwise the source pixel is copied.
 * At both ends of the row the edge pixel stands in for the missing neighbor.
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row on the side of src1 that dst faces.
 * \param src1 Row being scaled.
 * \param src2 Row on the opposite side of src1.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_16_def_border(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	unsigned remaining;

	assert(count >= 2);

	/* left edge: the pixel is its own left neighbor */
	if (src0[0] == src2[0] || src1[0] == src1[1]) {
		dst[0] = src1[0];
		dst[1] = src1[0];
	} else {
		const scale2x_uint16 up = src0[0];
		const scale2x_uint16 mid = src1[0];

		/* a neighbor chosen only when it equals mid is just mid */
		dst[0] = mid;
		dst[1] = src1[1] == up ? up : mid;
	}
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* interior pixels: a real neighbor exists on both sides */
	for (remaining = count - 2; remaining != 0; --remaining) {
		if (src0[0] == src2[0] || src1[-1] == src1[1]) {
			dst[0] = src1[0];
			dst[1] = src1[0];
		} else {
			const scale2x_uint16 up = src0[0];
			const scale2x_uint16 mid = src1[0];

			dst[0] = src1[-1] == up ? up : mid;
			dst[1] = src1[1] == up ? up : mid;
		}

		++src0;
		++src1;
		++src2;
		dst += 2;
	}

	/* right edge: the pixel is its own right neighbor */
	if (src0[0] == src2[0] || src1[-1] == src1[0]) {
		dst[0] = src1[0];
		dst[1] = src1[0];
	} else {
		const scale2x_uint16 up = src0[0];
		const scale2x_uint16 mid = src1[0];

		dst[0] = src1[-1] == up ? up : mid;
		dst[1] = mid;
	}
}
307 
/**
 * Compute a middle destination row (as used by the 2x3 and 2x4 scalers)
 * for a row of 16 bit pixels.
 * Every pixel of src1 yields two pixels of dst. When src0 differs from src2
 * and the two horizontal neighbors differ from each other, an output pixel
 * takes the color of the horizontal neighbor on its side if that neighbor
 * equals src0 while the pixel differs from the src2 pixel on that side, or
 * equals src2 while the pixel differs from the src0 pixel on that side.
 * In every other case the source pixel is copied.
 * At both ends of the row the output on the outer side is always the source
 * pixel.
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row before the current one.
 * \param src1 Current row.
 * \param src2 Row after the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_16_def_center(scale2x_uint16* restrict dst, const scale2x_uint16* restrict src0, const scale2x_uint16* restrict src1, const scale2x_uint16* restrict src2, unsigned count)
{
	unsigned remaining;
	scale2x_uint16 mid;
	scale2x_uint16 out_left;
	scale2x_uint16 out_right;

	assert(count >= 2);

	/* left edge: only the right output can change */
	mid = src1[0];
	out_right = mid;
	if (src0[0] != src2[0] && mid != src1[1]) {
		const scale2x_uint16 right = src1[1];

		if ((right == src0[0] && mid != src2[1]) || (right == src2[0] && mid != src0[1])) {
			out_right = right;
		}
	}
	dst[0] = mid;
	dst[1] = out_right;
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* interior pixels: a real neighbor exists on both sides */
	for (remaining = count - 2; remaining != 0; --remaining) {
		mid = src1[0];
		out_left = mid;
		out_right = mid;
		if (src0[0] != src2[0] && src1[-1] != src1[1]) {
			const scale2x_uint16 up = src0[0];
			const scale2x_uint16 down = src2[0];
			const scale2x_uint16 left = src1[-1];
			const scale2x_uint16 right = src1[1];

			if ((left == up && mid != src2[-1]) || (left == down && mid != src0[-1])) {
				out_left = left;
			}
			if ((right == up && mid != src2[1]) || (right == down && mid != src0[1])) {
				out_right = right;
			}
		}
		dst[0] = out_left;
		dst[1] = out_right;

		++src0;
		++src1;
		++src2;
		dst += 2;
	}

	/* right edge: only the left output can change */
	mid = src1[0];
	out_left = mid;
	if (src0[0] != src2[0] && src1[-1] != mid) {
		const scale2x_uint16 left = src1[-1];

		if ((left == src0[0] && mid != src2[-1]) || (left == src2[0] && mid != src0[-1])) {
			out_left = left;
		}
	}
	dst[0] = out_left;
	dst[1] = mid;
}
352 
/**
 * Compute both destination rows of the Scale2x effect for a row of 32 bit
 * pixels in a single pass.
 * Every pixel of src1 yields two pixels in dst0 and two pixels in dst1.
 * An output pixel takes the color of the vertical neighbor of its row
 * (src0 for dst0, src2 for dst1) when the horizontal neighbor on its side
 * has that color, provided that src0 differs from src2 and the two
 * horizontal neighbors differ from each other. Otherwise the source pixel
 * is copied.
 * At both ends of the row the edge pixel stands in for the missing neighbor.
 * \param dst0 Destination row on the src0 side, 2 * count pixels are written.
 * \param dst1 Destination row on the src2 side, 2 * count pixels are written.
 * \param src0 Row before the current one.
 * \param src1 Current row.
 * \param src2 Row after the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_32_def_whole(scale2x_uint32* restrict dst0, scale2x_uint32* restrict dst1, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	unsigned remaining;

	assert(count >= 2);

	/* left edge: the pixel is its own left neighbor */
	if (src0[0] == src2[0] || src1[0] == src1[1]) {
		dst0[0] = src1[0];
		dst0[1] = src1[0];
		dst1[0] = src1[0];
		dst1[1] = src1[0];
	} else {
		const scale2x_uint32 up = src0[0];
		const scale2x_uint32 down = src2[0];
		const scale2x_uint32 mid = src1[0];

		/* a neighbor chosen only when it equals mid is just mid */
		dst0[0] = mid;
		dst0[1] = src1[1] == up ? up : mid;
		dst1[0] = mid;
		dst1[1] = src1[1] == down ? down : mid;
	}
	++src0;
	++src1;
	++src2;
	dst0 += 2;
	dst1 += 2;

	/* interior pixels: a real neighbor exists on both sides */
	for (remaining = count - 2; remaining != 0; --remaining) {
		if (src0[0] == src2[0] || src1[-1] == src1[1]) {
			dst0[0] = src1[0];
			dst0[1] = src1[0];
			dst1[0] = src1[0];
			dst1[1] = src1[0];
		} else {
			const scale2x_uint32 up = src0[0];
			const scale2x_uint32 down = src2[0];
			const scale2x_uint32 mid = src1[0];

			dst0[0] = src1[-1] == up ? up : mid;
			dst0[1] = src1[1] == up ? up : mid;
			dst1[0] = src1[-1] == down ? down : mid;
			dst1[1] = src1[1] == down ? down : mid;
		}

		++src0;
		++src1;
		++src2;
		dst0 += 2;
		dst1 += 2;
	}

	/* right edge: the pixel is its own right neighbor */
	if (src0[0] == src2[0] || src1[-1] == src1[0]) {
		dst0[0] = src1[0];
		dst0[1] = src1[0];
		dst1[0] = src1[0];
		dst1[1] = src1[0];
	} else {
		const scale2x_uint32 up = src0[0];
		const scale2x_uint32 down = src2[0];
		const scale2x_uint32 mid = src1[0];

		dst0[0] = src1[-1] == up ? up : mid;
		dst0[1] = mid;
		dst1[0] = src1[-1] == down ? down : mid;
		dst1[1] = mid;
	}
}
411 
/**
 * Compute one outer destination row of the Scale2x effect for 32 bit pixels.
 * Every pixel of src1 yields two pixels of dst. An output pixel takes the
 * color of src0 when the horizontal neighbor on its side has that color,
 * provided that src0 differs from src2 and the two horizontal neighbors
 * differ from each other. Otherwise the source pixel is copied.
 * At both ends of the row the edge pixel stands in for the missing neighbor.
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row on the side of src1 that dst faces.
 * \param src1 Row being scaled.
 * \param src2 Row on the opposite side of src1.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_32_def_border(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	unsigned remaining;

	assert(count >= 2);

	/* left edge: the pixel is its own left neighbor */
	if (src0[0] == src2[0] || src1[0] == src1[1]) {
		dst[0] = src1[0];
		dst[1] = src1[0];
	} else {
		const scale2x_uint32 up = src0[0];
		const scale2x_uint32 mid = src1[0];

		/* a neighbor chosen only when it equals mid is just mid */
		dst[0] = mid;
		dst[1] = src1[1] == up ? up : mid;
	}
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* interior pixels: a real neighbor exists on both sides */
	for (remaining = count - 2; remaining != 0; --remaining) {
		if (src0[0] == src2[0] || src1[-1] == src1[1]) {
			dst[0] = src1[0];
			dst[1] = src1[0];
		} else {
			const scale2x_uint32 up = src0[0];
			const scale2x_uint32 mid = src1[0];

			dst[0] = src1[-1] == up ? up : mid;
			dst[1] = src1[1] == up ? up : mid;
		}

		++src0;
		++src1;
		++src2;
		dst += 2;
	}

	/* right edge: the pixel is its own right neighbor */
	if (src0[0] == src2[0] || src1[-1] == src1[0]) {
		dst[0] = src1[0];
		dst[1] = src1[0];
	} else {
		const scale2x_uint32 up = src0[0];
		const scale2x_uint32 mid = src1[0];

		dst[0] = src1[-1] == up ? up : mid;
		dst[1] = mid;
	}
}
456 
/**
 * Compute a middle destination row (as used by the 2x3 and 2x4 scalers)
 * for a row of 32 bit pixels.
 * Every pixel of src1 yields two pixels of dst. When src0 differs from src2
 * and the two horizontal neighbors differ from each other, an output pixel
 * takes the color of the horizontal neighbor on its side if that neighbor
 * equals src0 while the pixel differs from the src2 pixel on that side, or
 * equals src2 while the pixel differs from the src0 pixel on that side.
 * In every other case the source pixel is copied.
 * At both ends of the row the output on the outer side is always the source
 * pixel.
 * \param dst Destination row, 2 * count pixels are written.
 * \param src0 Row before the current one.
 * \param src1 Current row.
 * \param src2 Row after the current one.
 * \param count Number of pixels in each source row. It must be at least 2.
 */
static inline void scale2x_32_def_center(scale2x_uint32* restrict dst, const scale2x_uint32* restrict src0, const scale2x_uint32* restrict src1, const scale2x_uint32* restrict src2, unsigned count)
{
	unsigned remaining;
	scale2x_uint32 mid;
	scale2x_uint32 out_left;
	scale2x_uint32 out_right;

	assert(count >= 2);

	/* left edge: only the right output can change */
	mid = src1[0];
	out_right = mid;
	if (src0[0] != src2[0] && mid != src1[1]) {
		const scale2x_uint32 right = src1[1];

		if ((right == src0[0] && mid != src2[1]) || (right == src2[0] && mid != src0[1])) {
			out_right = right;
		}
	}
	dst[0] = mid;
	dst[1] = out_right;
	++src0;
	++src1;
	++src2;
	dst += 2;

	/* interior pixels: a real neighbor exists on both sides */
	for (remaining = count - 2; remaining != 0; --remaining) {
		mid = src1[0];
		out_left = mid;
		out_right = mid;
		if (src0[0] != src2[0] && src1[-1] != src1[1]) {
			const scale2x_uint32 up = src0[0];
			const scale2x_uint32 down = src2[0];
			const scale2x_uint32 left = src1[-1];
			const scale2x_uint32 right = src1[1];

			if ((left == up && mid != src2[-1]) || (left == down && mid != src0[-1])) {
				out_left = left;
			}
			if ((right == up && mid != src2[1]) || (right == down && mid != src0[1])) {
				out_right = right;
			}
		}
		dst[0] = out_left;
		dst[1] = out_right;

		++src0;
		++src1;
		++src2;
		dst += 2;
	}

	/* right edge: only the left output can change */
	mid = src1[0];
	out_left = mid;
	if (src0[0] != src2[0] && src1[-1] != mid) {
		const scale2x_uint32 left = src1[-1];

		if ((left == src0[0] && mid != src2[-1]) || (left == src2[0] && mid != src0[-1])) {
			out_left = left;
		}
	}
	dst[0] = out_left;
	dst[1] = mid;
}
501 
502 /**
503  * Scale by a factor of 2 a row of pixels of 8 bits.
504  * The function is implemented in C.
505  * The pixels over the left and right borders are assumed of the same color of
506  * the pixels on the border.
507  * Note that the implementation is optimized to write data sequentially to
508  * maximize the bandwidth on video memory.
509  * \param src0 Pointer at the first pixel of the previous row.
510  * \param src1 Pointer at the first pixel of the current row.
511  * \param src2 Pointer at the first pixel of the next row.
512  * \param count Length in pixels of the src0, src1 and src2 rows.
513  * It must be at least 2.
514  * \param dst0 First destination row, double length in pixels.
515  * \param dst1 Second destination row, double length in pixels.
516  */
scale2x_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)517 void scale2x_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
518 {
519 #ifdef USE_SCALE_RANDOMWRITE
520 	scale2x_8_def_whole(dst0, dst1, src0, src1, src2, count);
521 #else
522 	scale2x_8_def_border(dst0, src0, src1, src2, count);
523 	scale2x_8_def_border(dst1, src2, src1, src0, count);
524 #endif
525 }
526 
527 /**
528  * Scale by a factor of 2 a row of pixels of 16 bits.
529  * This function operates like scale2x_8_def() but for 16 bits pixels.
530  * \param src0 Pointer at the first pixel of the previous row.
531  * \param src1 Pointer at the first pixel of the current row.
532  * \param src2 Pointer at the first pixel of the next row.
533  * \param count Length in pixels of the src0, src1 and src2 rows.
534  * It must be at least 2.
535  * \param dst0 First destination row, double length in pixels.
536  * \param dst1 Second destination row, double length in pixels.
537  */
scale2x_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)538 void scale2x_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
539 {
540 #ifdef USE_SCALE_RANDOMWRITE
541 	scale2x_16_def_whole(dst0, dst1, src0, src1, src2, count);
542 #else
543 	scale2x_16_def_border(dst0, src0, src1, src2, count);
544 	scale2x_16_def_border(dst1, src2, src1, src0, count);
545 #endif
546 }
547 
548 /**
549  * Scale by a factor of 2 a row of pixels of 32 bits.
550  * This function operates like scale2x_8_def() but for 32 bits pixels.
551  * \param src0 Pointer at the first pixel of the previous row.
552  * \param src1 Pointer at the first pixel of the current row.
553  * \param src2 Pointer at the first pixel of the next row.
554  * \param count Length in pixels of the src0, src1 and src2 rows.
555  * It must be at least 2.
556  * \param dst0 First destination row, double length in pixels.
557  * \param dst1 Second destination row, double length in pixels.
558  */
scale2x_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)559 void scale2x_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
560 {
561 #ifdef USE_SCALE_RANDOMWRITE
562 	scale2x_32_def_whole(dst0, dst1, src0, src1, src2, count);
563 #else
564 	scale2x_32_def_border(dst0, src0, src1, src2, count);
565 	scale2x_32_def_border(dst1, src2, src1, src0, count);
566 #endif
567 }
568 
569 /**
570  * Scale by a factor of 2x3 a row of pixels of 8 bits.
571  * \note Like scale2x_8_def();
572  */
scale2x3_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)573 void scale2x3_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
574 {
575 #ifdef USE_SCALE_RANDOMWRITE
576 	scale2x_8_def_whole(dst0, dst2, src0, src1, src2, count);
577 	scale2x_8_def_center(dst1, src0, src1, src2, count);
578 #else
579 	scale2x_8_def_border(dst0, src0, src1, src2, count);
580 	scale2x_8_def_center(dst1, src0, src1, src2, count);
581 	scale2x_8_def_border(dst2, src2, src1, src0, count);
582 #endif
583 }
584 
585 /**
586  * Scale by a factor of 2x3 a row of pixels of 16 bits.
587  * \note Like scale2x_16_def();
588  */
scale2x3_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)589 void scale2x3_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
590 {
591 #ifdef USE_SCALE_RANDOMWRITE
592 	scale2x_16_def_whole(dst0, dst2, src0, src1, src2, count);
593 	scale2x_16_def_center(dst1, src0, src1, src2, count);
594 #else
595 	scale2x_16_def_border(dst0, src0, src1, src2, count);
596 	scale2x_16_def_center(dst1, src0, src1, src2, count);
597 	scale2x_16_def_border(dst2, src2, src1, src0, count);
598 #endif
599 }
600 
601 /**
602  * Scale by a factor of 2x3 a row of pixels of 32 bits.
603  * \note Like scale2x_32_def();
604  */
scale2x3_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)605 void scale2x3_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
606 {
607 #ifdef USE_SCALE_RANDOMWRITE
608 	scale2x_32_def_whole(dst0, dst2, src0, src1, src2, count);
609 	scale2x_32_def_center(dst1, src0, src1, src2, count);
610 #else
611 	scale2x_32_def_border(dst0, src0, src1, src2, count);
612 	scale2x_32_def_center(dst1, src0, src1, src2, count);
613 	scale2x_32_def_border(dst2, src2, src1, src0, count);
614 #endif
615 }
616 
617 /**
618  * Scale by a factor of 2x4 a row of pixels of 8 bits.
619  * \note Like scale2x_8_def();
620  */
scale2x4_8_def(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,scale2x_uint8 * dst3,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)621 void scale2x4_8_def(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
622 {
623 #ifdef USE_SCALE_RANDOMWRITE
624 	scale2x_8_def_whole(dst0, dst3, src0, src1, src2, count);
625 	scale2x_8_def_center(dst1, src0, src1, src2, count);
626 	scale2x_8_def_center(dst2, src0, src1, src2, count);
627 #else
628 	scale2x_8_def_border(dst0, src0, src1, src2, count);
629 	scale2x_8_def_center(dst1, src0, src1, src2, count);
630 	scale2x_8_def_center(dst2, src0, src1, src2, count);
631 	scale2x_8_def_border(dst3, src2, src1, src0, count);
632 #endif
633 }
634 
635 /**
636  * Scale by a factor of 2x4 a row of pixels of 16 bits.
637  * \note Like scale2x_16_def();
638  */
scale2x4_16_def(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,scale2x_uint16 * dst3,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)639 void scale2x4_16_def(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
640 {
641 #ifdef USE_SCALE_RANDOMWRITE
642 	scale2x_16_def_whole(dst0, dst3, src0, src1, src2, count);
643 	scale2x_16_def_center(dst1, src0, src1, src2, count);
644 	scale2x_16_def_center(dst2, src0, src1, src2, count);
645 #else
646 	scale2x_16_def_border(dst0, src0, src1, src2, count);
647 	scale2x_16_def_center(dst1, src0, src1, src2, count);
648 	scale2x_16_def_center(dst2, src0, src1, src2, count);
649 	scale2x_16_def_border(dst3, src2, src1, src0, count);
650 #endif
651 }
652 
653 /**
654  * Scale by a factor of 2x4 a row of pixels of 32 bits.
655  * \note Like scale2x_32_def();
656  */
scale2x4_32_def(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,scale2x_uint32 * dst3,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)657 void scale2x4_32_def(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
658 {
659 #ifdef USE_SCALE_RANDOMWRITE
660 	scale2x_32_def_whole(dst0, dst3, src0, src1, src2, count);
661 	scale2x_32_def_center(dst1, src0, src1, src2, count);
662 	scale2x_32_def_center(dst2, src0, src1, src2, count);
663 #else
664 	scale2x_32_def_border(dst0, src0, src1, src2, count);
665 	scale2x_32_def_center(dst1, src0, src1, src2, count);
666 	scale2x_32_def_center(dst2, src0, src1, src2, count);
667 	scale2x_32_def_border(dst3, src2, src1, src0, count);
668 #endif
669 }
670 
671 /***************************************************************************/
672 /* Scale2x MMX implementation */
673 
674 #if defined(__GNUC__) && (defined(HAVE_MMX) || defined(__amd64__))
675 
676 /*
677  * Apply the Scale2x effect at a single row.
678  * This function must be called only by the other scale2x functions.
679  *
680  * Considering the pixel map :
681  *
682  *      ABC (src0)
683  *      DEF (src1)
684  *      GHI (src2)
685  *
686  * this functions compute 2 new pixels in substitution of the source pixel E
687  * like this map :
688  *
689  *      ab (dst)
690  *
691  * with these variables :
692  *
693  *      &current -> E
694  *      &current_left -> D
695  *      &current_right -> F
696  *      &current_upper -> B
697  *      &current_lower -> H
698  *
699  *      %0 -> current_upper
700  *      %1 -> current
701  *      %2 -> current_lower
702  *      %3 -> dst
703  *      %4 -> counter
704  *
705  *      %mm0 -> *current_left
706  *      %mm1 -> *current_next
707  *      %mm2 -> tmp0
708  *      %mm3 -> tmp1
709  *      %mm4 -> tmp2
710  *      %mm5 -> tmp3
711  *      %mm6 -> *current_upper
712  *      %mm7 -> *current
713  */
scale2x_8_mmx_border(scale2x_uint8 * dst,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)714 static inline void scale2x_8_mmx_border(scale2x_uint8* dst, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
715 {
716 	assert(count >= 16);
717 	assert(count % 8 == 0);
718 
719 	/* always do the first and last run */
720 	count -= 2*8;
721 
722 	__asm__ __volatile__(
723 /* first run */
724 		/* set the current, current_pre, current_next registers */
725 		"movq 0(%1), %%mm0\n"
726 		"movq 0(%1), %%mm7\n"
727 		"movq 8(%1), %%mm1\n"
728 		"psllq $56, %%mm0\n"
729 		"psllq $56, %%mm1\n"
730 		"psrlq $56, %%mm0\n"
731 		"movq %%mm7, %%mm2\n"
732 		"movq %%mm7, %%mm3\n"
733 		"psllq $8, %%mm2\n"
734 		"psrlq $8, %%mm3\n"
735 		"por %%mm2, %%mm0\n"
736 		"por %%mm3, %%mm1\n"
737 
738 		/* current_upper */
739 		"movq (%0), %%mm6\n"
740 
741 		/* compute the upper-left pixel for dst on %%mm2 */
742 		/* compute the upper-right pixel for dst on %%mm4 */
743 		"movq %%mm0, %%mm2\n"
744 		"movq %%mm1, %%mm4\n"
745 		"movq %%mm0, %%mm3\n"
746 		"movq %%mm1, %%mm5\n"
747 		"pcmpeqb %%mm6, %%mm2\n"
748 		"pcmpeqb %%mm6, %%mm4\n"
749 		"pcmpeqb (%2), %%mm3\n"
750 		"pcmpeqb (%2), %%mm5\n"
751 		"pandn %%mm2, %%mm3\n"
752 		"pandn %%mm4, %%mm5\n"
753 		"movq %%mm0, %%mm2\n"
754 		"movq %%mm1, %%mm4\n"
755 		"pcmpeqb %%mm1, %%mm2\n"
756 		"pcmpeqb %%mm0, %%mm4\n"
757 		"pandn %%mm3, %%mm2\n"
758 		"pandn %%mm5, %%mm4\n"
759 		"movq %%mm2, %%mm3\n"
760 		"movq %%mm4, %%mm5\n"
761 		"pand %%mm6, %%mm2\n"
762 		"pand %%mm6, %%mm4\n"
763 		"pandn %%mm7, %%mm3\n"
764 		"pandn %%mm7, %%mm5\n"
765 		"por %%mm3, %%mm2\n"
766 		"por %%mm5, %%mm4\n"
767 
768 		/* set *dst */
769 		"movq %%mm2, %%mm3\n"
770 		"punpcklbw %%mm4, %%mm2\n"
771 		"punpckhbw %%mm4, %%mm3\n"
772 		"movq %%mm2, (%3)\n"
773 		"movq %%mm3, 8(%3)\n"
774 
775 		/* next */
776 		"add $8, %0\n"
777 		"add $8, %1\n"
778 		"add $8, %2\n"
779 		"add $16, %3\n"
780 
781 /* central runs */
782 		"shr $3, %4\n"
783 		"jz 1f\n"
784 
785 		"0:\n"
786 
787 		/* set the current, current_pre, current_next registers */
788 		"movq -8(%1), %%mm0\n"
789 		"movq (%1), %%mm7\n"
790 		"movq 8(%1), %%mm1\n"
791 		"psrlq $56, %%mm0\n"
792 		"psllq $56, %%mm1\n"
793 		"movq %%mm7, %%mm2\n"
794 		"movq %%mm7, %%mm3\n"
795 		"psllq $8, %%mm2\n"
796 		"psrlq $8, %%mm3\n"
797 		"por %%mm2, %%mm0\n"
798 		"por %%mm3, %%mm1\n"
799 
800 		/* current_upper */
801 		"movq (%0), %%mm6\n"
802 
803 		/* compute the upper-left pixel for dst on %%mm2 */
804 		/* compute the upper-right pixel for dst on %%mm4 */
805 		"movq %%mm0, %%mm2\n"
806 		"movq %%mm1, %%mm4\n"
807 		"movq %%mm0, %%mm3\n"
808 		"movq %%mm1, %%mm5\n"
809 		"pcmpeqb %%mm6, %%mm2\n"
810 		"pcmpeqb %%mm6, %%mm4\n"
811 		"pcmpeqb (%2), %%mm3\n"
812 		"pcmpeqb (%2), %%mm5\n"
813 		"pandn %%mm2, %%mm3\n"
814 		"pandn %%mm4, %%mm5\n"
815 		"movq %%mm0, %%mm2\n"
816 		"movq %%mm1, %%mm4\n"
817 		"pcmpeqb %%mm1, %%mm2\n"
818 		"pcmpeqb %%mm0, %%mm4\n"
819 		"pandn %%mm3, %%mm2\n"
820 		"pandn %%mm5, %%mm4\n"
821 		"movq %%mm2, %%mm3\n"
822 		"movq %%mm4, %%mm5\n"
823 		"pand %%mm6, %%mm2\n"
824 		"pand %%mm6, %%mm4\n"
825 		"pandn %%mm7, %%mm3\n"
826 		"pandn %%mm7, %%mm5\n"
827 		"por %%mm3, %%mm2\n"
828 		"por %%mm5, %%mm4\n"
829 
830 		/* set *dst */
831 		"movq %%mm2, %%mm3\n"
832 		"punpcklbw %%mm4, %%mm2\n"
833 		"punpckhbw %%mm4, %%mm3\n"
834 		"movq %%mm2, (%3)\n"
835 		"movq %%mm3, 8(%3)\n"
836 
837 		/* next */
838 		"add $8, %0\n"
839 		"add $8, %1\n"
840 		"add $8, %2\n"
841 		"add $16, %3\n"
842 
843 		"dec %4\n"
844 		"jnz 0b\n"
845 		"1:\n"
846 
847 /* final run */
848 		/* set the current, current_pre, current_next registers */
849 		"movq (%1), %%mm1\n"
850 		"movq (%1), %%mm7\n"
851 		"movq -8(%1), %%mm0\n"
852 		"psrlq $56, %%mm1\n"
853 		"psrlq $56, %%mm0\n"
854 		"psllq $56, %%mm1\n"
855 		"movq %%mm7, %%mm2\n"
856 		"movq %%mm7, %%mm3\n"
857 		"psllq $8, %%mm2\n"
858 		"psrlq $8, %%mm3\n"
859 		"por %%mm2, %%mm0\n"
860 		"por %%mm3, %%mm1\n"
861 
862 		/* current_upper */
863 		"movq (%0), %%mm6\n"
864 
865 		/* compute the upper-left pixel for dst on %%mm2 */
866 		/* compute the upper-right pixel for dst on %%mm4 */
867 		"movq %%mm0, %%mm2\n"
868 		"movq %%mm1, %%mm4\n"
869 		"movq %%mm0, %%mm3\n"
870 		"movq %%mm1, %%mm5\n"
871 		"pcmpeqb %%mm6, %%mm2\n"
872 		"pcmpeqb %%mm6, %%mm4\n"
873 		"pcmpeqb (%2), %%mm3\n"
874 		"pcmpeqb (%2), %%mm5\n"
875 		"pandn %%mm2, %%mm3\n"
876 		"pandn %%mm4, %%mm5\n"
877 		"movq %%mm0, %%mm2\n"
878 		"movq %%mm1, %%mm4\n"
879 		"pcmpeqb %%mm1, %%mm2\n"
880 		"pcmpeqb %%mm0, %%mm4\n"
881 		"pandn %%mm3, %%mm2\n"
882 		"pandn %%mm5, %%mm4\n"
883 		"movq %%mm2, %%mm3\n"
884 		"movq %%mm4, %%mm5\n"
885 		"pand %%mm6, %%mm2\n"
886 		"pand %%mm6, %%mm4\n"
887 		"pandn %%mm7, %%mm3\n"
888 		"pandn %%mm7, %%mm5\n"
889 		"por %%mm3, %%mm2\n"
890 		"por %%mm5, %%mm4\n"
891 
892 		/* set *dst */
893 		"movq %%mm2, %%mm3\n"
894 		"punpcklbw %%mm4, %%mm2\n"
895 		"punpckhbw %%mm4, %%mm3\n"
896 		"movq %%mm2, (%3)\n"
897 		"movq %%mm3, 8(%3)\n"
898 
899 		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
900 		:
901 		: "cc"
902 	);
903 }
904 
/**
 * Compute one destination row of the Scale2x effect for 16 bit pixels
 * using MMX instructions.
 *
 * Every iteration ("run") loads one quadword (4 pixels) from each source
 * row and stores two quadwords (8 pixels) to dst. For each source pixel
 * the two output pixels are selected without branches:
 *   left  = (pre  == src0 && pre  != src2 && pre  != next) ? src0 : src1
 *   right = (next == src0 && next != src2 && next != pre ) ? src0 : src1
 * where pre/next are the pixels of src1 before/after the current one.
 * In the first run the first pixel of src1 is used as its own "pre", and
 * in the final run the last pixel of src1 is used as its own "next".
 *
 * The asm uses %mm0-%mm7 and does not execute EMMS.
 *
 * \param dst Destination row, written sequentially, 2*count pixels.
 * \param src0 Source row compared against (the one on the side of dst).
 * \param src1 Current source row.
 * \param src2 Source row on the opposite side.
 * \param count Length in pixels of the source rows. It must be at least 8
 * and a multiple of 4 (checked with assert()).
 */
static inline void scale2x_16_mmx_border(scale2x_uint16* dst, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
{
	assert(count >= 8);
	assert(count % 4 == 0);

	/* always do the first and last run */
	count -= 2*4;

	/* operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count */
	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		/* mm7 = current, mm0 = current_pre with pixel 0 duplicated, */
		/* mm1 = current_next completed with the first pixel of the next quadword */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		/* mask = (x == src0) & ~(x == src2) & ~(pre == next), */
		/* result = (mask & src0) | (~mask & current) */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results, two output pixels per source pixel */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

/* central runs */
		/* convert the remaining pixel count in a number of 4 pixel runs, */
		/* and skip the loop if there are none */
		"shr $2, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		/* the neighbours come from the previous and the next quadword */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		/* mm1 = current_next with the last pixel duplicated, */
		/* mm0 = current_pre completed with the last pixel of the previous quadword */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $48, %%mm1\n"
		"psrlq $48, %%mm0\n"
		"psllq $48, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $16, %%mm2\n"
		"psrlq $16, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqw %%mm6, %%mm2\n"
		"pcmpeqw %%mm6, %%mm4\n"
		"pcmpeqw (%2), %%mm3\n"
		"pcmpeqw (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqw %%mm1, %%mm2\n"
		"pcmpeqw %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpcklwd %%mm4, %%mm2\n"
		"punpckhwd %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/* "cc": add/shr/dec change the flags. */
		/* "memory": the rows are read and written through the pointer */
		/* registers only, so the compiler must be told that memory is */
		/* accessed and must not cache or reorder accesses around the asm. */
		: "cc", "memory"
	);
}
1095 
/**
 * Compute one destination row of the Scale2x effect for 32 bit pixels
 * using MMX instructions.
 *
 * Every iteration ("run") loads one quadword (2 pixels) from each source
 * row and stores two quadwords (4 pixels) to dst. For each source pixel
 * the two output pixels are selected without branches:
 *   left  = (pre  == src0 && pre  != src2 && pre  != next) ? src0 : src1
 *   right = (next == src0 && next != src2 && next != pre ) ? src0 : src1
 * where pre/next are the pixels of src1 before/after the current one.
 * In the first run the first pixel of src1 is used as its own "pre", and
 * in the final run the last pixel of src1 is used as its own "next".
 *
 * The asm uses %mm0-%mm7 and does not execute EMMS.
 *
 * \param dst Destination row, written sequentially, 2*count pixels.
 * \param src0 Source row compared against (the one on the side of dst).
 * \param src1 Current source row.
 * \param src2 Source row on the opposite side.
 * \param count Length in pixels of the source rows. It must be at least 4
 * and a multiple of 2 (checked with assert()).
 */
static inline void scale2x_32_mmx_border(scale2x_uint32* dst, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
{
	assert(count >= 4);
	assert(count % 2 == 0);

	/* always do the first and last run */
	count -= 2*2;

	/* operands: %0 = src0, %1 = src1, %2 = src2, %3 = dst, %4 = count */
	__asm__ __volatile__(
/* first run */
		/* set the current, current_pre, current_next registers */
		/* mm7 = current, mm0 = current_pre with pixel 0 duplicated, */
		/* mm1 = current_next completed with the first pixel of the next quadword */
		"movq 0(%1), %%mm0\n"
		"movq 0(%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psllq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		/* mask = (x == src0) & ~(x == src2) & ~(pre == next), */
		/* result = (mask & src0) | (~mask & current) */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		/* interleave the left and right results, two output pixels per source pixel */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

/* central runs */
		/* convert the remaining pixel count in a number of 2 pixel runs, */
		/* and skip the loop if there are none */
		"shr $1, %4\n"
		"jz 1f\n"

		"0:\n"

		/* set the current, current_pre, current_next registers */
		/* the neighbours come from the previous and the next quadword */
		"movq -8(%1), %%mm0\n"
		"movq (%1), %%mm7\n"
		"movq 8(%1), %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		/* next */
		"add $8, %0\n"
		"add $8, %1\n"
		"add $8, %2\n"
		"add $16, %3\n"

		"dec %4\n"
		"jnz 0b\n"
		"1:\n"

/* final run */
		/* set the current, current_pre, current_next registers */
		/* mm1 = current_next with the last pixel duplicated, */
		/* mm0 = current_pre completed with the last pixel of the previous quadword */
		"movq (%1), %%mm1\n"
		"movq (%1), %%mm7\n"
		"movq -8(%1), %%mm0\n"
		"psrlq $32, %%mm1\n"
		"psrlq $32, %%mm0\n"
		"psllq $32, %%mm1\n"
		"movq %%mm7, %%mm2\n"
		"movq %%mm7, %%mm3\n"
		"psllq $32, %%mm2\n"
		"psrlq $32, %%mm3\n"
		"por %%mm2, %%mm0\n"
		"por %%mm3, %%mm1\n"

		/* current_upper */
		"movq (%0), %%mm6\n"

		/* compute the upper-left pixel for dst on %%mm2 */
		/* compute the upper-right pixel for dst on %%mm4 */
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"movq %%mm0, %%mm3\n"
		"movq %%mm1, %%mm5\n"
		"pcmpeqd %%mm6, %%mm2\n"
		"pcmpeqd %%mm6, %%mm4\n"
		"pcmpeqd (%2), %%mm3\n"
		"pcmpeqd (%2), %%mm5\n"
		"pandn %%mm2, %%mm3\n"
		"pandn %%mm4, %%mm5\n"
		"movq %%mm0, %%mm2\n"
		"movq %%mm1, %%mm4\n"
		"pcmpeqd %%mm1, %%mm2\n"
		"pcmpeqd %%mm0, %%mm4\n"
		"pandn %%mm3, %%mm2\n"
		"pandn %%mm5, %%mm4\n"
		"movq %%mm2, %%mm3\n"
		"movq %%mm4, %%mm5\n"
		"pand %%mm6, %%mm2\n"
		"pand %%mm6, %%mm4\n"
		"pandn %%mm7, %%mm3\n"
		"pandn %%mm7, %%mm5\n"
		"por %%mm3, %%mm2\n"
		"por %%mm5, %%mm4\n"

		/* set *dst */
		"movq %%mm2, %%mm3\n"
		"punpckldq %%mm4, %%mm2\n"
		"punpckhdq %%mm4, %%mm3\n"
		"movq %%mm2, (%3)\n"
		"movq %%mm3, 8(%3)\n"

		: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
		:
		/* "cc": add/shr/dec change the flags. */
		/* "memory": the rows are read and written through the pointer */
		/* registers only, so the compiler must be told that memory is */
		/* accessed and must not cache or reorder accesses around the asm. */
		: "cc", "memory"
	);
}
1286 
1287 /**
1288  * Scale by a factor of 2 a row of pixels of 8 bits.
1289  * This is a very fast MMX implementation.
1290  * The implementation uses a combination of cmp/and/not operations to
1291  * completly remove the need of conditional jumps. This trick give the
1292  * major speed improvement.
1293  * Also, using the 8 bytes MMX registers more than one pixel are computed
1294  * at the same time.
1295  * Before calling this function you must ensure that the currenct CPU supports
1296  * the MMX instruction set. After calling it you must be sure to call the EMMS
1297  * instruction before any floating-point operation.
1298  * The pixels over the left and right borders are assumed of the same color of
1299  * the pixels on the border.
1300  * Note that the implementation is optimized to write data sequentially to
1301  * maximize the bandwidth on video memory.
1302  * \param src0 Pointer at the first pixel of the previous row.
1303  * \param src1 Pointer at the first pixel of the current row.
1304  * \param src2 Pointer at the first pixel of the next row.
1305  * \param count Length in pixels of the src0, src1 and src2 rows. It must
1306  * be at least 16 and a multiple of 8.
1307  * \param dst0 First destination row, double length in pixels.
1308  * \param dst1 Second destination row, double length in pixels.
1309  */
scale2x_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1310 void scale2x_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1311 {
1312 	if (count % 8 != 0 || count < 16) {
1313 		scale2x_8_def(dst0, dst1, src0, src1, src2, count);
1314 	} else {
1315 		scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1316 		scale2x_8_mmx_border(dst1, src2, src1, src0, count);
1317 	}
1318 }
1319 
1320 /**
1321  * Scale by a factor of 2 a row of pixels of 16 bits.
1322  * This function operates like scale2x_8_mmx() but for 16 bits pixels.
1323  * \param src0 Pointer at the first pixel of the previous row.
1324  * \param src1 Pointer at the first pixel of the current row.
1325  * \param src2 Pointer at the first pixel of the next row.
1326  * \param count Length in pixels of the src0, src1 and src2 rows. It must
1327  * be at least 8 and a multiple of 4.
1328  * \param dst0 First destination row, double length in pixels.
1329  * \param dst1 Second destination row, double length in pixels.
1330  */
scale2x_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1331 void scale2x_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1332 {
1333 	if (count % 4 != 0 || count < 8) {
1334 		scale2x_16_def(dst0, dst1, src0, src1, src2, count);
1335 	} else {
1336 		scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1337 		scale2x_16_mmx_border(dst1, src2, src1, src0, count);
1338 	}
1339 }
1340 
1341 /**
1342  * Scale by a factor of 2 a row of pixels of 32 bits.
1343  * This function operates like scale2x_8_mmx() but for 32 bits pixels.
1344  * \param src0 Pointer at the first pixel of the previous row.
1345  * \param src1 Pointer at the first pixel of the current row.
1346  * \param src2 Pointer at the first pixel of the next row.
1347  * \param count Length in pixels of the src0, src1 and src2 rows. It must
1348  * be at least 4 and a multiple of 2.
1349  * \param dst0 First destination row, double length in pixels.
1350  * \param dst1 Second destination row, double length in pixels.
1351  */
scale2x_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1352 void scale2x_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1353 {
1354 	if (count % 2 != 0 || count < 4) {
1355 		scale2x_32_def(dst0, dst1, src0, src1, src2, count);
1356 	} else {
1357 		scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1358 		scale2x_32_mmx_border(dst1, src2, src1, src0, count);
1359 	}
1360 }
1361 
1362 /**
1363  * Scale by a factor of 2x3 a row of pixels of 8 bits.
1364  * This function operates like scale2x_8_mmx() but with an expansion
1365  * factor of 2x3 instead of 2x2.
1366  */
scale2x3_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1367 void scale2x3_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1368 {
1369 	if (count % 8 != 0 || count < 16) {
1370 		scale2x3_8_def(dst0, dst1, dst2, src0, src1, src2, count);
1371 	} else {
1372 		scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1373 		scale2x_8_def_center(dst1, src0, src1, src2, count);
1374 		scale2x_8_mmx_border(dst2, src2, src1, src0, count);
1375 	}
1376 }
1377 
1378 /**
1379  * Scale by a factor of 2x3 a row of pixels of 16 bits.
1380  * This function operates like scale2x_16_mmx() but with an expansion
1381  * factor of 2x3 instead of 2x2.
1382  */
scale2x3_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1383 void scale2x3_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1384 {
1385 	if (count % 4 != 0 || count < 8) {
1386 		scale2x3_16_def(dst0, dst1, dst2, src0, src1, src2, count);
1387 	} else {
1388 		scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1389 		scale2x_16_def_center(dst1, src0, src1, src2, count);
1390 		scale2x_16_mmx_border(dst2, src2, src1, src0, count);
1391 	}
1392 }
1393 
1394 /**
1395  * Scale by a factor of 2x3 a row of pixels of 32 bits.
1396  * This function operates like scale2x_32_mmx() but with an expansion
1397  * factor of 2x3 instead of 2x2.
1398  */
scale2x3_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1399 void scale2x3_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1400 {
1401 	if (count % 2 != 0 || count < 4) {
1402 		scale2x3_32_def(dst0, dst1, dst2, src0, src1, src2, count);
1403 	} else {
1404 		scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1405 		scale2x_32_def_center(dst1, src0, src1, src2, count);
1406 		scale2x_32_mmx_border(dst2, src2, src1, src0, count);
1407 	}
1408 }
1409 
1410 /**
1411  * Scale by a factor of 2x4 a row of pixels of 8 bits.
1412  * This function operates like scale2x_8_mmx() but with an expansion
1413  * factor of 2x4 instead of 2x2.
1414  */
scale2x4_8_mmx(scale2x_uint8 * dst0,scale2x_uint8 * dst1,scale2x_uint8 * dst2,scale2x_uint8 * dst3,const scale2x_uint8 * src0,const scale2x_uint8 * src1,const scale2x_uint8 * src2,unsigned count)1415 void scale2x4_8_mmx(scale2x_uint8* dst0, scale2x_uint8* dst1, scale2x_uint8* dst2, scale2x_uint8* dst3, const scale2x_uint8* src0, const scale2x_uint8* src1, const scale2x_uint8* src2, unsigned count)
1416 {
1417 	if (count % 8 != 0 || count < 16) {
1418 		scale2x4_8_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1419 	} else {
1420 		scale2x_8_mmx_border(dst0, src0, src1, src2, count);
1421 		scale2x_8_def_center(dst1, src0, src1, src2, count);
1422 		scale2x_8_def_center(dst2, src0, src1, src2, count);
1423 		scale2x_8_mmx_border(dst3, src2, src1, src0, count);
1424 	}
1425 }
1426 
1427 /**
1428  * Scale by a factor of 2x4 a row of pixels of 16 bits.
1429  * This function operates like scale2x_16_mmx() but with an expansion
1430  * factor of 2x4 instead of 2x2.
1431  */
scale2x4_16_mmx(scale2x_uint16 * dst0,scale2x_uint16 * dst1,scale2x_uint16 * dst2,scale2x_uint16 * dst3,const scale2x_uint16 * src0,const scale2x_uint16 * src1,const scale2x_uint16 * src2,unsigned count)1432 void scale2x4_16_mmx(scale2x_uint16* dst0, scale2x_uint16* dst1, scale2x_uint16* dst2, scale2x_uint16* dst3, const scale2x_uint16* src0, const scale2x_uint16* src1, const scale2x_uint16* src2, unsigned count)
1433 {
1434 	if (count % 4 != 0 || count < 8) {
1435 		scale2x4_16_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1436 	} else {
1437 		scale2x_16_mmx_border(dst0, src0, src1, src2, count);
1438 		scale2x_16_def_center(dst1, src0, src1, src2, count);
1439 		scale2x_16_def_center(dst2, src0, src1, src2, count);
1440 		scale2x_16_mmx_border(dst3, src2, src1, src0, count);
1441 	}
1442 }
1443 
1444 /**
1445  * Scale by a factor of 2x4 a row of pixels of 32 bits.
1446  * This function operates like scale2x_32_mmx() but with an expansion
1447  * factor of 2x4 instead of 2x2.
1448  */
scale2x4_32_mmx(scale2x_uint32 * dst0,scale2x_uint32 * dst1,scale2x_uint32 * dst2,scale2x_uint32 * dst3,const scale2x_uint32 * src0,const scale2x_uint32 * src1,const scale2x_uint32 * src2,unsigned count)1449 void scale2x4_32_mmx(scale2x_uint32* dst0, scale2x_uint32* dst1, scale2x_uint32* dst2, scale2x_uint32* dst3, const scale2x_uint32* src0, const scale2x_uint32* src1, const scale2x_uint32* src2, unsigned count)
1450 {
1451 	if (count % 2 != 0 || count < 4) {
1452 		scale2x4_32_def(dst0, dst1, dst2, dst3, src0, src1, src2, count);
1453 	} else {
1454 		scale2x_32_mmx_border(dst0, src0, src1, src2, count);
1455 		scale2x_32_def_center(dst1, src0, src1, src2, count);
1456 		scale2x_32_def_center(dst2, src0, src1, src2, count);
1457 		scale2x_32_mmx_border(dst3, src2, src1, src0, count);
1458 	}
1459 }
1460 
1461 #endif
1462 
1463