#define COMPONENT_SIZE 8
#define MASK 0xff
#define ONE_HALF 0x80

#define A_SHIFT 8 * 3
#define R_SHIFT 8 * 2
#define G_SHIFT 8
#define A_MASK 0xff000000
#define R_MASK 0xff0000
#define G_MASK 0xff00

#define RB_MASK 0xff00ff
#define AG_MASK 0xff00ff00
#define RB_ONE_HALF 0x800080
#define RB_MASK_PLUS_ONE 0x1000100

#define ALPHA_8(x) ((x) >> A_SHIFT)
#define RED_8(x) (((x) >> R_SHIFT) & MASK)
#define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
#define BLUE_8(x) ((x) & MASK)
/*
 * ARMv6 has the UQADD8 instruction, which implements unsigned saturated
 * addition for 8-bit values packed in 32-bit registers. It is very useful
 * for the UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which
 * would otherwise need a lot of arithmetic operations to simulate it).
 * Since most of the major ARM Linux distros are built for ARMv7, we are
 * much less dependent on runtime CPU detection and can get practical
 * benefits from conditional compilation here for a lot of users.
 */

#if defined(USE_GCC_INLINE_ASM) && defined(__arm__) && \
    !defined(__aarch64__) && (!defined(__thumb__) || defined(__thumb2__))
#if defined(__ARM_ARCH_6__)   || defined(__ARM_ARCH_6J__)  || \
    defined(__ARM_ARCH_6K__)  || defined(__ARM_ARCH_6Z__)  || \
    defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \
    defined(__ARM_ARCH_6M__)  || defined(__ARM_ARCH_7__)   || \
    defined(__ARM_ARCH_7A__)  || defined(__ARM_ARCH_7R__)  || \
    defined(__ARM_ARCH_7M__)  || defined(__ARM_ARCH_7EM__)

static force_inline uint32_t
un8x4_add_un8x4 (uint32_t x, uint32_t y)
{
    uint32_t t;
    asm ("uqadd8 %0, %1, %2" : "=r" (t) : "%r" (x), "r" (y));
    return t;
}

#define UN8x4_ADD_UN8x4(x, y) \
    ((x) = un8x4_add_un8x4 ((x), (y)))

#define UN8_rb_ADD_UN8_rb(x, y, t) \
    ((t) = un8x4_add_un8x4 ((x), (y)), (x) = (t))

#define ADD_UN8(x, y, t) \
    ((t) = (x), un8x4_add_un8x4 ((t), (y)))

#endif
#endif
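
/*
 * For reference, a portable sketch of what UQADD8 computes (illustrative
 * only, not part of the original header): each of the four bytes packed
 * in a 32-bit word is added independently, clamping to 0xff instead of
 * wrapping around.
 */
#if 0
static uint32_t
un8x4_add_un8x4_reference (uint32_t x, uint32_t y)
{
    uint32_t result = 0;
    int i;

    for (i = 0; i < 32; i += 8)
    {
        uint32_t sum = ((x >> i) & MASK) + ((y >> i) & MASK);

        if (sum > MASK)       /* saturate instead of wrapping */
            sum = MASK;

        result |= sum << i;
    }
    return result;
}
#endif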

/*****************************************************************************/

/*
 * Helper macros.
 */

#define MUL_UN8(a, b, t)						\
    ((t) = (a) * (uint16_t)(b) + ONE_HALF, ((((t) >> G_SHIFT) + (t)) >> G_SHIFT))
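
/*
 * The second half of MUL_UN8 implements an exactly rounded division by 255:
 * for any v in [0, 255 * 255], with t = v + ONE_HALF,
 *
 *     ((t >> G_SHIFT) + t) >> G_SHIFT  ==  round (v / 255)
 *
 * avoiding an actual divide.  DIV_ONE_UN8 further below uses the same
 * identity on its own.  A hypothetical exhaustive check of the claim
 * (not part of the header):
 */
#if 0
static int
check_mul_un8 (void)
{
    uint32_t a, b;

    for (a = 0; a <= 255; a++)
    {
        for (b = 0; b <= 255; b++)
        {
            uint16_t t;
            uint32_t fast = MUL_UN8 (a, b, t);
            /* (2v + 255) / 510 is round (v / 255); ties cannot occur */
            uint32_t exact = (2 * a * b + 255) / 510;

            if (fast != exact)
                return -1;      /* identity violated */
        }
    }
    return 0;                   /* identity holds for all 8-bit inputs */
}
#endif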

#define DIV_UN8(a, b)							\
    (((uint16_t) (a) * MASK + ((b) / 2)) / (b))

#ifndef ADD_UN8
#define ADD_UN8(x, y, t)				     \
    ((t) = (x) + (y),					     \
     (uint32_t) (uint8_t) ((t) | (0 - ((t) >> G_SHIFT))))
#endif
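
/*
 * In the generic ADD_UN8 above, (t) >> G_SHIFT is 1 exactly when the sum
 * overflows 8 bits, so 0 - ((t) >> G_SHIFT) is either 0 or all ones;
 * OR-ing it in and casting to uint8_t clamps the result to 255 without
 * a branch.
 */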

#define DIV_ONE_UN8(x)							\
    (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)

/*
 * The methods below use some tricks to be able to do two color
 * components at the same time.
 */

/*
 * x_rb = (x_rb * a) / 255
 */
#define UN8_rb_MUL_UN8(x, a, t)						\
    do									\
    {									\
	t  = ((x) & RB_MASK) * (a);					\
	t += RB_ONE_HALF;						\
	x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
	x &= RB_MASK;							\
    } while (0)
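
/*
 * The trick above: red sits in bits 16-23 and blue in bits 0-7 of the
 * same word, with 8 spare bits above each component.  Multiplying by an
 * 8-bit value keeps each product inside its own 16-bit lane, so a single
 * 32-bit multiply and one shared rounding step process both components;
 * the final mask discards the high halves of the lanes.
 */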

/*
 * x_rb = min (x_rb + y_rb, 255)
 */
#ifndef UN8_rb_ADD_UN8_rb
#define UN8_rb_ADD_UN8_rb(x, y, t)					\
    do									\
    {									\
	t = ((x) + (y));						\
	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);		\
	x = (t & RB_MASK);						\
    } while (0)
#endif
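
/*
 * In the generic version above, each 16-bit lane of t holds a sum of at
 * most 0x1fe, so bit 8 of the lane is the carry.  Subtracting the carries
 * from RB_MASK_PLUS_ONE (0x0100 in each lane) yields 0x0100 where no
 * overflow happened (cleared by the final mask) and 0x00ff where it did,
 * so the OR saturates exactly the overflowed components.
 */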

/*
 * x_rb = (x_rb * a_rb) / 255
 */
#define UN8_rb_MUL_UN8_rb(x, a, t)					\
    do									\
    {									\
	t  = (x & MASK) * (a & MASK);					\
	t |= (x & R_MASK) * ((a >> R_SHIFT) & MASK);			\
	t += RB_ONE_HALF;						\
	t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
	x = t & RB_MASK;						\
    } while (0)
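
/*
 * Note the |= above: the blue product occupies bits 0-15 and the red
 * product bits 16-31, so the two partial products never overlap and OR
 * is equivalent to addition here.
 */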

/*
 * x_c = (x_c * a) / 255
 */
#define UN8x4_MUL_UN8(x, a)						\
    do									\
    {									\
	uint32_t r1__, r2__, t__;					\
									\
	r1__ = (x);							\
	UN8_rb_MUL_UN8 (r1__, (a), t__);				\
									\
	r2__ = (x) >> G_SHIFT;						\
	UN8_rb_MUL_UN8 (r2__, (a), t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
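
/*
 * Whole pixels are handled as two rb-style pairs: r1__ carries red and
 * blue, r2__ carries alpha and green shifted down by 8 bits, and the two
 * halves are recombined at the end.  All of the UN8x4_* macros below
 * follow this split.
 */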

/*
 * x_c = (x_c * a) / 255 + y_c
 */
#define UN8x4_MUL_UN8_ADD_UN8x4(x, a, y)				\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x);							\
	r2__ = (y) & RB_MASK;						\
	UN8_rb_MUL_UN8 (r1__, (a), t__);				\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = (x) >> G_SHIFT;						\
	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
	UN8_rb_MUL_UN8 (r2__, (a), t__);				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)

/*
 * x_c = (x_c * a + y_c * b) / 255
 */
#define UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8(x, a, y, b)			\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x);							\
	r2__ = (y);							\
	UN8_rb_MUL_UN8 (r1__, (a), t__);				\
	UN8_rb_MUL_UN8 (r2__, (b), t__);				\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = ((x) >> G_SHIFT);					\
	r3__ = ((y) >> G_SHIFT);					\
	UN8_rb_MUL_UN8 (r2__, (a), t__);				\
	UN8_rb_MUL_UN8 (r3__, (b), t__);				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)

/*
 * x_c = (x_c * a_c) / 255
 */
#define UN8x4_MUL_UN8x4(x, a)						\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x);							\
	r2__ = (a);							\
	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = (x) >> G_SHIFT;						\
	r3__ = (a) >> G_SHIFT;						\
	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)

/*
 * x_c = (x_c * a_c) / 255 + y_c
 */
#define UN8x4_MUL_UN8x4_ADD_UN8x4(x, a, y)				\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x);							\
	r2__ = (a);							\
	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
	r2__ = (y) & RB_MASK;						\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = ((x) >> G_SHIFT);					\
	r3__ = ((a) >> G_SHIFT);					\
	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)

/*
 * x_c = (x_c * a_c + y_c * b) / 255
 */
#define UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8(x, a, y, b)			\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x);							\
	r2__ = (a);							\
	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
	r2__ = (y);							\
	UN8_rb_MUL_UN8 (r2__, (b), t__);				\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = (x) >> G_SHIFT;						\
	r3__ = (a) >> G_SHIFT;						\
	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
	r3__ = (y) >> G_SHIFT;						\
	UN8_rb_MUL_UN8 (r3__, (b), t__);				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)

/*
 * x_c = min (x_c + y_c, 255)
 */
#ifndef UN8x4_ADD_UN8x4
#define UN8x4_ADD_UN8x4(x, y)						\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x) & RB_MASK;						\
	r2__ = (y) & RB_MASK;						\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = ((x) >> G_SHIFT) & RB_MASK;				\
	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
#endif
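
/*
 * As an illustration of how these macros compose (hypothetical example,
 * not part of the original header): the Porter-Duff OVER operator for
 * premultiplied ARGB pixels is dest = src + dest * (255 - src_alpha) / 255,
 * which maps directly onto UN8x4_MUL_UN8_ADD_UN8x4.
 */
#if 0
static uint32_t
example_over (uint32_t src, uint32_t dest)
{
    uint32_t inverse_alpha = ALPHA_8 (~src);	/* 255 - source alpha */

    UN8x4_MUL_UN8_ADD_UN8x4 (dest, inverse_alpha, src);
    return dest;
}
#endif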