/*
mediastreamer2 library - modular sound and video processing and streaming
Copyright (C) 2006-2010  Belledonne Communications SARL (simon.morlat@linphone.org)

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
*/


#ifdef __ELF__
#   define ELF
#else
#   define ELF @
#endif

#ifdef __clang__
#   define FUNC @.func
#   define ENDFUNC @.endfunc
#else
#   define FUNC .func
#   define ENDFUNC .endfunc
#endif

/* mark the object as requiring 8-byte stack alignment (EABI attribute 24, ELF only) */
.macro  require8 val=1
ELF     .eabi_attribute 24, \val
.endm

/* mark the object as preserving 8-byte stack alignment (EABI attribute 25, ELF only) */
.macro  preserve8 val=1
ELF     .eabi_attribute 25, \val
.endm

/* declare a global (but hidden) function symbol and open its body */
.macro function name
	.global \name
ELF	.hidden \name
ELF	.type   \name, %function
	FUNC	\name
\name:
.endm

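/*
   For reference, "function foo" expands (in an ELF build with GNU as) to:

       .global foo
       .hidden foo
       .type   foo, %function
       .func   foo
   foo:
*/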

.section .rodata
.align 8
ymult:
.word 9535, 9535, 9535, 9535
rvmult:
.word 13074, 13074, 13074, 13074
gbmult:
.word 6660, 6660, 6660, 6660
gumult:
.word 3203, 3203, 3203, 3203
bumult:
.word 16531, 16531, 16531, 16531

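/*
   These appear to be the ITU-R BT.601 YUV -> RGB coefficients in Q13 fixed
   point (value ~= coefficient * 8192), i.e. per pixel:

       R = (9535*(Y-16) + 13074*(V-128))                >> 13
       G = (9535*(Y-16) -  6660*(V-128) - 3203*(U-128)) >> 13
       B = (9535*(Y-16) + 16531*(U-128))                >> 13

   They are only referenced by the disabled line_yuv2rgb draft at the end of
   this file.
*/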

.fpu neon
.text

/* void ms_line_rgb2rgb565_4(const int16_t *r, const int16_t *g, const int16_t *b, uint16_t *dst, int width) */
function ms_line_rgb2rgb565_4
		push		{r4}
		ldr		r4, [sp, #4]		/* load width into r4 */
1:
		vld1.16		{d0}, [r0,:64]!		/* load 4 red components */
		vld1.16		{d1}, [r1,:64]!		/* load 4 green components */
		vld1.16		{d2}, [r2,:64]!		/* load 4 blue components */
		vshr.u16	d0, d0, #3		/* r >> 3 */
		vshr.u16	d1, d1, #2		/* g >> 2 */
		vshr.u16	d2, d2, #3		/* b >> 3 */
		vsli.16		d2, d1, #5		/* insert green into d2 */
		vsli.16		d2, d0, #11		/* insert red into d2 */
		vst1.16		{d2}, [r3,:64]!		/* store 4 RGB565 pixels */
		subs		r4, r4, #4		/* 4 pixels processed, decrement width */
		bne		1b
		pop		{r4}
		bx		lr
ENDFUNC
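
/*
   Per pixel, the shifts and vsli above implement the usual RGB565 packing
   (assuming 8-bit component values stored in 16-bit lanes):

       dst = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3)
*/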


/* void ms_line_rgb2rgb565_8(const int16_t *r, const int16_t *g, const int16_t *b, uint16_t *dst, int width) */
function ms_line_rgb2rgb565_8
		push		{r4}
		ldr		r4, [sp, #4]		/* load width into r4 */
1:
		vld1.16		{d0,d1}, [r0,:64]!	/* load 8 red components */
		vshr.u16	q0, q0, #3		/* r >> 3 */
		vld1.16		{d2,d3}, [r1,:64]!	/* load 8 green components */
		vshr.u16	q1, q1, #2		/* g >> 2 */
		vld1.16		{d4,d5}, [r2,:64]!	/* load 8 blue components */
		vshr.u16	q2, q2, #3		/* b >> 3 */
		vsli.16		q2, q1, #5		/* insert green into q2 */
		vsli.16		q2, q0, #11		/* insert red into q2 */
		vst1.16		{q2}, [r3,:64]!		/* store 8 RGB565 pixels */
		subs		r4, r4, #8		/* 8 pixels processed, decrement width */
		bne		1b
		pop		{r4}
		bx		lr
ENDFUNC
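
/*
   Usage sketch from C (illustrative caller, not part of this file). width is
   assumed to be a multiple of 4 (resp. 8) and the r/g/b/dst lines 8-byte
   aligned, as required by the [..,:64] alignment hints above:

       #include <stdint.h>

       void ms_line_rgb2rgb565_4(const int16_t *r, const int16_t *g,
                                 const int16_t *b, uint16_t *dst, int width);
       void ms_line_rgb2rgb565_8(const int16_t *r, const int16_t *g,
                                 const int16_t *b, uint16_t *dst, int width);

       static void pack_line(const int16_t *r, const int16_t *g, const int16_t *b,
                             uint16_t *dst, int width) {
           if ((width & 7) == 0)
               ms_line_rgb2rgb565_8(r, g, b, dst, width);  // 8 pixels per iteration
           else
               ms_line_rgb2rgb565_4(r, g, b, dst, width);  // width must be a multiple of 4
       }
*/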

/* Load 4 selected pixels (x(n)) into \d_reg1 and their right neighbours
   (x(n+1)) into \d_reg2, advancing \src by the grid offsets held in r4-r7. */
.macro load_pixels_4_2	d_reg1, d_reg2, src
	add		r12, \src, #2		/* r12 points to the next pixel (x(n+1)) */
	vld1.16		\d_reg1[0], [\src], r4	/* load x(n), then advance \src by grid offset r4 */
	vld1.16		\d_reg1[1], [\src], r5
	vld1.16		\d_reg1[2], [\src], r6
	vld1.16		\d_reg1[3], [\src], r7
	vld1.16		\d_reg2[0], [r12], r4	/* load x(n+1), then advance r12 by grid offset r4 */
	vld1.16		\d_reg2[1], [r12], r5
	vld1.16		\d_reg2[2], [r12], r6
	vld1.16		\d_reg2[3], [r12], r7
.endm


.macro filter_pixels_8 q_srcdst, q_src2
	vsub.s16	q9, \q_src2, \q_srcdst	/* q9 = x(n+1) - x(n) */
	vmul.s16	q10, q9, q8		/* q10 = coef * q9 */
	vsra.s16	\q_srcdst, q10, #7	/* x(n) += q10 >> 7 */
	vabs.s16	\q_srcdst, \q_srcdst	/* absolute value of the result */
.endm
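
/*
   Scalar sketch of filter_pixels_8 for one 16-bit lane (coef is the Q7 filter
   coefficient loaded into q8 by the caller; the INT16_MIN corner case of
   vabs.s16 is ignored here):

       int16_t filter_pixel(int16_t xn, int16_t xn1, int16_t coef) {
           int16_t diff = (int16_t)(xn1 - xn);           // vsub.s16
           int16_t prod = (int16_t)(coef * diff);        // vmul.s16 keeps the low 16 bits
           int16_t v    = (int16_t)(xn + (prod >> 7));   // vsra.s16 #7
           return (int16_t)(v < 0 ? -v : v);             // vabs.s16
       }
*/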

/* void ms_line_scale_8(const uint32_t *grid, const int16_t **src, int16_t **dst, int dst_width, int16_t *filter); */
function ms_line_scale_8
	push		{r4-r12,lr}	/* lr is used as a normal register here */
	ldr		lr, [sp, #40]	/* r4-r12 + lr = 10 registers, 40 = 10*4: offset to retrieve the filter table */

	ldm		r1, {r8,r9}	/* r8 = src[0], r9 = src[1] */
	ldr		r1, [r1,#8]	/* r1 = src[2] */

	ldm		r2, {r10,r11}	/* r10 = dst[0], r11 = dst[1] */
	ldr		r2, [r2,#8]	/* r2 = dst[2] */

1:

	ldm		r0!, {r4,r5,r6,r7}	/* load 4 entries of the grid into r4,r5,r6,r7 */

	load_pixels_4_2	d4, d10, r1
	load_pixels_4_2	d6, d12, r8
	load_pixels_4_2	d8, d14, r9

	ldm		r0!, {r4,r5,r6,r7}	/* load 4 more entries of the grid into r4,r5,r6,r7 */

	load_pixels_4_2	d5, d11, r1
	load_pixels_4_2	d7, d13, r8
	load_pixels_4_2	d9, d15, r9
				/* x(n) = q2,q3,q4   x(n+1) = q5,q6,q7 */
	vld1.16		{q8}, [lr]!	/* load the filtering coefficients into q8 */
				/* we need to compute x(n) + ((coef*(x(n+1)-x(n))) >> 7) */

	filter_pixels_8	q2, q5
	filter_pixels_8	q3, q6
	filter_pixels_8	q4, q7

	vst1.16		{q2}, [r2]!	/* write the 8 interpolated pixels of each plane */
	vst1.16		{q3}, [r10]!
	vst1.16		{q4}, [r11]!
	subs		r3, r3, #8	/* we have processed 8 pixels, decrement width */
	bne		1b
	pop		{r4-r12,pc}
ENDFUNC
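
/*
   Rough scalar equivalent of the loop above, inferred from the assembly
   (dst_width is assumed to be a multiple of 8, grid entries are byte steps
   between selected source pixels, filter holds Q7 blend coefficients):

       void ms_line_scale_8_ref(const uint32_t *grid, const int16_t **src,
                                int16_t **dst, int dst_width, const int16_t *filter) {
           const int16_t *s[3] = { src[0], src[1], src[2] };
           for (int i = 0; i < dst_width; i++) {
               for (int p = 0; p < 3; p++) {
                   dst[p][i] = filter_pixel(s[p][0], s[p][1], filter[i]);
                   s[p] = (const int16_t *)((const char *)s[p] + grid[i]);
               }
           }
       }

   with filter_pixel() as sketched after the filter_pixels_8 macro.
*/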



/* Load 4 selected pixels into \d_reg, advancing \src by the grid offsets held in r4-r7. */
.macro load_pixels_4	d_reg, src
	vld1.16		\d_reg[0], [\src], r4	/* load the pixel at \src, then advance \src by grid offset r4 */
	vld1.16		\d_reg[1], [\src], r5
	vld1.16		\d_reg[2], [\src], r6
	vld1.16		\d_reg[3], [\src], r7
.endm

/* void ms_line_scale_simple_8(const uint32_t *grid, const uint16_t **src, uint16_t **dst, int dst_width); */
function ms_line_scale_simple_8
	push		{r4-r11}
	ldr		r8, [r1,#4]	/* r8 = src[1] */
	ldr		r9, [r1,#8]	/* r9 = src[2] */
	ldr		r1, [r1]	/* r1 = src[0] */
	ldr		r10, [r2,#4]	/* r10 = dst[1] */
	ldr		r11, [r2,#8]	/* r11 = dst[2] */
	ldr		r2, [r2]	/* r2 = dst[0] */
1:
	ldrd		r4, r5, [r0], #8	/* load 2 entries of the grid into r4,r5 */
	ldrd		r6, r7, [r0], #8	/* load 2 entries of the grid into r6,r7 */

	load_pixels_4	d4, r1
	load_pixels_4	d6, r8
	load_pixels_4	d8, r9

	ldrd		r4, r5, [r0], #8	/* load 2 more entries of the grid into r4,r5 */
	ldrd		r6, r7, [r0], #8	/* load 2 more entries of the grid into r6,r7 */

	load_pixels_4	d5, r1
	load_pixels_4	d7, r8
	load_pixels_4	d9, r9

	vst1.16		{q2}, [r2]!	/* write the 8 selected pixels of each plane */
	vst1.16		{q3}, [r10]!
	vst1.16		{q4}, [r11]!
	subs		r3, r3, #8	/* we have processed 8 pixels, decrement width */
	bne		1b
	pop		{r4-r11}
	bx		lr
ENDFUNC
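
/*
   Rough scalar equivalent (inferred from the assembly): plain pixel picking
   without interpolation, dst_width assumed to be a multiple of 8:

       void ms_line_scale_simple_8_ref(const uint32_t *grid, const uint16_t **src,
                                       uint16_t **dst, int dst_width) {
           const uint16_t *s[3] = { src[0], src[1], src[2] };
           for (int i = 0; i < dst_width; i++) {
               for (int p = 0; p < 3; p++) {
                   dst[p][i] = s[p][0];
                   s[p] = (const uint16_t *)((const char *)s[p] + grid[i]);
               }
           }
       }
*/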


.if 0

/* void line_yuv2rgb(uint8_t *y, uint8_t *u, uint8_t *v, int16_t *r, int16_t *g, int16_t *b, int n) */
function line_yuv2rgb
	push		{r4-r7}
	ldr		r6, [sp, #24]	/* load n into r6 */
	ldr		r5, [sp, #20]	/* load b into r5 */
	ldr		r4, [sp, #16]	/* load g into r4 */
	vld1.8		{d12}, [r0]!	/* load 8 y */
	vmovl.u8	q6, d12		/* expand them to 16 bits */
	vmovl.u16	q0, d12		/* expand the first 4 of them to 32 bits into q0 */
	vmovl.u16	q1, d13		/* expand 4 more of them to 32 bits into q1 */
	vld1.32		{d12[0]}, [r1]!	/* load 4 u */
	vmovl.u8	q6, d12		/* expand them to 16 bits */
	vmovl.u16	q2, d12		/* expand the first 4 of them to 32 bits into q2 */
	vld1.32		{d12[0]}, [r2]!	/* load 4 v */
	vmovl.u8	q6, d12		/* expand them to 16 bits */
	vmovl.u16	q3, d12		/* expand the first 4 of them to 32 bits into q3 */
			/* at this stage we have y in q0 and q1, u in q2, and v in q3 */
	mov		r7, #16
	vdup.32		q4, r7
	vsub.s32	q0, q0, q4	/* remove bias from y */
	vsub.s32	q1, q1, q4	/* remove bias from y */
	mov		r7, #128
	vdup.32		q4, r7
	vsub.s32	q2, q2, q4	/* remove bias from u */
	vsub.s32	q3, q3, q4	/* remove bias from v */
	movrel		r7, ymult
	vld1.i32	q4, [r7]
	vmul.s32	q0, q0, q4	/* multiply y by 9535 */
	vmul.s32	q1, q1, q4	/* multiply y by 9535 */
	movrel		r7, rvmult
	vld1.i32	q4, [r7]
	/**/
	pop		{r4-r7}
	bx		lr
ENDFUNC



.endif
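
/*
   For reference, a scalar sketch of what the disabled draft above appears to
   be aiming for (one line of n pixels, using the Q13 constants from .rodata;
   u/v are assumed horizontally subsampled by 2 since the draft loads 8 y for
   4 u and 4 v, and clamping of the results is omitted):

       void line_yuv2rgb_ref(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                             int16_t *r, int16_t *g, int16_t *b, int n) {
           for (int i = 0; i < n; i++) {
               int32_t yy = 9535 * (y[i] - 16);
               int32_t uu = u[i / 2] - 128;
               int32_t vv = v[i / 2] - 128;
               r[i] = (int16_t)((yy + 13074 * vv) >> 13);
               g[i] = (int16_t)((yy -  6660 * vv - 3203 * uu) >> 13);
               b[i] = (int16_t)((yy + 16531 * uu) >> 13);
           }
       }
*/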