/*
mediastreamer2 library - modular sound and video processing and streaming
Copyright (C) 2006-2010 Belledonne Communications SARL (simon.morlat@linphone.org)

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/

/*
 * ARMv7 NEON helpers for RGB565 packing and horizontal line scaling.
 * GNU as syntax; the file goes through the C preprocessor (#ifdef below),
 * i.e. it must be assembled as a .S file.
 */

/* On non-ELF targets, ELF expands to '@' (the ARM gas comment character),
   which comments out the ELF-only directives prefixed with it. */
#ifdef __ELF__
# define ELF
#else
# define ELF @
#endif

/* clang's integrated assembler rejects .func/.endfunc, so they are
   commented out with '@' when building with clang. */
#ifdef __clang__
# define FUNC @.func
# define ENDFUNC @.endfunc
#else
# define FUNC .func
# define ENDFUNC .endfunc
#endif

/* Emit Tag_ABI_align_needed (code requires 8-byte aligned data).
   Defined but never invoked in this file. */
.macro require8 val=1
ELF .eabi_attribute 24, \val
.endm

/* Emit Tag_ABI_align_preserved (code keeps the stack 8-byte aligned).
   Defined but never invoked in this file. */
.macro preserve8 val=1
ELF .eabi_attribute 25, \val
.endm

/* Open an exported function: global (but hidden in ELF shared objects),
   typed as %function, with the label placed at the current location. */
.macro function name
	.global \name
ELF	.hidden \name
ELF	.type \name, %function
	FUNC \name
\name:
	.endm


/* Fixed-point YUV->RGB coefficient tables (4 x 32-bit words each, one per
   NEON q-register lane).  Only referenced by the disabled line_yuv2rgb
   code at the bottom of this file. */
.section .rodata
/* NOTE(review): on ARM targets .align takes a power of two, so ".align 8"
   requests 256-byte alignment; 3 (8 bytes) or 4 (16 bytes) was probably
   intended.  Harmless (padding only) — confirm before changing. */
.align 8
ymult:
.word 9535 , 9535, 9535, 9535
rvmult:
.word 13074, 13074, 13074, 13074
gbmult:
.word 6660, 6660, 6660, 6660
gumult:
.word 3203, 3203, 3203, 3203
bumult:
.word 16531, 16531, 16531, 16531


.fpu neon
.text

/* void ms_line_rgb2rgb565_4(const int16_t *r, const int16_t *g, const int16_t *b,
                             uint16_t *dst, int width)
   Packs one line of planar 16-bit R,G,B samples into RGB565, 4 pixels per
   iteration.  r0=r, r1=g, r2=b, r3=dst, width on the stack.
   All four pointers must be 8-byte aligned (":64" hints below) and width
   must be a positive multiple of 4 (loop exits only when r4 reaches 0). */
function ms_line_rgb2rgb565_4
	push {r4}
	ldr r4 , [sp ,#4]              /* load width into r4 (sp moved by the push) */
1:
	vld1.16 {d0}, [r0,:64]!        /* d0 = 4 red samples */
	vld1.16 {d1}, [r1,:64]!        /* d1 = 4 green samples */
	vld1.16 {d2}, [r2,:64]!        /* d2 = 4 blue samples */
	vshr.u16 d0, d0, #3            /* keep top 5 bits of red */
	vshr.u16 d1, d1, #2            /* keep top 6 bits of green */
	vshr.u16 d2, d2, #3            /* keep top 5 bits of blue */
	vsli.16 d2, d1, #5             /* insert g into d2 at bit 5 */
	vsli.16 d2, d0, #11            /* insert r into d2 at bit 11 -> R5G6B5 */
	vst1.16 {d2}, [r3,:64]!        /* store 4 packed RGB565 pixels */
	subs r4, r4, #4
	bne 1b
	pop {r4}
	bx lr
ENDFUNC


/* void ms_line_rgb2rgb565_8(const int16_t *r, const int16_t *g, const int16_t *b,
                             uint16_t *dst, int width)
   Same packing as ms_line_rgb2rgb565_4 but 8 pixels per iteration, with
   loads and shifts interleaved.  width must be a positive multiple of 8. */
function ms_line_rgb2rgb565_8
	push {r4}
	ldr r4 , [sp ,#4]              /* load width into r4 */
1:
	vld1.16 {d0,d1}, [r0,:64]!     /* q0 = 8 red samples */
	vshr.u16 q0, q0, #3
	vld1.16 {d2,d3}, [r1,:64]!     /* q1 = 8 green samples */
	vshr.u16 q1, q1, #2
	vld1.16 {d4,d5}, [r2,:64]!     /* q2 = 8 blue samples */
	vshr.u16 q2, q2, #3
	vsli.16 q2, q1, #5             /* insert g into q2 at bit 5 */
	vsli.16 q2, q0, #11            /* insert r into q2 at bit 11 -> R5G6B5 */
	vst1.16 {q2}, [r3,:64]!
	subs r4, r4, #8
	bne 1b
	pop {r4}
	bx lr
ENDFUNC

/* Gather 4 pairs of adjacent 16-bit pixels from \src.
   r4..r7 hold byte offsets (grid entries) applied as post-increments, so
   lane i of \d_reg1 gets the pixel selected by the grid and lane i of
   \d_reg2 gets the pixel 2 bytes (one int16) after it — i.e. x(n) and
   x(n+1) for the interpolation in filter_pixels_8. */
.macro load_pixels_4_2 d_reg1, d_reg2, src
	add r12, \src, #2              /* r12 points at the next pixel x(n+1) */
	vld1.16 \d_reg1[0], [\src], r4
	vld1.16 \d_reg1[1], [\src], r5
	vld1.16 \d_reg1[2], [\src], r6
	vld1.16 \d_reg1[3], [\src], r7
	vld1.16 \d_reg2[0], [r12], r4
	vld1.16 \d_reg2[1], [r12], r5
	vld1.16 \d_reg2[2], [r12], r6
	vld1.16 \d_reg2[3], [r12], r7
.endm


/* Linear interpolation between x(n) (\q_srcdst) and x(n+1) (\q_src2):
   \q_srcdst += (coef * (x(n+1) - x(n))) >> 7, with the Q0.7 coefficients
   already loaded in q8.  Clobbers q9, q10.
   NOTE(review): the trailing vabs presumably clamps negative rounding
   artifacts to non-negative sample values — confirm against callers. */
.macro filter_pixels_8 q_srcdst, q_src2
	vsub.s16 q9 , \q_src2, \q_srcdst   /* q9 = x(n+1) - x(n) */
	vmul.s16 q10 , q9, q8              /* q10 = coef * q9 */
	vsra.s16 \q_srcdst , q10, #7       /* accumulate (q10 >> 7) into x(n) */
	vabs.s16 \q_srcdst , \q_srcdst
.endm

/* void ms_line_scale_8(const uint32_t *grid, const int16_t **src, int16_t **dst,
                        int dst_width, int16_t *filter)
   Horizontally scales 3 planes with linear interpolation, 8 output pixels
   per plane per iteration.  r0=grid (byte steps between source pixels),
   r1=src[3], r2=dst[3], r3=dst_width, filter table on the stack.
   Plane pointers: r8=src[0], r9=src[1], r1=src[2];
                   r10=dst[0], r11=dst[1], r2=dst[2].
   dst_width must be a positive multiple of 8. */
function ms_line_scale_8
	push {r4-r12,lr}               /* lr is used as a normal register below */
	ldr lr , [sp ,#40]             /* r4-r12+lr = 10 regs, 40 = 10*4: fetch filter table */

	ldm r1, {r8,r9}                /* r8 = src[0], r9 = src[1] */
	ldr r1, [r1,#8]                /* r1 = src[2] */

	ldm r2, {r10,r11}              /* r10 = dst[0], r11 = dst[1] */
	ldr r2, [r2,#8]                /* r2 = dst[2] */

1:

	ldm r0!, {r4,r5,r6,r7}         /* load 4 grid entries (byte steps) into r4-r7 */

	load_pixels_4_2 d4, d10, r1    /* plane src[2]: x(n) -> d4, x(n+1) -> d10 */
	load_pixels_4_2 d6, d12, r8    /* plane src[0] */
	load_pixels_4_2 d8, d14, r9    /* plane src[1] */

	ldm r0!, {r4,r5,r6,r7}         /* load 4 more grid entries */

	load_pixels_4_2 d5, d11, r1    /* high halves: q2/q3/q4 = x(n), q5/q6/q7 = x(n+1) */
	load_pixels_4_2 d7, d13, r8
	load_pixels_4_2 d9, d15, r9
	/* x(n) = q2,q3,q4   x(n+1) = q5,q6,q7 */
	vld1.16 {q8} , [lr]!           /* load 8 filtering coefficients into q8 */
	/* compute (coef*(x(n+1)-x(n)))>>7 + x(n) for each plane */

	filter_pixels_8 q2 , q5
	filter_pixels_8 q3 , q6
	filter_pixels_8 q4 , q7

	vst1.16 {q2} , [r2]!           /* write 8 interpolated pixels of plane 2 */
	vst1.16 {q3} , [r10]!          /* plane 0 */
	vst1.16 {q4} , [r11]!          /* plane 1 */
	subs r3,r3,#8                  /* 8 pixels processed; decrement width */
	bne 1b
	pop {r4-r12,pc}                /* restore and return (lr popped into pc) */
ENDFUNC



/* Gather 4 16-bit pixels from \src into the lanes of \d_reg, stepping by
   the grid byte offsets held in r4..r7 (post-increment). */
.macro load_pixels_4 d_reg, src
	vld1.16 \d_reg[0], [\src], r4
	vld1.16 \d_reg[1], [\src], r5
	vld1.16 \d_reg[2], [\src], r6
	vld1.16 \d_reg[3], [\src], r7
.endm

/* void ms_line_scale_simple_8(const uint32_t *grid, const uint16_t **src,
                               uint16_t **dst, int dst_width)
   Nearest-pixel horizontal scaling of 3 planes (no interpolation),
   8 output pixels per plane per iteration.
   Plane pointers: r1=src[0], r8=src[1], r9=src[2];
                   r2=dst[0], r10=dst[1], r11=dst[2].
   dst_width must be a positive multiple of 8. */
function ms_line_scale_simple_8
	push {r4-r11}
	ldr r8, [r1,#4]                /* r8 = src[1] */
	ldr r9, [r1,#8]                /* r9 = src[2] */
	ldr r1, [r1]                   /* r1 = src[0] */
	ldr r10, [r2,#4]               /* r10 = dst[1] */
	ldr r11, [r2,#8]               /* r11 = dst[2] */
	ldr r2, [r2]                   /* r2 = dst[0] */
1:
	ldrd r4,r5, [r0],#8            /* load 2 grid entries into r4,r5 */
	ldrd r6,r7, [r0],#8            /* load 2 grid entries into r6,r7 */

	load_pixels_4 d4, r1           /* plane 0 -> low half of q2 */
	load_pixels_4 d6, r8           /* plane 1 -> low half of q3 */
	load_pixels_4 d8, r9           /* plane 2 -> low half of q4 */

	ldrd r4,r5, [r0],#8            /* next 4 grid entries */
	ldrd r6,r7, [r0],#8

	load_pixels_4 d5, r1           /* high halves */
	load_pixels_4 d7, r8
	load_pixels_4 d9, r9

	vst1.16 {q2} , [r2]!           /* write the 8 selected pixels of plane 0 */
	vst1.16 {q3} , [r10]!          /* plane 1 */
	vst1.16 {q4} , [r11]!          /* plane 2 */
	subs r3,r3,#8                  /* 8 pixels processed; decrement width */
	bne 1b
	pop {r4-r11}
	bx lr
ENDFUNC


/* Disabled, unfinished YUV->RGB line converter.  Kept out of the build
   by ".if 0".  NOTE(review): it references a "movrel" macro that is not
   defined anywhere in this file, so it would not assemble as-is. */
.if 0

/* void line_yuv2rgb(uint8_t *y, uint8_t *u, uint8_t *v, int16_t *r, int16_t *g, int16_t *b, int n) */
function line_yuv2rgb
	push {r4-r7}
	ldr r6, [sp, #12]              /* load n into r6 */
	ldr r5, [sp, #16]              /* load b into r5 */
	ldr r4, [sp, #20]              /* load g into r4 */
	vld1.8 d12, [r0]!              /* load 8 y */
	vmovl.u8 q6, d12               /* expand them to 16 bits */
	vmovl.u16 q0 , d12             /* expand first 4 of them to 32 bits into q0 */
	vmovl.u16 q1 , d13             /* expand 4 more of them to 32 bits into q1 */
	vld1.8 d12[0], [r1]!           /* load 4 u */
	vmovl.u8 q6, d12               /* expand them to 16 bits */
	vmovl.u16 q2 , d12             /* expand first 4 of them to 32 bits into q2 */
	vld1.8 d12[0], [r2]!           /* load 4 v */
	vmovl.u8 q6, d12               /* expand them to 16 bits */
	vmovl.u16 q3 , d12             /* expand first 4 of them to 32 bits into q3 */
	/* at this stage: y in q0 and q1, u in q2, v in q3 */
	mov r7 , # 16
	vdup.32 q4, r7
	vsub.s32 q0 , q0, q4           /* remove bias (16) from y */
	vsub.s32 q1 , q1, q4           /* remove bias from y */
	mov r7 , # 128
	vdup.32 q4, r7
	vsub.s32 q2 , q2, q4           /* remove bias (128) from u */
	vsub.s32 q3 , q3, q4           /* remove bias from v */
	movrel r7 , ymult
	vld1.i32 q4 , [r7]
	vmul.s32 q0, q0, q4            /* multiply y by 9535 */
	vmul.s32 q1, q1, q4            /* multiply y by 9535 */
	movrel r7 , rvmult
	vld1.i32 q4 , [r7]
	/* conversion left unfinished */
	pop {r4-r7}
	bx lr
ENDFUNC



.endif
