; vim:filetype=nasm ts=8

; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2001-2009 Josh Coalson
; Copyright (C) 2011-2013 Xiph.Org Foundation
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; - Neither the name of the Xiph.org Foundation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "nasm.h"

	data_section

cglobal FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov

	code_section

; **********************************************************************
;
; Reference C implementation that the routine below mirrors:
;
; unsigned FLAC__fixed_compute_best_predictor(const FLAC__int32 *data, unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1])
; {
; 	FLAC__int32 last_error_0 = data[-1];
; 	FLAC__int32 last_error_1 = data[-1] - data[-2];
; 	FLAC__int32 last_error_2 = last_error_1 - (data[-2] - data[-3]);
; 	FLAC__int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]);
; 	FLAC__int32 error, save;
; 	FLAC__uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0;
; 	unsigned i, order;
;
; 	for(i = 0; i < data_len; i++) {
; 		error  = data[i]     ; total_error_0 += local_abs(error);                      save = error;
; 		error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error;
; 		error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error;
; 		error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error;
; 		error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save;
; 	}
;
; 	if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4))
; 		order = 0;
; 	else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4))
; 		order = 1;
; 	else if(total_error_2 < min(total_error_3, total_error_4))
; 		order = 2;
; 	else if(total_error_3 < total_error_4)
; 		order = 3;
; 	else
; 		order = 4;
;
; 	residual_bits_per_sample[0] = (FLAC__float)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
; 	residual_bits_per_sample[1] = (FLAC__float)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
; 	residual_bits_per_sample[2] = (FLAC__float)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
; 	residual_bits_per_sample[3] = (FLAC__float)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
; 	residual_bits_per_sample[4] = (FLAC__float)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
;
; 	return order;
; }
;
; ----------------------------------------------------------------------
; ia32 version: arguments on the stack, result (order, 0..4) in eax.
; Uses MMX and CMOVcc instructions plus the x87 FPU.
; ebp/ebx/esi/edi are saved and restored; eax/ecx/edx, mm0-mm7 and the
; x87 register stack are overwritten.
; Reads data[-4] .. data[data_len-1]; writes residual_bits_per_sample[0..4].
; ----------------------------------------------------------------------

; Stack offsets once the prologue has run:
; return address (4) + four saved registers (16) + local scratch (16) = 36.
%define FCBP_ARG_DATA		36	; const FLAC__int32 *data
%define FCBP_ARG_DATA_LEN	40	; unsigned data_len
%define FCBP_ARG_RBPS		44	; FLAC__float residual_bits_per_sample[]

; fcbp_try_order REG
;   In:   REG = total_error_N (unsigned), eax = smallest total so far,
;         ebx = N - 1, ebp = best order so far
;   Out:  ebx = N, eax = min(eax, REG), ebp = N if REG <= previous eax
;   "<=" makes ties resolve to the higher order, like the C if/else chain.
%macro fcbp_try_order 1
	inc	ebx
	cmp	%1, eax
	cmovb	eax, %1				; cmov leaves the flags alone, so ...
	cmovbe	ebp, ebx			; ... this still sees the cmp above
%endmacro

; fcbp_store_rbps REG, OFFSET
;   In:   REG = total_error_N, eax = 0, ebx = residual_bits_per_sample,
;         ST0 = data_len, qword [esp] = scratch
;   Out:  dword [ebx + OFFSET] = REG ? log2(ln2 * REG / data_len) : 0.0
;         ST0 = data_len again (x87 stack depth unchanged)
%macro fcbp_store_rbps 2
	test	%1, %1
	jz	%%zero_total
	fld1					; ST = 1.0 data_len
	mov	[esp], %1
	mov	[esp + 4], eax			; scratch = total zero-extended to 64 bits
	fild	qword [esp]			; ST = total 1.0 data_len
	fdiv	st2				; ST = total/data_len 1.0 data_len
	fldln2					; ST = ln2 total/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total/data_len 1.0 data_len
	fyl2x					; ST = 1.0*log2(ln2*total/data_len) data_len
	fstp	dword [ebx + %2]		; store as float; ST = data_len
	jmp	short %%stored
%%zero_total:
	mov	[ebx + %2], eax			; all-zero bits == 0.0f
%%stored:
%endmacro

	ALIGN 16
cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov

	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, byte 16
	; qword [esp] is scratch used to hand 64-bit integers to the FPU

	; Register roles while scanning the samples:
	;   ebx = pointer to the next sample
	;   ecx = samples still to process
	;   mm0 = total_error_1 : total_error_0	(high dword : low dword)
	;   mm1 = total_error_2 : total_error_3
	;   mm2 =       (junk)  : total_error_4
	;   mm3 = last_error_1  : last_error_0
	;   mm4 = last_error_2  : last_error_3

	mov	ecx, [esp + FCBP_ARG_DATA_LEN]
	test	ecx, ecx
	jz	near .empty_input

	; Seed the last_error_* history from the four samples before data[0].
	mov	ebx, [esp + FCBP_ARG_DATA]
	movd	mm3, [ebx - 4]			; mm3 = 0 : data[-1]  (= last_error_0)
	movd	mm2, [ebx - 8]			; mm2 = 0 : data[-2]
	movd	mm1, [ebx - 12]			; mm1 = 0 : data[-3]
	movd	mm0, [ebx - 16]			; mm0 = 0 : data[-4]
	movq	mm5, mm3
	psubd	mm5, mm2			; mm5 = 0 : data[-1] - data[-2]  (= last_error_1)
	punpckldq	mm3, mm5		; mm3 = last_error_1 : last_error_0
	psubd	mm2, mm1			; mm2 = 0 : data[-2] - data[-3]
	psubd	mm5, mm2			; mm5 = 0 : last_error_2
	movq	mm4, mm5
	psubd	mm4, mm2			; \
	paddd	mm4, mm1			;  > mm4 = 0 : last_error_2 - (data[-2] - 2*data[-3] + data[-4])
	psubd	mm4, mm0			; /        = 0 : last_error_3
	punpckldq	mm4, mm5		; mm4 = last_error_2 : last_error_3
	pxor	mm0, mm0			; total_error_1 = total_error_0 = 0
	pxor	mm1, mm1			; total_error_2 = total_error_3 = 0
	pxor	mm2, mm2			; total_error_4 = 0

	ALIGN 16
.per_sample:
	; --- orders 0 and 1 ---
	movd	mm7, [ebx]			; mm7 = 0 : error_0  (the sample itself)
	add	ebx, byte 4
	movq	mm6, mm7
	psubd	mm7, mm3			; low dword = error_0 - last_error_0 = error_1
	punpckldq	mm6, mm7		; mm6 = error_1 : error_0
	movq	mm5, mm6
	movq	mm7, mm6
	psubd	mm5, mm3			; high dword = error_1 - last_error_1 = error_2
	movq	mm3, mm6			; history for the next sample
	psrad	mm6, 31				; per-lane sign mask (0 or -1)
	pxor	mm7, mm6
	psubd	mm7, mm6			; (x ^ mask) - mask = |x| in both lanes
	paddd	mm0, mm7			; total_error_1 : total_error_0 += |error_1| : |error_0|

	; --- orders 2 and 3 ---
	movq	mm6, mm5			; high dword = error_2
	psubd	mm5, mm4			; high dword = error_2 - last_error_2 = error_3
	punpckhdq	mm5, mm6		; mm5 = error_2 : error_3
	movq	mm7, mm5
	movq	mm6, mm5
	psubd	mm5, mm4			; low dword = error_3 - last_error_3 = error_4
	movq	mm4, mm6			; history for the next sample
	psrad	mm6, 31
	pxor	mm7, mm6
	psubd	mm7, mm6			; mm7 = |error_2| : |error_3|
	paddd	mm1, mm7			; total_error_2 : total_error_3 += ...

	; --- order 4 (only the low dword is meaningful) ---
	movq	mm6, mm5
	psrad	mm5, 31
	pxor	mm6, mm5
	psubd	mm6, mm5			; low dword = |error_4|
	paddd	mm2, mm6			; total_error_4 += |error_4|

	dec	ecx
	jnz	short .per_sample

	; Move the five totals into integer registers.
	movq	mm3, mm0
	movd	edi, mm2			; edi = total_error_4
	movd	esi, mm1			; esi = total_error_3
	movd	eax, mm0			; eax = total_error_0 (becomes the running minimum)
	punpckhdq	mm1, mm1
	punpckhdq	mm3, mm3
	movd	edx, mm1			; edx = total_error_2
	movd	ecx, mm3			; ecx = total_error_1

	; Choose the order with the smallest total; start from order 0.
	xor	ebx, ebx			; ebx = order being considered
	xor	ebp, ebp			; ebp = best order so far
	fcbp_try_order ecx			; order 1
	fcbp_try_order edx			; order 2
	fcbp_try_order esi			; order 3
	fcbp_try_order edi			; order 4
	movd	ebx, mm0			; ebx = total_error_0 (eax no longer holds it)
	emms					; leave MMX state before touching the x87 stack

	; residual_bits_per_sample[N] = total_error_N ? log2(ln2 * total_error_N / data_len) : 0.0
	xor	eax, eax			; eax = 0: high half of the 64-bit temp, and 0.0f
	fild	dword [esp + FCBP_ARG_DATA_LEN]	; ST = data_len (loaded as signed: assumes data_len < 2^31)

	; Order 0 is written out by hand: ebx carries total_error_0 on entry and
	; must end up holding the output pointer on both paths.
	test	ebx, ebx
	jz	.order0_zero
	fld1					; ST = 1.0 data_len
	mov	[esp], ebx
	mov	[esp + 4], eax			; scratch = (FLAC__uint64)total_error_0
	mov	ebx, [esp + FCBP_ARG_RBPS]
	fild	qword [esp]			; ST = total_error_0 1.0 data_len
	fdiv	st2				; ST = total_error_0/data_len 1.0 data_len
	fldln2					; ST = ln2 total_error_0/data_len 1.0 data_len
	fmulp	st1				; ST = ln2*total_error_0/data_len 1.0 data_len
	fyl2x					; ST = log2(ln2*total_error_0/data_len) data_len
	fstp	dword [ebx]			; residual_bits_per_sample[0]; ST = data_len
	jmp	short .order0_done
.order0_zero:
	mov	ebx, [esp + FCBP_ARG_RBPS]
	mov	[ebx], eax			; residual_bits_per_sample[0] = 0.0
.order0_done:
	fcbp_store_rbps ecx, 4			; residual_bits_per_sample[1]
	fcbp_store_rbps edx, 8			; residual_bits_per_sample[2]
	fcbp_store_rbps esi, 12			; residual_bits_per_sample[3]
	fcbp_store_rbps edi, 16			; residual_bits_per_sample[4]

	fstp	st0				; discard data_len
	jmp	short .epilogue

.empty_input:
	; data_len == 0: every residual_bits_per_sample[] entry is 0.0 and,
	; as in the C code, the comparison chain falls through to order 4.
	xor	ebp, ebp
	mov	edi, [esp + FCBP_ARG_RBPS]
	mov	[edi], ebp
	mov	[edi + 4], ebp
	mov	[edi + 8], ebp
	mov	[edi + 12], ebp
	mov	[edi + 16], ebp
	add	ebp, byte 4			; order = 4

.epilogue:
	mov	eax, ebp			; return order
	add	esp, byte 16
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; end