Lines Matching refs:F32vec4

15 inline F32vec4 convert_high(const Iu16vec8 &a) { in convert_high()
19 inline F32vec4 convert_low(const Iu16vec8 &a) { in convert_low()
24 inline Iu16vec8 F32vec4toIu16vec8(const F32vec4 &hi, const F32vec4 &lo) { in F32vec4toIu16vec8()
31 inline Is16vec8 F32vec4toIs16vec8(const F32vec4 &hi, const F32vec4 &lo) { in F32vec4toIs16vec8()
42 static inline F32vec4 v_fast_exp(F32vec4 val) in v_fast_exp()
44 const F32vec4 fast_exp_a((1 << 23)/M_LN2); in v_fast_exp()
45 const F32vec4 fast_exp_b_c(127.0f * (1 << 23) - 405000); in v_fast_exp()
46 const F32vec4 v_zero = _mm_setzero_ps(); in v_fast_exp()
47 const F32vec4 v_m16 = F32vec4(-16.0f); in v_fast_exp()
49 F32vec4 result = (__m128) _mm_cvtps_epi32(fast_exp_a * val + fast_exp_b_c); in v_fast_exp()
101 const F32vec4 v_zero = _mm_setzero_ps(); in separable_bf_mono_tile()
102 const F32vec4 v_one = F32vec4(1.0f); in separable_bf_mono_tile()
103 const F32vec4 v_ffff = F32vec4((float) 0xffff); in separable_bf_mono_tile()
106 F32vec4 v_num = v_zero; in separable_bf_mono_tile()
107 F32vec4 v_denom = v_zero; in separable_bf_mono_tile()
108 const F32vec4 v_I_s0 = _mm_loadu_ps(&rbuf[x]); in separable_bf_mono_tile()
109 const F32vec4 v_Ar(Ar); in separable_bf_mono_tile()
112 const F32vec4 v_I_s = _mm_loadu_ps(&rbuf[k-wr + x]); in separable_bf_mono_tile()
113 const F32vec4 v_D_sq = SQR(v_I_s - v_I_s0); in separable_bf_mono_tile()
114 const F32vec4 v_f = v_fast_exp(v_Ar * v_D_sq - F32vec4(kernel[k])); in separable_bf_mono_tile()
173 const F32vec4 v_zero(0.0f); in separable_bf_mono_tile()
174 const F32vec4 v_one(1.0f); in separable_bf_mono_tile()
175 const F32vec4 v_ffff((float) 0xffff); in separable_bf_mono_tile()
179 const F32vec4 b0 = _mm_loadu_ps(&cbuf[y]); in separable_bf_mono_tile()
180 const F32vec4 v_Ar(Ar); in separable_bf_mono_tile()
183 F32vec4 num = v_zero; in separable_bf_mono_tile()
184 F32vec4 denom = v_zero; in separable_bf_mono_tile()
187 const F32vec4 b = _mm_loadu_ps(&cbuf[(k-wr) + y]); in separable_bf_mono_tile()
188 const F32vec4 D_sq = SQR(b - b0); in separable_bf_mono_tile()
189 const F32vec4 f = v_fast_exp(v_Ar * D_sq - F32vec4(kernel[k])); in separable_bf_mono_tile()
246 const F32vec4 N = _mm_load_ps((float*)_##N);
252 inline void XYZtoF32vec4(F32vec4& x, F32vec4& y, F32vec4& z, const F32vec4& a, const F32vec4& b, co… in XYZtoF32vec4()
285 inline void F32vec4toXYZ(F32vec4& a, F32vec4& b, F32vec4& c, const F32vec4& x, const F32vec4& y, co… in F32vec4toXYZ()
291 F32vec4 ta = _mm_shuffle_ps(x, x, _MM_SHUFFLE(1,2,3,0)); in F32vec4toXYZ()
292 F32vec4 tb = _mm_shuffle_ps(y, y, _MM_SHUFFLE(2,3,0,1)); in F32vec4toXYZ()
293 F32vec4 tc = _mm_shuffle_ps(z, z, _MM_SHUFFLE(3,0,1,2)); in F32vec4toXYZ()
307 const F32vec4 v_ffff((float) 0xffff); in planar_YST_to_interleaved_RGB()
308 const F32vec4 v_zero(0.0f); in planar_YST_to_interleaved_RGB()
309 const F32vec4 v_05(0.5f); in planar_YST_to_interleaved_RGB()
311 F32vec4 v_yst_to_rgb[9]; in planar_YST_to_interleaved_RGB()
314 v_yst_to_rgb[i] = F32vec4(yst_to_rgb[i]); in planar_YST_to_interleaved_RGB()
325 F32vec4 y = _mm_loadu_ps(&buf_y[idx]); in planar_YST_to_interleaved_RGB()
326 F32vec4 s = _mm_loadu_ps(&buf_s[idx]) - v_05; in planar_YST_to_interleaved_RGB()
327 F32vec4 t = _mm_loadu_ps(&buf_t[idx]) - v_05; in planar_YST_to_interleaved_RGB()
329 F32vec4 v_rgb[3]; in planar_YST_to_interleaved_RGB()
335 F32vec4 a1, b1, c1; in planar_YST_to_interleaved_RGB()
363 F32vec4 a2, b2, c2; in planar_YST_to_interleaved_RGB()
435 const F32vec4 v_inv_norm(inv_norm); in interleaved_RGB_to_planar_YST()
436 const F32vec4 v_05(0.5f); in interleaved_RGB_to_planar_YST()
438 F32vec4 v_rgb_to_yst[3][3]; in interleaved_RGB_to_planar_YST()
442 v_rgb_to_yst[y][x] = F32vec4(rgb_to_yst[3 * y + x]); in interleaved_RGB_to_planar_YST()
461 F32vec4 src4_1 = convert_low(src8_1); // R1 B0 G0 R0 -> a1 in interleaved_RGB_to_planar_YST()
462 F32vec4 src4_2 = convert_high(src8_1); // G2 R2 B1 G1 -> b1 in interleaved_RGB_to_planar_YST()
463 F32vec4 src4_3 = convert_low(src8_2); // B3 G3 R3 B2 -> c1 in interleaved_RGB_to_planar_YST()
465 F32vec4 src4_4 = convert_high(src8_2); // R5 B4 G4 R4 -> a2 in interleaved_RGB_to_planar_YST()
466 F32vec4 src4_5 = convert_low(src8_3); // G6 R6 B5 G5 -> b2 in interleaved_RGB_to_planar_YST()
467 F32vec4 src4_6 = convert_high(src8_3); // B7 G7 R7 B6 -> c2 in interleaved_RGB_to_planar_YST()
469 F32vec4 src4_rgb[3]; in interleaved_RGB_to_planar_YST()
474 F32vec4 v_r = v_inv_norm * src4_rgb[0]; in interleaved_RGB_to_planar_YST()
475 F32vec4 v_g = v_inv_norm * src4_rgb[1]; in interleaved_RGB_to_planar_YST()
476 F32vec4 v_b = v_inv_norm * src4_rgb[2]; in interleaved_RGB_to_planar_YST()
478 F32vec4 y = v_rgb_to_yst[0][0] * v_r + v_rgb_to_yst[0][1] * v_g + v_rgb_to_yst[0][2] * v_b; in interleaved_RGB_to_planar_YST()
479 F32vec4 s = v_rgb_to_yst[1][0] * v_r + v_rgb_to_yst[1][1] * v_g + v_rgb_to_yst[1][2] * v_b + v_05; in interleaved_RGB_to_planar_YST()
480 F32vec4 t = v_rgb_to_yst[2][0] * v_r + v_rgb_to_yst[2][1] * v_g + v_rgb_to_yst[2][2] * v_b + v_05; in interleaved_RGB_to_planar_YST()
585 const F32vec4 v_zero = _mm_setzero_ps(); in separable_bf_chroma_tile()
586 const F32vec4 v_one = F32vec4(1.0f); in separable_bf_chroma_tile()
590 const F32vec4 s0_a = _mm_loadu_ps(&rbuf_a[x]); in separable_bf_chroma_tile()
591 const F32vec4 s0_b = _mm_loadu_ps(&rbuf_b[x]); in separable_bf_chroma_tile()
592 const F32vec4 v_Ar(Ar); in separable_bf_chroma_tile()
595 F32vec4 a_num = v_zero; in separable_bf_chroma_tile()
596 F32vec4 b_num = v_zero; in separable_bf_chroma_tile()
597 F32vec4 denom = v_zero; in separable_bf_chroma_tile()
602 const F32vec4 s_a = _mm_loadu_ps(&rbuf_a[idx]); in separable_bf_chroma_tile()
603 const F32vec4 s_b = _mm_loadu_ps(&rbuf_b[idx]); in separable_bf_chroma_tile()
605 const F32vec4 D_sq = SQR(s_a - s0_a) + SQR(s_b - s0_b); in separable_bf_chroma_tile()
606 const F32vec4 f = v_fast_exp(v_Ar * D_sq - F32vec4(kernel[k])); in separable_bf_chroma_tile()
684 const F32vec4 v_zero(0.0f); in separable_bf_chroma_tile()
685 const F32vec4 v_one(1.0f); in separable_bf_chroma_tile()
686 const F32vec4 v_ffff((float) 0xffff); in separable_bf_chroma_tile()
690 const F32vec4 b0_a = _mm_loadu_ps(&cbuf_a[y]); in separable_bf_chroma_tile()
691 const F32vec4 b0_b = _mm_loadu_ps(&cbuf_b[y]); in separable_bf_chroma_tile()
692 const F32vec4 v_Ar(Ar); in separable_bf_chroma_tile()
695 F32vec4 a_num = v_zero; in separable_bf_chroma_tile()
696 F32vec4 b_num = v_zero; in separable_bf_chroma_tile()
697 F32vec4 denom = v_zero; in separable_bf_chroma_tile()
702 const F32vec4 b_a = _mm_loadu_ps(&cbuf_a[idx]); in separable_bf_chroma_tile()
703 const F32vec4 b_b = _mm_loadu_ps(&cbuf_b[idx]); in separable_bf_chroma_tile()
705 const F32vec4 D_sq = SQR(b_a - b0_a) + SQR(b_b - b0_b); in separable_bf_chroma_tile()
706 const F32vec4 f = v_fast_exp(v_Ar * D_sq - F32vec4(kernel[k])); in separable_bf_chroma_tile()