1 ///////////////////////////////////////////////////////////////////////
2 // File: dotproductsse.cpp
3 // Description: Architecture-specific dot-product function.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2015, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 ///////////////////////////////////////////////////////////////////////
17
18 #if !defined(__SSE4_1__)
19 # if defined(__i686__) || defined(__x86_64__)
20 # error Implementation only for SSE 4.1 capable architectures
21 # endif
22 #else
23
24 # include <emmintrin.h>
25 # include <smmintrin.h>
26 # include <cstdint>
27 # include "dotproduct.h"
28
29 namespace tesseract {
30
31 // Computes and returns the dot product of the n-vectors u and v.
32 // Uses Intel SSE intrinsics to access the SIMD instruction set.
33 #if defined(FAST_FLOAT)
DotProductSSE(const float * u,const float * v,int n)34 float DotProductSSE(const float *u, const float *v, int n) {
35 int max_offset = n - 4;
36 int offset = 0;
37 // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
38 // v, and multiplying them together in parallel.
39 __m128 sum = _mm_setzero_ps();
40 if (offset <= max_offset) {
41 offset = 4;
42 // Aligned load is reputedly faster but requires 16 byte aligned input.
43 if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
44 (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
45 // Use aligned load.
46 sum = _mm_load_ps(u);
47 __m128 floats2 = _mm_load_ps(v);
48 // Multiply.
49 sum = _mm_mul_ps(sum, floats2);
50 while (offset <= max_offset) {
51 __m128 floats1 = _mm_load_ps(u + offset);
52 floats2 = _mm_load_ps(v + offset);
53 floats1 = _mm_mul_ps(floats1, floats2);
54 sum = _mm_add_ps(sum, floats1);
55 offset += 4;
56 }
57 } else {
58 // Use unaligned load.
59 sum = _mm_loadu_ps(u);
60 __m128 floats2 = _mm_loadu_ps(v);
61 // Multiply.
62 sum = _mm_mul_ps(sum, floats2);
63 while (offset <= max_offset) {
64 __m128 floats1 = _mm_loadu_ps(u + offset);
65 floats2 = _mm_loadu_ps(v + offset);
66 floats1 = _mm_mul_ps(floats1, floats2);
67 sum = _mm_add_ps(sum, floats1);
68 offset += 4;
69 }
70 }
71 }
72 // Add the 4 sums in sum horizontally.
73 #if 0
74 alignas(32) float tmp[4];
75 _mm_store_ps(tmp, sum);
76 float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
77 #else
78 __m128 zero = _mm_setzero_ps();
79 // https://www.felixcloutier.com/x86/haddps
80 sum = _mm_hadd_ps(sum, zero);
81 sum = _mm_hadd_ps(sum, zero);
82 // Extract the low result.
83 float result = _mm_cvtss_f32(sum);
84 #endif
85 // Add on any left-over products.
86 while (offset < n) {
87 result += u[offset] * v[offset];
88 ++offset;
89 }
90 return result;
91 }
92 #else
93 double DotProductSSE(const double *u, const double *v, int n) {
94 int max_offset = n - 2;
95 int offset = 0;
96 // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
97 // v, and multiplying them together in parallel.
98 __m128d sum = _mm_setzero_pd();
99 if (offset <= max_offset) {
100 offset = 2;
101 // Aligned load is reputedly faster but requires 16 byte aligned input.
102 if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
103 (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
104 // Use aligned load.
105 sum = _mm_load_pd(u);
106 __m128d floats2 = _mm_load_pd(v);
107 // Multiply.
108 sum = _mm_mul_pd(sum, floats2);
109 while (offset <= max_offset) {
110 __m128d floats1 = _mm_load_pd(u + offset);
111 floats2 = _mm_load_pd(v + offset);
112 offset += 2;
113 floats1 = _mm_mul_pd(floats1, floats2);
114 sum = _mm_add_pd(sum, floats1);
115 }
116 } else {
117 // Use unaligned load.
118 sum = _mm_loadu_pd(u);
119 __m128d floats2 = _mm_loadu_pd(v);
120 // Multiply.
121 sum = _mm_mul_pd(sum, floats2);
122 while (offset <= max_offset) {
123 __m128d floats1 = _mm_loadu_pd(u + offset);
124 floats2 = _mm_loadu_pd(v + offset);
125 offset += 2;
126 floats1 = _mm_mul_pd(floats1, floats2);
127 sum = _mm_add_pd(sum, floats1);
128 }
129 }
130 }
131 // Add the 2 sums in sum horizontally.
132 sum = _mm_hadd_pd(sum, sum);
133 // Extract the low result.
134 double result = _mm_cvtsd_f64(sum);
135 // Add on any left-over products.
136 while (offset < n) {
137 result += u[offset] * v[offset];
138 ++offset;
139 }
140 return result;
141 }
142 #endif
143
144 } // namespace tesseract.
145
146 #endif
147