///////////////////////////////////////////////////////////////////////
// File:        dotproductsse.cpp
// Description: Architecture-specific dot-product function.
// Author:      Ray Smith
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#if !defined(__SSE4_1__)
#  if defined(__i686__) || defined(__x86_64__)
#    error Implementation only for SSE 4.1 capable architectures
#  endif
#else

#  include <emmintrin.h>
#  include <smmintrin.h>
#  include <cstdint>
#  include "dotproduct.h"

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
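// Caller-side usage sketch (illustrative only, not part of this file; a
// FAST_FLOAT build compiles the float overload, otherwise the double one):
//   std::vector<float> u(n), v(n);
//   float dot = DotProductSSE(u.data(), v.data(), n);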
#if defined(FAST_FLOAT)
float DotProductSSE(const float *u, const float *v, int n) {
  int max_offset = n - 4;
  int offset = 0;
  // Accumulate a set of 4 sums in sum by loading 4 values at a time from each
  // of u and v and multiplying them together in parallel.
  __m128 sum = _mm_setzero_ps();
  if (offset <= max_offset) {
    offset = 4;
    // Aligned load is reputedly faster but requires 16 byte aligned input.
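    // (_mm_load_ps faults on a misaligned address, so check both pointers.)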
    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
      // Use aligned load.
      sum = _mm_load_ps(u);
      __m128 floats2 = _mm_load_ps(v);
      // Multiply.
      sum = _mm_mul_ps(sum, floats2);
      while (offset <= max_offset) {
        __m128 floats1 = _mm_load_ps(u + offset);
        floats2 = _mm_load_ps(v + offset);
        floats1 = _mm_mul_ps(floats1, floats2);
        sum = _mm_add_ps(sum, floats1);
        offset += 4;
      }
    } else {
      // Use unaligned load.
      sum = _mm_loadu_ps(u);
      __m128 floats2 = _mm_loadu_ps(v);
      // Multiply.
      sum = _mm_mul_ps(sum, floats2);
      while (offset <= max_offset) {
        __m128 floats1 = _mm_loadu_ps(u + offset);
        floats2 = _mm_loadu_ps(v + offset);
        floats1 = _mm_mul_ps(floats1, floats2);
        sum = _mm_add_ps(sum, floats1);
        offset += 4;
      }
    }
  }
  // Add the 4 sums in sum horizontally.
#if 0
  alignas(32) float tmp[4];
  _mm_store_ps(tmp, sum);
  float result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
#else
  __m128 zero = _mm_setzero_ps();
  // https://www.felixcloutier.com/x86/haddps
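  // hadd adds adjacent lanes: the first call leaves {s0+s1, s2+s3, 0, 0},
  // the second collapses that to {s0+s1+s2+s3, 0, 0, 0}.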
  sum = _mm_hadd_ps(sum, zero);
  sum = _mm_hadd_ps(sum, zero);
  // Extract the low result.
  float result = _mm_cvtss_f32(sum);
#endif
  // Add on any left-over products.
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}
#else
double DotProductSSE(const double *u, const double *v, int n) {
  int max_offset = n - 2;
  int offset = 0;
  // Accumulate a set of 2 sums in sum by loading 2 values at a time from each
  // of u and v and multiplying them together in parallel.
  __m128d sum = _mm_setzero_pd();
  if (offset <= max_offset) {
    offset = 2;
    // Aligned load is reputedly faster but requires 16 byte aligned input.
    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
      // Use aligned load.
      sum = _mm_load_pd(u);
      __m128d floats2 = _mm_load_pd(v);
      // Multiply.
      sum = _mm_mul_pd(sum, floats2);
      while (offset <= max_offset) {
        __m128d floats1 = _mm_load_pd(u + offset);
        floats2 = _mm_load_pd(v + offset);
        offset += 2;
        floats1 = _mm_mul_pd(floats1, floats2);
        sum = _mm_add_pd(sum, floats1);
      }
    } else {
      // Use unaligned load.
      sum = _mm_loadu_pd(u);
      __m128d floats2 = _mm_loadu_pd(v);
      // Multiply.
      sum = _mm_mul_pd(sum, floats2);
      while (offset <= max_offset) {
        __m128d floats1 = _mm_loadu_pd(u + offset);
        floats2 = _mm_loadu_pd(v + offset);
        offset += 2;
        floats1 = _mm_mul_pd(floats1, floats2);
        sum = _mm_add_pd(sum, floats1);
      }
    }
  }
  // Add the 2 sums in sum horizontally.
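  // _mm_hadd_pd(sum, sum) yields {s0+s1, s0+s1}, so the low lane is the total.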
  sum = _mm_hadd_pd(sum, sum);
  // Extract the low result.
  double result = _mm_cvtsd_f64(sum);
  // Add on any left-over products.
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}
#endif

} // namespace tesseract.

#endif