1 /* -*- C++ -*-
2 *
3 * graphics_x86_common.h - shared code between SSE graphics backends
4 *
5 * Copyright (c) 2021 TellowKrinkle
6 *
7 * tellowkrinkle@gmail.com
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, see <http://www.gnu.org/licenses/>
21 * or write to the Free Software Foundation, Inc.,
22 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25 #ifdef USE_X86_GFX
26
27 #include <SDL.h>
28 #include <immintrin.h>
29
30 #include "graphics_common.h"
31
32 /// 0x0000gg?? -> 0x00gg00gg
extractFromGTo16L(__m128i v)33 static HELPER_FN __m128i extractFromGTo16L(__m128i v) {
34 #ifdef __SSSE3__
35 __m128i mask = _mm_setr_epi8(1, 0x80, 1, 0x80, 5, 0x80, 5, 0x80, 9, 0x80, 9, 0x80, 13, 0x80, 13, 0x80);
36 return _mm_shuffle_epi8(v, mask);
37 #else
38 __m128i shifted = _mm_srli_epi32(v, 8); // 0x000000gg
39 return _mm_or_si128(shifted, _mm_slli_epi32(shifted, 16));
40 #endif
41 }
42
43 /// 0x????gg?? -> 0x000000gg
extractG(__m128i v)44 static HELPER_FN __m128i extractG(__m128i v) {
45 #ifdef __SSSE3__
46 __m128i mask = _mm_setr_epi8(1, 0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 9, 0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80);
47 return _mm_shuffle_epi8(v, mask);
48 #else
49 __m128i shifted = _mm_srli_epi32(v, 8);
50 return _mm_and_si128(shifted, _mm_set1_epi32(0xFF));
51 #endif
52 }
53
54 /// 0x000000bb -> 0x00bb00bb
extractBTo16L(__m128i v)55 static HELPER_FN __m128i extractBTo16L(__m128i v) {
56 #ifdef __SSSE3__
57 __m128i mask = _mm_setr_epi8(0, 0x80, 0, 0x80, 4, 0x80, 4, 0x80, 8, 0x80, 8, 0x80, 12, 0x80, 12, 0x80);
58 return _mm_shuffle_epi8(v, mask);
59 #else
60 return _mm_or_si128(_mm_slli_epi32(v, 16), v);
61 #endif
62 }
63
/// Alpha-blends `length` 32-bit BGRA pixels from src_buffer onto dst_buffer.
/// The per-pixel blend factor is (src_alpha * alpha) >> 8, i.e. the source
/// pixel's own alpha channel scaled by the global `alpha`.
/// Leading pixels are handled scalar-wise until dst_buffer is 16-byte
/// aligned, the bulk is processed 4 pixels at a time with SSE2, and the
/// trailing <4 pixels fall back to the scalar macro.
/// NOTE(review): `alphap` is only advanced here (+16 per 4 pixels) and is
/// presumably read/advanced inside BLEND_PIXEL()/BASIC_BLEND() for the
/// scalar paths — confirm against the macro definitions in graphics_common.h.
static HELPER_FN void imageFilterBlend_SSE_Common(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length) {
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst_buffer
    while( (((long)dst_buffer & 0xF) > 0) && (n > 0) ) {
        BLEND_PIXEL();
        --n; ++dst_buffer; ++src_buffer;
    }

    // Do bulk of processing using SSE2 (process 4 32bit (BGRA) pixels)
    // create basic bitmasks 0x00FF00FF, 0x000000FF
    __m128i bmask2 = _mm_set1_epi32(0x00FF00FF);
    while (n >= 4) {
        // alpha1 = ((src_argb >> 24) * alpha) >> 8
        __m128i a = _mm_set1_epi32(alpha);
        __m128i buf = _mm_loadu_si128((__m128i*)src_buffer);
        __m128i tmp = _mm_srli_epi32(buf, 24);
        a = _mm_mullo_epi16(a, tmp);
        // double-up alpha1 (0x0000vvxx -> 0x00vv00vv)
        a = extractFromGTo16L(a);
        // rb = (src_argb & bmask2) * alpha1
        tmp = _mm_and_si128(buf, bmask2);
        __m128i rb = _mm_mullo_epi16(a, tmp);
        // g = ((src_argb >> 8) & bmask) * alpha1
        tmp = extractG(buf);
        __m128i g = _mm_mullo_epi16(a, tmp);
        // alpha2 = alpha1 ^ bmask2 (flips each 8-bit factor to 255 - factor)
        a = _mm_xor_si128(a, bmask2);
        // dst_buffer was aligned by the loop above, so an aligned load is safe
        buf = _mm_load_si128((__m128i*)dst_buffer);
        // rb += (dst_argb & bmask2) * alpha2
        tmp = _mm_and_si128(buf, bmask2);
        tmp = _mm_mullo_epi16(a, tmp);
        rb = _mm_add_epi16(rb, tmp);
        // rb = 0xbbxxrrxx -> 0x00bb00rr
        rb = _mm_srli_epi16(rb, 8);
        // g += ((dst_argb >> 8) & bmask) * alpha2
        tmp = extractG(buf);
        tmp = _mm_mullo_epi16(a, tmp);
        g = _mm_add_epi32(g, tmp);
        // g = g & ~bmask2 (keep the blended green in bits 8-15 of each lane)
        g = _mm_andnot_si128(bmask2, g);
        // dst_argb = rb | g
        tmp = _mm_or_si128(rb, g);
        _mm_store_si128((__m128i*)dst_buffer, tmp);

        n -= 4; src_buffer += 4; dst_buffer += 4; alphap += 16;
    }

    // If any pixels are left over, deal with them individually
    // NOTE(review): the pre-increment suggests BASIC_BLEND() decrements n
    // before testing it — confirm the macro's loop shape in graphics_common.h.
    ++n;
    BASIC_BLEND();
}
116
/// Blends s1 over s2 into dst within `rect`, weighting each pixel by a
/// factor derived from a tiling mask surface: mask2 = saturating
/// (mask_value - mask_pixel_low_byte), clamped to 0xFF, and mask1 = 255 -
/// mask2 (see the scalar reference blendMaskOnePixel). The mask wraps in
/// both x and y, so it can be smaller than `rect`.
/// Returns false when the mask is too narrow for the vector path; the
/// caller must then fall back to a scalar implementation.
/// Assumes all four surfaces are 32-bit pixel formats addressable through
/// getPointerToRow<Uint32> — presumably guaranteed upstream; confirm callers.
static HELPER_FN bool alphaMaskBlend_SSE_Common(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value)
{
    // The vector loop loads 4 mask pixels at once; a mask narrower than 4
    // pixels can't satisfy even the wrapped gather below.
    if (mask_surface->w < 4) {
        return false;
    }

    int end_x = rect.x + rect.w;
    int end_y = rect.y + rect.h;
    int mask_height = mask_surface->h;
    int mask_width = mask_surface->w;

    // Starting offsets into the mask so its tiling stays phase-locked to
    // absolute surface coordinates rather than to the rect origin.
    int mask_off_base_y = rect.y % mask_surface->h;
    int mask_off_base_x = rect.x % mask_surface->w;
    for (int y = rect.y, my = mask_off_base_y; y < end_y; y++, my++) {
        if (my >= mask_height) { my = 0; } // wrap mask row
        Uint32* s1p = getPointerToRow<Uint32>(s1, y);
        Uint32* s2p = getPointerToRow<Uint32>(s2, y);
        Uint32* dstp = getPointerToRow<Uint32>(dst, y);
        Uint32* mask_buf = getPointerToRow<Uint32>(mask_surface, my);

        // Scalar prologue until dstp + x is 16-byte aligned for the
        // aligned store in the vector loop.
        int x = rect.x, mx = mask_off_base_x;
        while (!is_aligned(dstp + x, 16) && (x < end_x)) {
            dstp[x] = blendMaskOnePixel(s1p[x], s2p[x], mask_buf[mx], mask_value);
            x++, mx++;
            if (mx >= mask_width) { mx = 0; }
        }
        __m128i mask_value_v = _mm_set1_epi32(mask_value);
        __m128i mask_000000ff = _mm_set1_epi32(0x000000FF);
        __m128i mask_00ff00ff = _mm_set1_epi32(0x00FF00FF);
        // Vector loop: 4 pixels per iteration.
        while (x < (end_x - 3)) {
            __m128i s1v = _mm_loadu_si128((__m128i*)(s1p + x));
            __m128i s2v = _mm_loadu_si128((__m128i*)(s2p + x));
            __m128i mskv;
            if (__builtin_expect(mx + 3 < mask_width, true)) {
                // All 4 mask pixels are in-bounds: load directly.
                mskv = _mm_loadu_si128((__m128i*)(mask_buf + mx));
            } else {
                // The 4-pixel window crosses the mask's right edge: gather
                // with wrap-around into an aligned temporary.
                __attribute__((aligned(16))) Uint32 tmp[4];
                for (int i = 0; i < 4; i++) {
                    if (mx + i < mask_width) {
                        tmp[i] = mask_buf[mx + i];
                    } else {
                        tmp[i] = mask_buf[mx + i - mask_width];
                    }
                }
                mskv = _mm_load_si128((__m128i*)tmp);
            }
            // Keep only the mask's low byte per pixel.
            mskv = _mm_and_si128(mskv, mask_000000ff);
            // mask2 = saturate(mask_value - mask_pixel), then clamp to 0xFF.
            __m128i mask2 = _mm_subs_epu16(mask_value_v, mskv);
            mask2 = _mm_min_epi16(mask2, mask_000000ff); // min(mask2, 0xFF)
#ifdef __clang__
            asm("":"+x"(mask2)::); // clang optimization makes things worse, block it
#endif
            mask2 = extractBTo16L(mask2); // Spread alpha for multiplying (0x00aa00aa)
            // mask1 = 255 - mask2 in every 8-bit factor position.
            __m128i mask1 = _mm_xor_si128(mask2, mask_00ff00ff);
            // out_rb = ((s1v & rbmask) * mask1 + (s2v & rbmask) * mask2) >> 8
            __m128i s1v_rb = _mm_mullo_epi16(mask1, _mm_and_si128(s1v, mask_00ff00ff));
            __m128i s2v_rb = _mm_mullo_epi16(mask2, _mm_and_si128(s2v, mask_00ff00ff));
            __m128i out_rb = _mm_srli_epi16(_mm_add_epi16(s1v_rb, s2v_rb), 8);
            // out_g = (((s1v & gmask) >> 8) * mask1 + ((s2v & gmask) >> 8) * mask2) & gmask
            __m128i s1v_g = _mm_mullo_epi16(mask1, extractG(s1v));
            __m128i s2v_g = _mm_mullo_epi16(mask2, extractG(s2v));
            __m128i out_g = _mm_andnot_si128(mask_00ff00ff, _mm_add_epi16(s1v_g, s2v_g));
            // Aligned store is safe thanks to the scalar prologue.
            _mm_store_si128((__m128i*)(dstp + x), _mm_or_si128(out_rb, out_g));

            x += 4;
            mx += 4;
            // mx can overshoot by at most 3; one subtraction re-wraps it
            // because mask_width >= 4.
            if (mx >= mask_width) { mx -= mask_width; }
        }
        // Scalar epilogue for the remaining <4 pixels.
        while (x < end_x) {
            dstp[x] = blendMaskOnePixel(s1p[x], s2p[x], mask_buf[mx], mask_value);
            x++, mx++;
            if (mx >= mask_width) { mx = 0; }
        }
    }
    return true;
}
193
/// Blends s1 over s2 into dst within `rect` using one constant blend factor
/// `mask_value` for every pixel (the mask argument of blendMaskOnePixel is
/// fixed at 0, so mask2 == mask_value for the whole rect).
/// NOTE(review): `_mm_set1_epi16(mask_value)` truncates to 16 bits and the
/// mask1 derivation assumes mask_value <= 0xFF — presumably the caller
/// clamps it; confirm upstream.
/// Unlike alphaMaskBlend_SSE_Common, `x` here is a BYTE offset into the
/// rows (hence the *4 scaling and the 4/16-byte strides).
static HELPER_FN void alphaMaskBlendConst_SSE_Common(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, const SDL_Rect& rect, Uint32 mask_value)
{
    int end_x = (rect.x + rect.w) * 4;
    int end_y = rect.y + rect.h;
    for (int y = rect.y; y < end_y; y++) {
        char* s1p = getPointerToRow<char>(s1, y);
        char* s2p = getPointerToRow<char>(s2, y);
        char* dstp = getPointerToRow<char>(dst, y);

        // Scalar prologue: advance one pixel (4 bytes) at a time until the
        // destination is 16-byte aligned for the vector store.
        int x = rect.x * 4;
        for (; !is_aligned(dstp + x, 16) && (x < end_x); x += 4) {
            *(Uint32*)(dstp + x) = blendMaskOnePixel(*(Uint32*)(s1p + x), *(Uint32*)(s2p + x), 0, mask_value);
        }
        __m128i mask_000000ff = _mm_set1_epi32(0x000000FF);
        __m128i mask_00ff00ff = _mm_set1_epi32(0x00FF00FF);
        // Constant factors: mask2 applies to s2, mask1 = 255 - mask2 to s1.
        __m128i mask2 = _mm_set1_epi16(mask_value);
        __m128i mask1 = _mm_xor_si128(mask2, mask_00ff00ff);
        // Vector loop: 16 bytes (4 pixels) per iteration.
        for (; x < (end_x - 15); x += 16) {
            __m128i s1v = _mm_loadu_si128((__m128i*)(s1p + x));
            __m128i s2v = _mm_loadu_si128((__m128i*)(s2p + x));
            // out_rb = ((s1v & rbmask) * mask1 + (s2v & rbmask) * mask2) >> 8
            __m128i s1v_rb = _mm_mullo_epi16(mask1, _mm_and_si128(s1v, mask_00ff00ff));
            __m128i s2v_rb = _mm_mullo_epi16(mask2, _mm_and_si128(s2v, mask_00ff00ff));
            __m128i out_rb = _mm_srli_epi16(_mm_add_epi16(s1v_rb, s2v_rb), 8);
            // out_g = (((s1v & gmask) >> 8) * mask1 + ((s2v & gmask) >> 8) * mask2) & gmask
            __m128i s1v_g = _mm_mullo_epi16(mask1, extractG(s1v));
            __m128i s2v_g = _mm_mullo_epi16(mask2, extractG(s2v));
            __m128i out_g = _mm_andnot_si128(mask_00ff00ff, _mm_add_epi16(s1v_g, s2v_g));
            // Aligned store is safe thanks to the prologue above.
            _mm_store_si128((__m128i*)(dstp + x), _mm_or_si128(out_rb, out_g));
        }
        // Scalar epilogue for the remaining <4 pixels.
        for (; x < end_x; x += 4) {
            *(Uint32*)(dstp + x) = blendMaskOnePixel(*(Uint32*)(s1p + x), *(Uint32*)(s2p + x), 0, mask_value);
        }
    }
}
229
230 #endif
231