/* -*- C++ -*-
 *
 *  graphics_x86_common.h - shared code between SSE graphics backends
 *
 *  Copyright (c) 2021 TellowKrinkle
 *
 *  tellowkrinkle@gmail.com
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>
 *  or write to the Free Software Foundation, Inc.,
 *  59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifdef USE_X86_GFX

#include <SDL.h>
#include <immintrin.h>

#include "graphics_common.h"

/// 0x0000gg?? -> 0x00gg00gg
static HELPER_FN __m128i extractFromGTo16L(__m128i v) {
#ifdef __SSSE3__
    __m128i mask = _mm_setr_epi8(1, 0x80, 1, 0x80, 5, 0x80, 5, 0x80, 9, 0x80, 9, 0x80, 13, 0x80, 13, 0x80);
    return _mm_shuffle_epi8(v, mask);
#else
    __m128i shifted = _mm_srli_epi32(v, 8); // 0x000000gg
    return _mm_or_si128(shifted, _mm_slli_epi32(shifted, 16));
#endif
}

/// 0x????gg?? -> 0x000000gg
static HELPER_FN __m128i extractG(__m128i v) {
#ifdef __SSSE3__
    __m128i mask = _mm_setr_epi8(1, 0x80, 0x80, 0x80, 5, 0x80, 0x80, 0x80, 9, 0x80, 0x80, 0x80, 13, 0x80, 0x80, 0x80);
    return _mm_shuffle_epi8(v, mask);
#else
    __m128i shifted = _mm_srli_epi32(v, 8);
    return _mm_and_si128(shifted, _mm_set1_epi32(0xFF));
#endif
}

/// 0x000000bb -> 0x00bb00bb
static HELPER_FN __m128i extractBTo16L(__m128i v) {
#ifdef __SSSE3__
    __m128i mask = _mm_setr_epi8(0, 0x80, 0, 0x80, 4, 0x80, 4, 0x80, 8, 0x80, 8, 0x80, 12, 0x80, 12, 0x80);
    return _mm_shuffle_epi8(v, mask);
#else
    return _mm_or_si128(_mm_slli_epi32(v, 16), v);
#endif
}

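// Illustrative lane layouts for the three helpers above (a documentation
// sketch only; the example values are hypothetical):
//
//   __m128i v = _mm_set1_epi32(0x00001234);
//   extractG(v)           // each 32-bit lane -> 0x00000012
//   extractFromGTo16L(v)  // each 32-bit lane -> 0x00120012
//   extractBTo16L(_mm_set1_epi32(0x00000034))
//                         // each 32-bit lane -> 0x00340034
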
static HELPER_FN void imageFilterBlend_SSE_Common(Uint32 *dst_buffer, Uint32 *src_buffer, Uint8 *alphap, int alpha, int length) {
    int n = length;

    // Compute the first few values individually so we're on a 16-byte boundary in dst_buffer
    while (!is_aligned(dst_buffer, 16) && (n > 0)) {
        BLEND_PIXEL();
        --n; ++dst_buffer; ++src_buffer;
    }

    // Do the bulk of the processing using SSE2 (four 32-bit BGRA pixels at a time)
    // create the basic bitmask 0x00FF00FF
    __m128i bmask2 = _mm_set1_epi32(0x00FF00FF);
    while (n >= 4) {
        // alpha1 = ((src_argb >> 24) * alpha) >> 8
        __m128i a = _mm_set1_epi32(alpha);
        __m128i buf = _mm_loadu_si128((__m128i*)src_buffer);
        __m128i tmp = _mm_srli_epi32(buf, 24);
        a = _mm_mullo_epi16(a, tmp);
        // double-up alpha1 (0x0000vvxx -> 0x00vv00vv)
        a = extractFromGTo16L(a);
        // rb = (src_argb & bmask2) * alpha1
        tmp = _mm_and_si128(buf, bmask2);
        __m128i rb = _mm_mullo_epi16(a, tmp);
        // g = ((src_argb >> 8) & bmask) * alpha1
        tmp = extractG(buf);
        __m128i g = _mm_mullo_epi16(a, tmp);
        // alpha2 = alpha1 ^ bmask2
        a = _mm_xor_si128(a, bmask2);
        buf = _mm_load_si128((__m128i*)dst_buffer);
        // rb += (dst_argb & bmask2) * alpha2
        tmp = _mm_and_si128(buf, bmask2);
        tmp = _mm_mullo_epi16(a, tmp);
        rb = _mm_add_epi16(rb, tmp);
        // rb = 0xbbxxrrxx -> 0x00bb00rr
        rb = _mm_srli_epi16(rb, 8);
        // g += ((dst_argb >> 8) & bmask) * alpha2
        tmp = extractG(buf);
        tmp = _mm_mullo_epi16(a, tmp);
        g = _mm_add_epi32(g, tmp);
        // g = g & ~bmask2
        g = _mm_andnot_si128(bmask2, g);
        // dst_argb = rb | g
        tmp = _mm_or_si128(rb, g);
        _mm_store_si128((__m128i*)dst_buffer, tmp);

        n -= 4; src_buffer += 4; dst_buffer += 4; alphap += 16;
    }

    // If any pixels are left over, deal with them individually
    ++n;
    BASIC_BLEND();
}

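// Scalar reference for the vector loop above: a documentation sketch of the
// same per-pixel math, not used by any backend (the real one-pixel path is
// the BLEND_PIXEL()/BASIC_BLEND() macros from graphics_common.h, which also
// step alphap). The name blendOnePixelRef is hypothetical.
static inline Uint32 blendOnePixelRef(Uint32 src, Uint32 dst, Uint32 alpha) {
    Uint32 a1 = ((src >> 24) * alpha) >> 8;  // effective alpha, 0..255
    Uint32 a2 = a1 ^ 0xFF;                   // 255 - a1
    // red+blue and green are blended in parallel fields of one 32-bit word,
    // mirroring the 16-bit SSE lanes above
    Uint32 rb = (((src & 0x00FF00FF) * a1 + (dst & 0x00FF00FF) * a2) >> 8) & 0x00FF00FF;
    Uint32 g  = ((((src >> 8) & 0xFF) * a1 + ((dst >> 8) & 0xFF) * a2)) & 0x0000FF00;
    return rb | g;
}
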
static HELPER_FN bool alphaMaskBlend_SSE_Common(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, SDL_Surface *mask_surface, const SDL_Rect& rect, Uint32 mask_value)
{
    if (mask_surface->w < 4) {
        return false;
    }

    int end_x = rect.x + rect.w;
    int end_y = rect.y + rect.h;
    int mask_height = mask_surface->h;
    int mask_width = mask_surface->w;

    int mask_off_base_y = rect.y % mask_height;
    int mask_off_base_x = rect.x % mask_width;
    for (int y = rect.y, my = mask_off_base_y; y < end_y; y++, my++) {
        if (my >= mask_height) { my = 0; }
        Uint32* s1p = getPointerToRow<Uint32>(s1, y);
        Uint32* s2p = getPointerToRow<Uint32>(s2, y);
        Uint32* dstp = getPointerToRow<Uint32>(dst, y);
        Uint32* mask_buf = getPointerToRow<Uint32>(mask_surface, my);

        // Handle pixels individually until dstp + x is 16-byte aligned
        int x = rect.x, mx = mask_off_base_x;
        while (!is_aligned(dstp + x, 16) && (x < end_x)) {
            dstp[x] = blendMaskOnePixel(s1p[x], s2p[x], mask_buf[mx], mask_value);
            x++, mx++;
            if (mx >= mask_width) { mx = 0; }
        }
        __m128i mask_value_v = _mm_set1_epi32(mask_value);
        __m128i mask_000000ff = _mm_set1_epi32(0x000000FF);
        __m128i mask_00ff00ff = _mm_set1_epi32(0x00FF00FF);
        while (x < (end_x - 3)) {
            __m128i s1v = _mm_loadu_si128((__m128i*)(s1p + x));
            __m128i s2v = _mm_loadu_si128((__m128i*)(s2p + x));
            __m128i mskv;
            if (__builtin_expect(mx + 3 < mask_width, true)) {
                mskv = _mm_loadu_si128((__m128i*)(mask_buf + mx));
            } else {
                // The mask row wraps inside this vector; gather the four
                // texels with wraparound into an aligned temporary
                __attribute__((aligned(16))) Uint32 tmp[4];
                for (int i = 0; i < 4; i++) {
                    if (mx + i < mask_width) {
                        tmp[i] = mask_buf[mx + i];
                    } else {
                        tmp[i] = mask_buf[mx + i - mask_width];
                    }
                }
                mskv = _mm_load_si128((__m128i*)tmp);
            }
            mskv = _mm_and_si128(mskv, mask_000000ff);
            __m128i mask2 = _mm_subs_epu16(mask_value_v, mskv);
            mask2 = _mm_min_epi16(mask2, mask_000000ff); // min(mask2, 0xFF)
#ifdef __clang__
            asm("":"+x"(mask2)::); // empty asm as an optimization barrier; clang's transformation of this sequence makes things worse, block it
#endif
            mask2 = extractBTo16L(mask2); // Spread alpha for multiplying (0x00aa00aa)
            __m128i mask1 = _mm_xor_si128(mask2, mask_00ff00ff);
            // out_rb = ((s1v & rbmask) * mask1 + (s2v & rbmask) * mask2) >> 8
            __m128i s1v_rb = _mm_mullo_epi16(mask1, _mm_and_si128(s1v, mask_00ff00ff));
            __m128i s2v_rb = _mm_mullo_epi16(mask2, _mm_and_si128(s2v, mask_00ff00ff));
            __m128i out_rb = _mm_srli_epi16(_mm_add_epi16(s1v_rb, s2v_rb), 8);
            // out_g = (((s1v & gmask) >> 8) * mask1 + ((s2v & gmask) >> 8) * mask2) & gmask
            __m128i s1v_g = _mm_mullo_epi16(mask1, extractG(s1v));
            __m128i s2v_g = _mm_mullo_epi16(mask2, extractG(s2v));
            __m128i out_g = _mm_andnot_si128(mask_00ff00ff, _mm_add_epi16(s1v_g, s2v_g));
            _mm_store_si128((__m128i*)(dstp + x), _mm_or_si128(out_rb, out_g));

            x += 4;
            mx += 4;
            if (mx >= mask_width) { mx -= mask_width; }
        }
        // Handle any remaining pixels individually
        while (x < end_x) {
            dstp[x] = blendMaskOnePixel(s1p[x], s2p[x], mask_buf[mx], mask_value);
            x++, mx++;
            if (mx >= mask_width) { mx = 0; }
        }
    }
    return true;
}
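
// Per-pixel math of the vector loop above, for reference (a sketch; the
// scalar edges use blendMaskOnePixel() from graphics_common.h):
//   m2  = min(saturating_sub(mask_value, mask_pixel & 0xFF), 255) // s2 weight
//   m1  = 255 - m2                                                // s1 weight
//   out = (s1 * m1 + s2 * m2) >> 8, applied per 8-bit channel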

static HELPER_FN void alphaMaskBlendConst_SSE_Common(SDL_Surface* dst, SDL_Surface *s1, SDL_Surface *s2, const SDL_Rect& rect, Uint32 mask_value)
{
    // x advances in bytes here; each pixel is 4 bytes wide
    int end_x = (rect.x + rect.w) * 4;
    int end_y = rect.y + rect.h;
    for (int y = rect.y; y < end_y; y++) {
        char* s1p = getPointerToRow<char>(s1, y);
        char* s2p = getPointerToRow<char>(s2, y);
        char* dstp = getPointerToRow<char>(dst, y);

        // Handle pixels individually until dstp + x is 16-byte aligned
        int x = rect.x * 4;
        for (; !is_aligned(dstp + x, 16) && (x < end_x); x += 4) {
            *(Uint32*)(dstp + x) = blendMaskOnePixel(*(Uint32*)(s1p + x), *(Uint32*)(s2p + x), 0, mask_value);
        }
        __m128i mask_00ff00ff = _mm_set1_epi32(0x00FF00FF);
        // The mask is constant, so both blend weights can be computed once per row
        __m128i mask2 = _mm_set1_epi16(mask_value);
        __m128i mask1 = _mm_xor_si128(mask2, mask_00ff00ff);
        for (; x < (end_x - 15); x += 16) {
            __m128i s1v = _mm_loadu_si128((__m128i*)(s1p + x));
            __m128i s2v = _mm_loadu_si128((__m128i*)(s2p + x));
            // out_rb = ((s1v & rbmask) * mask1 + (s2v & rbmask) * mask2) >> 8
            __m128i s1v_rb = _mm_mullo_epi16(mask1, _mm_and_si128(s1v, mask_00ff00ff));
            __m128i s2v_rb = _mm_mullo_epi16(mask2, _mm_and_si128(s2v, mask_00ff00ff));
            __m128i out_rb = _mm_srli_epi16(_mm_add_epi16(s1v_rb, s2v_rb), 8);
            // out_g = (((s1v & gmask) >> 8) * mask1 + ((s2v & gmask) >> 8) * mask2) & gmask
            __m128i s1v_g = _mm_mullo_epi16(mask1, extractG(s1v));
            __m128i s2v_g = _mm_mullo_epi16(mask2, extractG(s2v));
            __m128i out_g = _mm_andnot_si128(mask_00ff00ff, _mm_add_epi16(s1v_g, s2v_g));
            _mm_store_si128((__m128i*)(dstp + x), _mm_or_si128(out_rb, out_g));
        }
        // Handle any remaining pixels individually
        for (; x < end_x; x += 4) {
            *(Uint32*)(dstp + x) = blendMaskOnePixel(*(Uint32*)(s1p + x), *(Uint32*)(s2p + x), 0, mask_value);
        }
    }
}
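
// The constant-mask variant can hoist both blend weights out of the pixel
// loop because mask_value is uniform; only loads, multiplies, and the
// recombination remain per iteration.
//
// Hypothetical call from an SSE backend (a sketch; the real call sites live
// in the per-ISA backend files that include this header):
//   SDL_Rect r = {0, 0, dst->w, dst->h};
//   alphaMaskBlendConst_SSE_Common(dst, s1, s2, r, 0x80);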

#endif