/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

6 #include "gfxAlphaRecovery.h"
7 #include "gfxImageSurface.h"
8 #include <emmintrin.h>
9 
10 // This file should only be compiled on x86 and x64 systems.  Additionally,
11 // you'll need to compile it with -msse2 if you're using GCC on x86.
12 
// Per-lane channel masks used by the SSE2 loops below.  greenMaski
// selects the green byte of each BGRA pixel; alphaMaski selects the
// alpha byte.  Both must be 16-byte aligned because they are loaded
// with _mm_load_si128 (an aligned load); the alignment syntax is
// compiler-specific, hence the three branches.
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif

29 bool
RecoverAlphaSSE2(gfxImageSurface * blackSurf,const gfxImageSurface * whiteSurf)30 gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
31                                    const gfxImageSurface* whiteSurf)
32 {
33     mozilla::gfx::IntSize size = blackSurf->GetSize();
34 
35     if (size != whiteSurf->GetSize() ||
36         (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
37          blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
38         (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
39          whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
40         return false;
41 
42     blackSurf->Flush();
43     whiteSurf->Flush();
44 
45     unsigned char* blackData = blackSurf->Data();
46     unsigned char* whiteData = whiteSurf->Data();
47 
48     if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
49         (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
50         // Cannot keep these in alignment.
51         return false;
52     }
53 
54     __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
55     __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
56 
57     for (int32_t i = 0; i < size.height; ++i) {
58         int32_t j = 0;
59         // Loop single pixels until at 4 byte alignment.
60         while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
61             *((uint32_t*)blackData) =
62                 RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
63                              *reinterpret_cast<uint32_t*>(whiteData));
64             blackData += 4;
65             whiteData += 4;
66             j++;
67         }
68         // This extra loop allows the compiler to do some more clever registry
69         // management and makes it about 5% faster than with only the 4 pixel
70         // at a time loop.
71         for (; j < size.width - 8; j += 8) {
72             __m128i black1 = _mm_load_si128((__m128i*)blackData);
73             __m128i white1 = _mm_load_si128((__m128i*)whiteData);
74             __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
75             __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
76 
77             // Execute the same instructions as described in RecoverPixel, only
78             // using an SSE2 packed saturated subtract.
79             white1 = _mm_subs_epu8(white1, black1);
80             white2 = _mm_subs_epu8(white2, black2);
81             white1 = _mm_subs_epu8(greenMask, white1);
82             white2 = _mm_subs_epu8(greenMask, white2);
83             // Producing the final black pixel in an XMM register and storing
84             // that is actually faster than doing a masked store since that
85             // does an unaligned storage. We have the black pixel in a register
86             // anyway.
87             black1 = _mm_andnot_si128(alphaMask, black1);
88             black2 = _mm_andnot_si128(alphaMask, black2);
89             white1 = _mm_slli_si128(white1, 2);
90             white2 = _mm_slli_si128(white2, 2);
91             white1 = _mm_and_si128(alphaMask, white1);
92             white2 = _mm_and_si128(alphaMask, white2);
93             black1 = _mm_or_si128(white1, black1);
94             black2 = _mm_or_si128(white2, black2);
95 
96             _mm_store_si128((__m128i*)blackData, black1);
97             _mm_store_si128((__m128i*)(blackData + 16), black2);
98             blackData += 32;
99             whiteData += 32;
100         }
101         for (; j < size.width - 4; j += 4) {
102             __m128i black = _mm_load_si128((__m128i*)blackData);
103             __m128i white = _mm_load_si128((__m128i*)whiteData);
104 
105             white = _mm_subs_epu8(white, black);
106             white = _mm_subs_epu8(greenMask, white);
107             black = _mm_andnot_si128(alphaMask, black);
108             white = _mm_slli_si128(white, 2);
109             white = _mm_and_si128(alphaMask, white);
110             black = _mm_or_si128(white, black);
111             _mm_store_si128((__m128i*)blackData, black);
112             blackData += 16;
113             whiteData += 16;
114         }
115         // Loop single pixels until we're done.
116         while (j < size.width) {
117             *((uint32_t*)blackData) =
118                 RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
119                              *reinterpret_cast<uint32_t*>(whiteData));
120             blackData += 4;
121             whiteData += 4;
122             j++;
123         }
124         blackData += blackSurf->Stride() - j * 4;
125         whiteData += whiteSurf->Stride() - j * 4;
126     }
127 
128     blackSurf->MarkDirty();
129 
130     return true;
131 }
132 
/**
 * Return the byte offset of position <aX, aY> from the previous
 * (1 << aAlignToLog2)-byte boundary, where rows are aStride bytes apart.
 * A result of 0 means the position is aligned.  With the default
 * aY=0/aStride=1 it simply gives aX modulo the alignment.
 */
static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
}

139 /*static*/ mozilla::gfx::IntRect
AlignRectForSubimageRecovery(const mozilla::gfx::IntRect & aRect,gfxImageSurface * aSurface)140 gfxAlphaRecovery::AlignRectForSubimageRecovery(const mozilla::gfx::IntRect& aRect,
141                                                gfxImageSurface* aSurface)
142 {
143     NS_ASSERTION(mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(),
144                  "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
145     static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
146     static const int32_t bpp = 4;
147     static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
148     //
149     // We're going to create a subimage of the surface with size
150     // <sw,sh> for alpha recovery, and want a SIMD fast-path.  The
151     // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
152     // properly aligned for SIMD.  So we want to find a rect <x',y',
153     // w',h'> that's a superset of what needs to be redrawn but is
154     // properly aligned.  Proper alignment is
155     //
156     //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
157     //   BPP * w'             \cong BPP * sw  (mod ALIGN)
158     //
159     // (We assume the pixel at surface <0,0> is already ALIGN'd.)
160     // That rect (obviously) has to fit within the surface bounds, and
161     // we should also minimize the extra pixels redrawn only for
162     // alignment's sake.  So we also want
163     //
164     //  minimize <x',y', w',h'>
165     //   0 <= x' <= x
166     //   0 <= y' <= y
167     //   w <= w' <= sw
168     //   h <= h' <= sh
169     //
170     // This is a messy integer non-linear programming problem, except
171     // ... we can assume that ALIGN/BPP is a very small constant.  So,
172     // brute force is viable.  The algorithm below will find a
173     // solution if one exists, but isn't guaranteed to find the
174     // minimum solution.  (For SSE2, ALIGN/BPP = 4, so it'll do at
175     // most 64 iterations below).  In what's likely the common case,
176     // an already-aligned rectangle, it only needs 1 iteration.
177     //
178     // Is this alignment worth doing?  Recovering alpha will take work
179     // proportional to w*h (assuming alpha recovery computation isn't
180     // memory bound).  This analysis can lead to O(w+h) extra work
181     // (with small constants).  In exchange, we expect to shave off a
182     // ALIGN/BPP constant by using SIMD-ized alpha recovery.  So as
183     // w*h diverges from w+h, the win factor approaches ALIGN/BPP.  We
184     // only really care about the w*h >> w+h case anyway; others
185     // should be fast enough even with the overhead.  (Unless the cost
186     // of repainting the expanded rect is high, but in that case
187     // SIMD-ized alpha recovery won't make a difference so this code
188     // shouldn't be called.)
189     //
190     mozilla::gfx::IntSize surfaceSize = aSurface->GetSize();
191     const int32_t stride = bpp * surfaceSize.width;
192     if (stride != aSurface->Stride()) {
193         NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
194         return aRect;
195     }
196 
197     const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
198     const int32_t r = x + w;
199     const int32_t sw = surfaceSize.width;
200     const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);
201 
202     // The outer two loops below keep the rightmost (|r| above) and
203     // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
204     // return only a superset of the original rect.  These loops
205     // search for an aligned top-left pixel by trying to expand <x,y>
206     // left and up by <dx,dy> pixels, respectively.
207     //
208     // Then if a properly-aligned top-left pixel is found, the
209     // innermost loop tries to find an aligned stride by moving the
210     // rightmost pixel rightward by dr.
211     int32_t dx, dy, dr;
212     for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
213         for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
214             if (0 != ByteAlignment(kByteAlignLog2,
215                                    bpp * (x - dx), y - dy, stride)) {
216                 continue;
217             }
218             for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
219                 if (strideAlign == ByteAlignment(kByteAlignLog2,
220                                                  bpp * (w + dr + dx))) {
221                     goto FOUND_SOLUTION;
222                 }
223             }
224         }
225     }
226 
227     // Didn't find a solution.
228     return aRect;
229 
230 FOUND_SOLUTION:
231     mozilla::gfx::IntRect solution = mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy);
232     MOZ_ASSERT(mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution),
233                "'Solution' extends outside surface bounds!");
234     return solution;
235 }
236