1 /*=============================================================================
2                                    pamflip_sse.c
3 ===============================================================================
4   This is part of the Pamflip program.  It contains code that exploits
5   the SSE facility of some CPUs.
6 
7   This code was originally written by Akira Urushibata ("Douso") in 2010 and is
8   contributed to the public domain by all authors.
9 
10   The author makes the following request (which is not a reservation of legal
11   rights): Please study the code and make adjustments to meet specific needs.
12   This part is critical to performance.  I have seen code copied from
13   elsewhere poorly implemented.  A lot of work goes into the development of
14   free software.  It is sad to see derivative work which fails to reach its
15   potential.  Please put a comment in the code so people will know where it
16   came from.
17 
18 =============================================================================*/
19 
20 #include <assert.h>
21 
22 #include "pm_config.h"
23 #include "pm_c_util.h"
24 #include "mallocvar.h"
25 #include "pam.h"
26 
27 #include "config.h"  /* Defines SSE_PBM_XY_FLIP */
28 #include "flip.h"
29 
30 #include "pamflip_sse.h"
31 
32 /* Note that WANT_SSE implies the user expects SSE to be available
33    (i.e. <emmintrin.h> exists).
34 */
35 
36 #if SSE_PBM_XY_FLIP
37 
38 /*----------------------------------------------------------------------------
39    This is a specialized routine for row-for-column PBM transformations.
40    (-cw, -ccw, -xy).  It requires GCC (>= v. 4.2.0) and SSE2.
41 
42    In each cycle, we read sixteen rows from the input.  We process this band
43    left to right in blocks 8 pixels wide.  We use the SSE2 instruction
44    pmovmskb128, which reports the MSB of each byte in a 16 byte array, for
45    fast processing.  We place the 8x16 block into a 16 byte array, and
46    pmovmskb128 reports the 16 pixels on the left edge in one instruction
47    execution.  pslldi128 shifts the array contents leftward.
48 
49    The following routines can write both in both directions (left and right)
50    into the output rows.  They do this by controlling the vertical stacking
51    order when they make the 8x16 blocks.
52 
53    We do all transposition in 8x16 block units, adding padding to
54    the 8 row input buffer and the output plane raster as necessary.
55    doPartialBlockTop() or doPartialBlockBottom() handles the partial
56    input band.  This part can come from either the top or bottom of the
57    vertical input column, but always goes to the right end of the output
58    rows.
59 
60    As an enhancement, we clear the output raster to zero (=white) in the
61    beginning and flip only the 8x16 blocks that contain non-zero bits (=any
62    amount of black pixels).  When we add padding to the edges, we initialize
63    it all to zero to prevent unnecessary transpositions.  Because most
64    real-world documents are largely white, this saves much execution time.  If
65    you are porting this code to an environment in which non-zero bits are the
66    majority, for example, BMP where zero means black, you should seriously
67    consider modifying this.
68 
69    All instructions unique to GCC/SSE are in transpose16Bitrows().
70    It is possible to write a non-SSE version by providing a generic
71    version of transpose16Bitrows() or one tuned for a specific
72    architecture.  Use 8x8 blocks to avoid endian issues.
73 
74    Further enhancement should be possible by employing wider bands,
75    larger blocks as wider SIMD registers become available.  Clearing
76    the white parts after instead of before transposition is also a
77    possibility.
78 -----------------------------------------------------------------------------*/
79 
80 #include <emmintrin.h>
81 
82 typedef char v16qi __attribute__ ((vector_size (16)));
83 typedef int  v4di  __attribute__ ((vector_size (16)));
84 
85 /* Beware when making modifications to code which involve SSE.
86    This is a sensitive part of GCC.  Different compiler versions
87    respond differently to trivial matters such as the difference
88    between above v16qi, v4di and a union defined for handling both.
89    What can be placed into a register is another issue.  Some
90    compilers issue warnings, others abort with error.
91 
92    A char array cannot be loaded into v16qi by casting.  A vector
93    variable must be vector from the beginning.
94 
95    Changes for your local system are okay, but if you intend to
96    publish them, please specify the compiler version you used.
97 
98    This code has been tested on gcc versions 4.2.0, 4.2.4, 4.3.2,
99    4.4.3, 4.4.4, 4.5.0, 4.5.2, 4.6.0 and 4.6.1 clang versions
100    3.0, 3.2, 3.3.
101 
102    We use SSE instructions in "_mm_" form in favor of "__builtin_".
103    In GCC the "__builtin_" form is documented but "_mm_" is not.
104    Former versions of this source file used "__builtin_".  This was
105    changed to make possible compilation with clang.
106 
107    _mm_slli_epi32 : __builtin_ia32_pslldi128
108    _mm_cmpeq_epi8 : __builtin_ia32_pcmpeqb128
109    _mm_movemask_epi8 : __builtin_ia32_pmovmskb128
110 
111    The conversion requires <emmintrin.h> .
112 
113 */
114 
115 
116 
117 static void
transpose16Bitrows(unsigned int const cols,unsigned int const rows,const bit * const block[16],uint16_t ** const outplane,unsigned int const outcol16)118 transpose16Bitrows(unsigned int const cols,
119                    unsigned int const rows,
120                    const bit *  const block[16],
121                    uint16_t **  const outplane,
122                    unsigned int const outcol16) {
123 /*--------------------------------------------------------------------------
124   Convert input rows to output columns.  Works in units of 8x16.
125 
126   Uses pre-calculated pointers ( block[i][col8] ) instead of
127   (xdir > 0) ? (i & 0x08) + 7 - (i & 0x07) : (24 - rows % 16 +i) % 16
128   for efficiency.
129 
130   We load the block directly into a register.  (using a union like:
131 
132        union V16 {
133           v16qi v;
134           unsigned char i[16];
135        };
136   )
137 
138   gcc (v. 4.2, 4.4) sees the suffix [x] of v16.i[x] and apparently decides
139   that the variable has to be addressable and therefore needs to be placed
140   into memory.)
141 ---------------------------------------------------------------------------*/
142     unsigned int col;
143     register v16qi zero128;   /* 16 bytes of zero, in a SSE register */
144 
145     zero128 = zero128 ^ zero128;
146 
147     for (col = 0; col < cols; col += 8) {
148         unsigned int const col8 = col / 8;
149 
150         register v16qi vReg = {
151             block[0][col8],  block[1][col8],
152             block[2][col8],  block[3][col8],
153             block[4][col8],  block[5][col8],
154             block[6][col8],  block[7][col8],
155             block[8][col8],  block[9][col8],
156             block[10][col8], block[11][col8],
157             block[12][col8], block[13][col8],
158             block[14][col8], block[15][col8] };
159 
160         register __m128i const compare =
161             _mm_cmpeq_epi8((__m128i)vReg, (__m128i)zero128);
162 
163         if (_mm_movemask_epi8(compare) != 0xffff) {
164 
165             /* There is some black content in this block; write to outplane */
166 
167             unsigned int outrow;
168             unsigned int i;
169 
170             outrow = col;  /* initial value */
171 
172             for (i = 0; i < 7; ++i) {
173                 /* GCC (>=4.2) automatically unrolls this loop */
174                 outplane[outrow++][outcol16] =
175                     _mm_movemask_epi8((__m128i)vReg);
176                 vReg = (v16qi)_mm_slli_epi32((__m128i)vReg, 1);
177             }
178             outplane[outrow][outcol16] = _mm_movemask_epi8((__m128i)vReg);
179         } else {
180             /* The block is completely white; skip. */
181         }
182     }
183 }
184 
185 
186 
187 static void
analyzeBlock(const struct pam * const inpamP,bit ** const inrow,int const xdir,const bit ** const block,const bit ** const blockPartial,unsigned int * const topOfFullBlockP,unsigned int * const outcol16P)188 analyzeBlock(const struct pam * const inpamP,
189              bit **             const inrow,
190              int                const xdir,
191              const bit **       const block,
192              const bit **       const blockPartial,
193              unsigned int *     const topOfFullBlockP,
194              unsigned int *     const outcol16P) {
195 /*--------------------------------------------------------------------------
196   Set up block[] pointer array.  Provide for both directions and the two
197   "twists" brought about by Intel byte ordering which occur when:
198     (1) 16 bytes are loaded to a SSE register
199     (2) 16 bits are written to memory.
200 
201   If 'rows' is not a multiple of 8, a partial input band appears at one edge.
202   Set *topOfFullBlockP accordingly.  blockPartial[] is an adjusted "block" for
203   this partial band, brought up to a size of 8 rows.  The extra pointers point
204   to a single row which doPartialBlockTop() and doPartialBlockBottom() clear
205   to white.
206 ---------------------------------------------------------------------------*/
207     if (xdir > 0){
208         /* Write output columns left to right */
209         unsigned int i;
210         for (i = 0; i < 16; ++i){
211             unsigned int const index = (i & 0x8) + 7 - (i & 0x7);
212             /* Absorb little-endian "twists" */
213             block[i] = inrow[index];
214             blockPartial[i] = index < inpamP->height%16 ? block[i] : inrow[15];
215         }
216         *topOfFullBlockP = 0;
217         *outcol16P = 0;
218     } else {
219         /* Write output columns right to left */
220         unsigned int i;
221         for (i = 0; i < 16; ++i){
222             unsigned int const index = ((i & 0x8) ^ 0x8) + (i & 0x7);
223             /* Absorb little-endian "twists" */
224             block[i]= inrow[index];
225             blockPartial[i] = index < (16-inpamP->height%16)
226                 ? inrow[0]
227                 : block[i];
228         }
229         *topOfFullBlockP = inpamP->height % 16;
230 
231         if (inpamP->height >= 16) {
232             *outcol16P = inpamP->height/16 - 1;
233         } else
234             *outcol16P = 0;
235     }
236 }
237 
238 
239 
240 static void
doPartialBlockTop(const struct pam * const inpamP,bit ** const inrow,const bit * const blockPartial[16],unsigned int const topOfFullBlock,uint16_t ** const outplane)241 doPartialBlockTop(const struct pam * const inpamP,
242                   bit **             const inrow,
243                   const bit *        const blockPartial[16],
244                   unsigned int       const topOfFullBlock,
245                   uint16_t **        const outplane) {
246 
247     if (topOfFullBlock > 0) {
248         unsigned int colChar, row;
249         unsigned int pad = 16 - topOfFullBlock;
250 
251         for (colChar=0; colChar < pbm_packed_bytes(inpamP->width); ++colChar)
252             inrow[0][colChar] = 0x00;
253 
254         for (row = 0; row < topOfFullBlock; ++row){
255             pbm_readpbmrow_packed(inpamP->file, inrow[row+pad],
256                                   inpamP->width, inpamP->format);
257             if (inpamP->width % 8 > 0){
258                 /* Clear partial byte at end of input row */
259                 int const lastByte = pbm_packed_bytes(inpamP->width) -1;
260 
261                 inrow[row+pad][lastByte] >>= (8 - inpamP->width % 8);
262                 inrow[row+pad][lastByte] <<= (8 - inpamP->width % 8);
263             }
264         }
265 
266         transpose16Bitrows(inpamP->width, inpamP->height, blockPartial,
267                            outplane, inpamP->height /16);
268             /* Transpose partial rows on top of input.  Place on right edge of
269                output.
270             */
271     }
272 }
273 
274 
275 
276 static void
doFullBlocks(const struct pam * const inpamP,bit ** const inrow,int const xdir,const bit * const block[16],unsigned int const topOfFullBlock,unsigned int const initOutcol16,uint16_t ** const outplane)277 doFullBlocks(const struct pam * const inpamP,
278              bit **             const inrow,
279              int                const xdir,
280              const bit *        const block[16],
281              unsigned int       const topOfFullBlock,
282              unsigned int       const initOutcol16,
283              uint16_t **        const outplane) {
284 
285     unsigned int row;
286     unsigned int outcol16;
287     unsigned int modrow;
288     /* Number of current row within buffer */
289 
290     for (row = topOfFullBlock, outcol16 = initOutcol16, modrow = 0;
291          row < inpamP->height;
292          ++row) {
293 
294         pbm_readpbmrow_packed(inpamP->file, inrow[modrow],
295                               inpamP->width, inpamP->format);
296         if (inpamP->width % 8 > 0) {
297             /* Clear partial byte at end of input row */
298             int const lastByte = pbm_packed_bytes(inpamP->width) - 1;
299             inrow[modrow][lastByte] >>= (8 - inpamP->width % 8);
300             inrow[modrow][lastByte] <<= (8 - inpamP->width % 8);
301         }
302 
303         ++modrow;
304         if (modrow == 16) {
305             /* 16 row buffer is full.  Transpose. */
306             modrow = 0;
307 
308             transpose16Bitrows(inpamP->width, inpamP->height,
309                                block, outplane, outcol16);
310             outcol16 += xdir;
311         }
312     }
313 }
314 
315 
316 
317 static void
doPartialBlockBottom(const struct pam * const inpamP,bit ** const inrow,int const xdir,const bit * const blockPartial[16],uint16_t ** const outplane)318 doPartialBlockBottom(const struct pam * const inpamP,
319                      bit **             const inrow,
320                      int                const xdir,
321                      const bit *        const blockPartial[16],
322                      uint16_t **        const outplane) {
323 
324     if (xdir > 0 && inpamP->height % 16 > 0) {
325         unsigned int colChar;
326 
327         for (colChar=0; colChar < pbm_packed_bytes(inpamP->width); ++colChar)
328             inrow[15][colChar] = 0x00;
329 
330         transpose16Bitrows(inpamP->width, inpamP->height, blockPartial,
331                            outplane, inpamP->height/16);
332             /* Transpose partial rows on bottom of input.  Place on right edge
333                of output.
334             */
335     }
336 }
337 
338 
339 
340 static void
writeOut(const struct pam * const outpamP,uint16_t ** const outplane,int const ydir)341 writeOut(const struct pam * const outpamP,
342          uint16_t **        const outplane,
343          int                const ydir) {
344 
345     unsigned int row;
346 
347     for (row = 0; row < outpamP->height; ++row) {
348         unsigned int const outrow = (ydir > 0) ?
349             row :
350             outpamP->height - row - 1;  /* reverse order */
351 
352         pbm_writepbmrow_packed(stdout, (bit *)outplane[outrow],
353                                outpamP->width, 0);
354     }
355 }
356 
357 
358 static void
clearOutplane(const struct pam * const outpamP,uint16_t ** const outplane)359 clearOutplane(const struct pam * const outpamP,
360               uint16_t **        const outplane) {
361 
362     unsigned int row;
363 
364     for (row = 0; row < outpamP->height; ++row) {
365         unsigned int col16;  /* column divided by 16 */
366         for (col16 = 0; col16 < (outpamP->width + 15)/16; ++col16)
367             outplane[row][col16] = 0x0000;
368     }
369 }
370 
371 
372 
373 void
pamflip_transformRowsToColumnsPbmSse(const struct pam * const inpamP,const struct pam * const outpamP,struct XformCore const xformCore)374 pamflip_transformRowsToColumnsPbmSse(const struct pam * const inpamP,
375                                      const struct pam * const outpamP,
376                                      struct XformCore   const xformCore) {
377 /*----------------------------------------------------------------------------
378   This is a specialized routine for row-for-column PBM transformations.
379   (-cw, -ccw, -xy).
380 -----------------------------------------------------------------------------*/
381     int const xdir = xformCore.c;
382         /* Input top  => output left (+1)/ right (-1)  */
383     int const ydir = xformCore.b;
384         /* Input left => output top  (+1)/ bottom (-1) */
385     int const blocksPerRow = ((unsigned int) outpamP->width + 15) /16;
386 
387     bit ** inrow;
388     uint16_t ** outplane;
389     const bit * block[16];
390     const bit * blockPartial[16];
391     unsigned int topOfFullBlock;
392     unsigned int outcol16;
393 
394     inrow = pbm_allocarray_packed( inpamP->width, 16);
395     MALLOCARRAY2(outplane, outpamP->height + 7, blocksPerRow);
396     if (outplane == NULL)
397         pm_error("Could not allocate %u x %u array of 16 bit units",
398                  blocksPerRow, outpamP->height + 7);
399 
400     /* We write to the output array in 16 bit units.  Add margin. */
401 
402     clearOutplane(outpamP, outplane);
403 
404     analyzeBlock(inpamP, inrow, xdir, block, blockPartial,
405                  &topOfFullBlock, &outcol16);
406 
407     doPartialBlockTop(inpamP, inrow, blockPartial, topOfFullBlock, outplane);
408 
409     doFullBlocks(inpamP, inrow, xdir, block,
410                  topOfFullBlock, outcol16, outplane);
411 
412     doPartialBlockBottom(inpamP, inrow, xdir, blockPartial, outplane);
413 
414     writeOut(outpamP, outplane, ydir);
415 
416     pbm_freearray(outplane, outpamP->height + 7);
417     pbm_freearray(inrow, 16);
418 }
419 #else  /* WANT_SSE */
420 
421 void
pamflip_transformRowsToColumnsPbmSse(const struct pam * const inpamP,const struct pam * const outpamP,struct XformCore const xformCore)422 pamflip_transformRowsToColumnsPbmSse(const struct pam * const inpamP,
423                                      const struct pam * const outpamP,
424                                      struct XformCore   const xformCore) {
425 
426     /* Nobody is supposed to call this */
427     assert(false);
428 }
429 #endif
430