1 /* Copyright (C) 2001-2019 Artifex Software, Inc.
2    All Rights Reserved.
3 
4    This software is provided AS-IS with no warranty, either express or
5    implied.
6 
7    This software is distributed under license and may not be copied,
8    modified or distributed except as expressly authorized under the terms
9    of the license contained in the file LICENSE in this distribution.
10 
11    Refer to licensing information at http://www.artifex.com or contact
12    Artifex Software, Inc.,  1305 Grant Avenue - Suite 200, Novato,
13    CA 94945, U.S.A., +1(415)492-9861, for further information.
14 */
15 
16 
17 /* Testbed implementation of Even Better Screening. */
18 
19 /*
20  * Code in this module is covered by US Patents 5,055,942 and
21  * 5,917,614, and corresponding international patents.
22  */
23 
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <math.h>
28 #include "evenbetter-rll.h"
29 
30 /* Set this define if compiling with AltiVec optimizations. */
31 #define noUSE_AVEC
32 
33 /* Set this define if compiling with SSE optimizations. */
34 #define noUSE_SSE2
35 
36 #define EVENBETTER_VERSION 133
37 
38 #define EVEN_SHIFT 16
39 #define IMO_SHIFT 14
40 #define EVEN_RB_CAP (1 << (EVEN_SHIFT - 2))
41 
42 #define FANCY_COUPLING
43 
44 #if defined(USE_AVEC) || defined(USE_SSE2)
45 #define USE_VECTOR
46 #endif
47 
48 #ifdef USE_AVEC
49 #include "eb_avec.h"
50 
51 #endif
52 
53 #ifdef USE_SSE2
54 typedef struct _eb_ctx_sse2 eb_ctx_sse2;
55 typedef struct _eb_srcbuf eb_srcbuf;
56 
57 int eb_test_sse2(void);
58 int eb_sse2_core(eb_ctx_sse2 *ctx, unsigned char **out, eb_srcbuf *in,
59                  int offset);
60 int eb_sse2_rev_rs(eb_ctx_sse2 *ctx, int offset);
61 int eb_sse2_set_daz(void);
62 void eb_sse2_restore_daz(int save_mxcsr);
63 
64 struct _eb_ctx_sse2 {
65   int xs;
66   int *iir_line;
67   int *r_line;
68   int *a_line;
69   int *b_line;
70   char *skip_line;
71   int dummy[2];
72   float *luts[4];
73   float e[4];
74   float e_i_1[4];
75   int r[4];
76   int a[4];
77   int b[4];
78   int ones[4];
79   int twos[4];
80   int aspect2[4];
81   float ehi[4];
82   float elo[4];
83   float ohi[4];
84   float r_mul[4];
85   float kernel[4];
86   unsigned int seed1[4];
87   unsigned int seed2[4];
88 };
89 
90 struct _eb_srcbuf {
91   float im[64];
92   float rb[64];
93   float rs[64];
94   int dummy[3];
95 };
96 
97 #endif
98 
99 typedef struct _EBPlaneCtx EBPlaneCtx;
100 typedef unsigned int uint32;
101 
102 struct _EvenBetterCtx {
103   int source_width;
104   int dest_width;
105   int n_planes;
106   int levels; /* Number of levels on output, <= 256 */
107   EBPlaneCtx **plane_ctx;
108   int aspect;
109   int *strengths;
110   int even_elo;
111   int even_ehi;
112   int *c_line;
113 
114   int even_c1;
115   int do_shadows;
116 
117   uint32  seed1;
118   uint32  seed2;
119 
120   FILE *dump_file;
121   EbDumpLevel dump_level;
122 
123 #ifdef USE_SSE2
124   eb_ctx_sse2 **sse2_ctx;
125   int using_vectors;
126 #endif
127 #ifdef USE_AVEC
128   eb_ctx_avec **avec_ctx;
129   int using_vectors;
130 #endif
131 };
132 
133 struct _EBPlaneCtx {
134   int source_width;
135   int dest_width;
136   int *rb_line;
137   int *iir_line;
138   int *r_line;
139   int *a_line;
140   int *b_line;
141   int *r_line_sh;
142   int *a_line_sh;
143   int *b_line_sh;
144   int *lut;
145   int *rb_lut;
146   char *rs_lut;
147   int *white_count_line;
148 };
149 
150 void *
eb_malloc_aligned(int size,int align)151 eb_malloc_aligned(int size, int align)
152 {
153   void *result;
154   void *alloced = malloc(size + align);
155   int pad;
156 
157   if (alloced == 0)
158     return 0;
159   pad = (((int)(size_t)alloced + 12) & 15) + 4;
160   result = (void *)(pad + (char *)alloced);
161   ((int *)result)[-1] = pad;
162   return result;
163 }
164 
165 void
eb_free_aligned(void * p)166 eb_free_aligned(void *p)
167 {
168   int pad = ((int *)p)[-1];
169   free((char*)p - pad);
170 }
171 
172 static double
eb_compute_rbscale(const EvenBetterParams * params)173 eb_compute_rbscale(const EvenBetterParams *params)
174 {
175   double rbscale = params->rbscale;
176 
177   if (rbscale == 0.0)
178     {
179       rbscale = params->aspect == 1 ? 0.95 :
180         params->aspect == 2 ? 1.8 :
181         params->aspect == 4 ? 3.6 : 1;
182     }
183   return rbscale;
184 }
185 
186 static int
eb_compute_randshift(int nl,int rs_base,int do_shadows,int levels)187 eb_compute_randshift(int nl, int rs_base, int do_shadows, int levels)
188 {
189   int rs = rs_base;
190   if ((nl > (90 << (EVEN_SHIFT - 10)) &&
191        nl < (129 << (EVEN_SHIFT - 10))) ||
192       (nl > (162 << (EVEN_SHIFT - 10)) &&
193        nl < (180 << (EVEN_SHIFT - 10))))
194     rs--;
195   else if (nl > (321 << (EVEN_SHIFT - 10)) &&
196            nl < (361 << (EVEN_SHIFT - 10)))
197     {
198       rs--;
199       if (nl > (331 << (EVEN_SHIFT - 10)) &&
200           nl < (351 << (EVEN_SHIFT - 10)))
201         rs--;
202     }
203   else if ((do_shadows ||
204             nl == (levels - 1) << EVEN_SHIFT) &&
205            nl > ((levels - 1) << EVEN_SHIFT) -
206            (1 << (EVEN_SHIFT - 2)))
207     {
208       /* don't add randomness in extreme shadows */
209     }
210   else if ((nl > (3 << (EVEN_SHIFT - 2))))
211     {
212       nl -= (nl + (1 << (EVEN_SHIFT - 2))) & -(1 << (EVEN_SHIFT - 1));
213       if (nl < 0) nl = -nl;
214       if (nl < (1 << (EVEN_SHIFT - 4))) rs--;
215       if (nl < (1 << (EVEN_SHIFT - 5))) rs--;
216       if (nl < (1 << (EVEN_SHIFT - 6))) rs--;
217     }
218   else
219     {
220       if (nl < (3 << (EVEN_SHIFT - 3))) nl += 1 << (EVEN_SHIFT - 2);
221       nl = nl - (1 << (EVEN_SHIFT - 1));
222       if (nl < 0) nl = -nl;
223       if (nl < (1 << (EVEN_SHIFT - 4))) rs--;
224       if (nl < (1 << (EVEN_SHIFT - 5))) rs--;
225       if (nl < (1 << (EVEN_SHIFT - 6))) rs--;
226     }
227   return rs;
228 }
229 
230 #ifdef USE_SSE2
231 static eb_ctx_sse2 *
eb_ctx_sse2_new(const EvenBetterParams * params,int start_plane,int end_plane)232 eb_ctx_sse2_new(const EvenBetterParams *params, int start_plane, int end_plane)
233 {
234   int xs = params->source_width;
235   int aspect2 = params->aspect * params->aspect;
236   eb_ctx_sse2 *ctx;
237   int i;
238   double im_scale;
239   float r_mul = 1.0 / (params->aspect * (1 << (6 - params->even_c1_scale)));
240   double rbscale = eb_compute_rbscale(params);
241   int rs_base;
242 
243   ctx = (eb_ctx_sse2 *)eb_malloc_aligned(sizeof(eb_ctx_sse2), 16);
244   ctx->xs = xs;
245   for (i = 0; i < 4; i++)
246     {
247       ctx->e[i] = 0.0;
248       ctx->e_i_1[i] = 0.0;
249       ctx->r[i] = 0;
250       ctx->a[i] = 1;
251       ctx->b[i] = aspect2;
252       ctx->ones[i] = 1;
253       ctx->twos[i] = 2;
254       ctx->aspect2[i] = aspect2;
255       ctx->ohi[i] = params->levels - 1;
256       ctx->ehi[i] = 1.1;
257       ctx->elo[i] = -0.1;
258       ctx->r_mul[i] = r_mul;
259       ctx->seed1[i] = (i << 8) + 0x7000;
260       ctx->seed2[i] = (i << 16) + 0x9000;
261     }
262   ctx->kernel[0] = 1.0 / 16;
263   ctx->kernel[1] = 3.0 / 16;
264   ctx->kernel[2] = 5.0 / 16;
265   ctx->kernel[3] = 7.0 / 16;
266 
267   im_scale = (params->levels - 1) * 1.0 / (1 << 24);
268   rs_base = 35 - EVEN_SHIFT - params->rand_scale;
269 
270   for (i = start_plane; i < end_plane; i++)
271     {
272       float *lut = (float *)malloc((ET_SRC_MAX + 1) * sizeof(float) * 3);
273       int j;
274       ctx->luts[i - start_plane] = lut;
275 
276       for (j = 0; j < ET_SRC_MAX + 1; j++)
277         {
278           double g = ((1 << 24) - params->luts[i][j]) * im_scale;
279           int nl, rs;
280 
281           lut[j * 3] = g;
282           if (g == 0.0)
283             lut[j * 3 + 1] = 0.5;
284           else
285             lut[j * 3 + 1] = 0.5 - r_mul * rbscale / g;
286 
287           nl = (params->levels - 1 - g) * (1 << EVEN_SHIFT);
288           rs = eb_compute_randshift(nl, rs_base,
289                                     params->do_shadows, params->levels);
290 
291           lut[j * 3 + 2] = 1.0 / (1 << EVEN_SHIFT) / (1 << rs);
292         }
293     }
294   for (i = i - start_plane; i < 4; i++)
295     ctx->luts[i] = NULL;
296 
297   ctx->iir_line = (int *)eb_malloc_aligned(16 * (xs + 32), 16);
298   ctx->a_line = (int *)eb_malloc_aligned(16 * (xs + 32), 16);
299   ctx->b_line = (int *)eb_malloc_aligned(16 * (xs + 32), 16);
300   ctx->r_line = (int *)eb_malloc_aligned(16 * (xs + 32), 16);
301   for (i = 0; i < (xs + 32) * 4; i++)
302     {
303       ((float *)ctx->iir_line)[i] = 0;
304       ctx->a_line[i] = 1;
305       ctx->b_line[i] = aspect2;
306       ctx->r_line[i] = 0;
307     }
308 
309   ctx->skip_line = (char *)malloc((xs + 15) & -16);
310 
311   return ctx;
312 }
313 
314 static void
eb_ctx_sse2_free(eb_ctx_sse2 * ctx)315 eb_ctx_sse2_free(eb_ctx_sse2 *ctx)
316 {
317   int i;
318 
319   for (i = 0; i < 4; i++)
320     free(ctx->luts[i]);
321   eb_free_aligned(ctx->iir_line);
322   eb_free_aligned(ctx->a_line);
323   eb_free_aligned(ctx->b_line);
324   eb_free_aligned(ctx->r_line);
325   free(ctx->skip_line);
326   eb_free_aligned(ctx);
327 }
328 #endif
329 
330 #ifdef USE_AVEC
331 static eb_ctx_avec *
eb_ctx_avec_new(const EvenBetterParams * params,int start_plane,int end_plane)332 eb_ctx_avec_new(const EvenBetterParams *params, int start_plane, int end_plane)
333 {
334   int xs = params->source_width;
335   int aspect2 = params->aspect * params->aspect;
336   eb_ctx_avec *ctx;
337   int i;
338   double im_scale;
339   double k;
340   float imscale1, imscale2, rbmul, rsbase;
341   float r_mul = 1.0 / (params->aspect * (1 << (6 - params->even_c1_scale)));
342   double rbscale = eb_compute_rbscale(params);
343   vector unsigned int zero = vec_splat_u32(0);
344   const vector float kernel = { 1.0 / 16, 3.0 / 16, 5.0 / 16, 7.0 / 16 };
345   vector float almostone = { 255.0/256, 255.0/256, 255.0/256, 255.0/256 };
346   int rs_base;
347 
348   ctx = (eb_ctx_avec *)eb_malloc_aligned(sizeof(eb_ctx_avec), 16);
349   ctx->xs = xs;
350 
351   ctx->e = (vector float) zero;
352   ctx->e_i_1 = (vector float) zero;
353   ctx->r = zero;
354   ctx->a = zero;
355   im_scale = (params->levels - 1) * (1.0 / (1 << 24));
356   rs_base = 35 - EVEN_SHIFT - params->rand_scale;
357 
358   if (params->gamma == 1.0)
359     k = 0;
360   else if (params->gamma == 1.8)
361     k = 0.835;
362   else if (params->gamma == 2.0)
363     k = 1.0;
364   else
365     /* this shouldn't happen! */
366     k = 0;
367 
368   for (;;)
369     {
370       vector float foff, f0, f1;
371 
372       imscale1 = (1 - k) * (params->levels - 1) * (256.0 / 255.0);
373       imscale2 = k * (params->levels - 1) * sqrt(256.0 / 255.0);
374       for (i = 0; i < 4; i++)
375         {
376           ((float *)&ctx->imscale1)[i] = imscale1;
377           ((float *)&ctx->imscale2)[i] = imscale2;
378         }
379       f0 = vec_rsqrte(almostone);
380       f0 = vec_madd(f0, almostone, (vector float)zero);
381       f1 = vec_madd(f0, ctx->imscale2, (vector float)zero);
382       foff = vec_madd(almostone, ctx->imscale1, f1);
383       f1 = vec_nmsub(f0, ctx->imscale2, foff);
384       f1 = vec_nmsub(almostone, ctx->imscale1, f1);
385       if (vec_all_eq(f1, (vector float)zero))
386         {
387           ctx->foff = foff;
388           break;
389         }
390       k += 1e-5;
391     }
392   rbmul = -r_mul * rbscale;
393   rsbase = 1.0 / (1 << EVEN_SHIFT) / (1 << rs_base);
394   for (i = 0; i < 4; i++)
395     {
396       ((int *)&ctx->b)[i] = aspect2;
397       ((int *)&ctx->aspect2)[i] = aspect2;
398       ((int *)&ctx->seed1)[i] = (i << 8) + 0x7000;
399       ((int *)&ctx->seed2)[i] = (i << 16) + 0x9000;
400       ((float *)&ctx->ohi)[i] = params->levels - 1;
401       ((float *)&ctx->ehi)[i] = 1.1;
402       ((float *)&ctx->elo)[i] = -0.1;
403       ((float *)&ctx->r_mul)[i] = r_mul;
404       ((float *)&ctx->rsbase)[i] = rsbase;
405       ((float *)&ctx->rbmul)[i] = rbmul;
406     }
407   ctx->kernel = kernel;
408 
409   rs_base = 35 - EVEN_SHIFT - params->rand_scale;
410 
411   for (i = start_plane; i < end_plane; i++)
412     {
413       float *lut = (float *)malloc((ET_SRC_MAX + 1) * sizeof(float) * 3);
414       int j;
415       ctx->luts[i - start_plane] = lut;
416 
417       for (j = 0; j < ET_SRC_MAX + 1; j++)
418         {
419           double g = ((1 << 24) - params->luts[i][j]) * im_scale;
420           int nl, rs;
421 
422           lut[j * 3] = g;
423           if (g == 0.0)
424             lut[j * 3 + 1] = 0.5;
425           else
426             lut[j * 3 + 1] = 0.5 - r_mul * rbscale / g;
427           nl = (params->levels - 1 - g) * (1 << EVEN_SHIFT);
428           rs = eb_compute_randshift(nl, rs_base,
429                                     params->do_shadows, params->levels);
430 
431           lut[j * 3 + 2] = 1.0 / (1 << EVEN_SHIFT) / (1 << rs);
432         }
433     }
434   for (i = i - start_plane; i < 4; i++)
435     ctx->luts[i] = NULL;
436 
437   ctx->iir_line = (vector float *)eb_malloc_aligned(16 * (xs + 32), 16);
438   ctx->a_line = (vector unsigned int *)eb_malloc_aligned(16 * (xs + 32), 16);
439   ctx->b_line = (vector unsigned int *)eb_malloc_aligned(16 * (xs + 32), 16);
440   ctx->r_line = (vector unsigned int *)eb_malloc_aligned(16 * (xs + 32), 16);
441   for (i = 0; i < (xs + 32) * 4; i++)
442     {
443       ((float *)ctx->iir_line)[i] = 0;
444       ((int *)ctx->a_line)[i] = 1;
445       ((int *)ctx->b_line)[i] = aspect2;
446       ((int *)ctx->r_line)[i] = 0;
447     }
448 
449   ctx->skip_line = (char *)malloc((xs + 15) & -16);
450 
451   return ctx;
452 }
453 
454 static void
eb_ctx_avec_free(eb_ctx_avec * ctx)455 eb_ctx_avec_free(eb_ctx_avec *ctx)
456 {
457   int i;
458 
459   for (i = 0; i < 4; i++)
460     free(ctx->luts[i]);
461   eb_free_aligned(ctx->iir_line);
462   eb_free_aligned(ctx->a_line);
463   eb_free_aligned(ctx->b_line);
464   eb_free_aligned(ctx->r_line);
465   free(ctx->skip_line);
466   eb_free_aligned(ctx);
467 }
468 
469 #endif
470 
471 #ifdef USE_VECTOR
472 static int
even_better_line_vector(EvenBetterCtx * ebc,uchar ** dest,const ET_Rll * const * src)473 even_better_line_vector(EvenBetterCtx *ebc, uchar **dest,
474                       const ET_Rll *const *src)
475 {
476   int n_planes = ebc->n_planes;
477   int xd = ebc->dest_width;
478   int strip;
479   eb_srcbuf sb_alloc;
480   eb_srcbuf *srcbuf;
481   uchar dummy_a[32];
482   uchar *dummy_dst = (uchar *)(((int)dummy_a + 15) & -16);
483 #ifdef USE_SSE2
484   int save_mxcsr = eb_sse2_set_daz();
485 #endif
486 
487   srcbuf = (eb_srcbuf *)(((int)&sb_alloc + 12) & -16);
488 
489   for (strip = 0; strip < n_planes; strip += 4)
490     {
491 #ifdef USE_AVEC
492       eb_ctx_avec *ctx = ebc->avec_ctx[strip >> 2];
493 #endif
494 #ifdef USE_SSE2
495       eb_ctx_sse2 *ctx = ebc->sse2_ctx[strip >> 2];
496 #endif
497       uchar *destbufs[4];
498       const ET_Rll *const *sbuf = src + strip;
499       int count[4];
500       int src_idx[4];
501       int plane_idx, last_plane;
502       float im[4], rb[4], rs[4];
503       int i;
504 
505       last_plane = n_planes - strip < 4 ? n_planes - strip : 4;
506       for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
507         {
508           count[plane_idx] = 0;
509           src_idx[plane_idx] = 0;
510           destbufs[plane_idx] = dest[plane_idx + strip];
511         }
512       for (; plane_idx < 4; plane_idx++)
513         {
514           int j;
515 
516           for (j = 0; j < 16; j++)
517             {
518               ((float *)&srcbuf->im)[j * 4 + plane_idx] = 0.0;
519               ((float *)&srcbuf->rb)[j * 4 + plane_idx] = 0.0;
520               ((float *)&srcbuf->rs)[j * 4 + plane_idx] = 0.0;
521             }
522         }
523       for (i = 0; i < xd; i += 16)
524         {
525           int jmax = (xd - i) > 16 ? 16 : xd - i;
526           int skip = 1;
527           int j;
528 
529           for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
530             {
531               if (count[plane_idx] < 16 || im[plane_idx] != 0.0)
532                 {
533                   skip = 0;
534                   break;
535                 }
536             }
537           ctx->skip_line[i >> 4] = skip;
538 
539           if (skip)
540             {
541               /* all white */
542 
543               for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
544                 {
545                   uchar *dst_ptr = destbufs[plane_idx];
546                   if (jmax == 16)
547                     {
548                       ((uint32 *)dst_ptr)[(i >> 2) + 0] = 0;
549                       ((uint32 *)dst_ptr)[(i >> 2) + 1] = 0;
550                       ((uint32 *)dst_ptr)[(i >> 2) + 2] = 0;
551                       ((uint32 *)dst_ptr)[(i >> 2) + 3] = 0;
552                     }
553                   else
554                     {
555                       for (j = 0; j < jmax; j++)
556                         dst_ptr[i + j] = 0;
557                     }
558                   count[plane_idx] -= jmax;
559                 }
560             }
561           else
562             {
563               for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
564                 {
565                   const float *lut = ctx->luts[plane_idx];
566                   float imp = im[plane_idx];
567                   float rbp = rb[plane_idx];
568                   float rsp = rs[plane_idx];
569                   for (j = 0; j < jmax; j++)
570                     {
571                       if (count[plane_idx] == 0)
572                         {
573                           const ET_Rll *src_p = sbuf[plane_idx] +
574                             src_idx[plane_idx]++;
575                           ET_SrcPixel src_pixel = src_p->value;
576                           count[plane_idx] = src_p->length;
577                           imp = lut[src_pixel * 3];
578                           rbp = lut[src_pixel * 3 + 1];
579                           rsp = lut[src_pixel * 3 + 2];
580                         }
581                       ((float *)&srcbuf->im)[j * 4 + plane_idx] = imp;
582                       ((float *)&srcbuf->rb)[j * 4 + plane_idx] = rbp;
583                       ((float *)&srcbuf->rs)[j * 4 + plane_idx] = rsp;
584                       count[plane_idx]--;
585                     }
586                   im[plane_idx] = imp;
587                   rb[plane_idx] = rbp;
588                   rs[plane_idx] = rsp;
589                 }
590               for (; plane_idx < 4; plane_idx++)
591                 {
592                   destbufs[plane_idx] = dummy_dst - i;
593                 }
594 #ifdef USE_AVEC
595               eb_avec_core(ctx, (vector unsigned char **)destbufs, srcbuf, i);
596 #endif
597 #ifdef USE_SSE2
598               eb_sse2_core(ctx, destbufs, srcbuf, i);
599 #endif
600             }
601         }
602 
603       for (i = xd & -16; i >= 0; i -= 16)
604         {
605           if (!ctx->skip_line[i >> 4])
606             {
607 #ifdef USE_AVEC
608               eb_avec_rev_rs(ctx, i + 15);
609 #endif
610 #ifdef USE_SSE2
611               eb_sse2_rev_rs(ctx, i + 15);
612 #endif
613             }
614         }
615     }
616 #ifdef USE_SSE2
617   eb_sse2_restore_daz(save_mxcsr);
618 #endif
619   return 0;
620 }
621 #endif
622 
623 #ifdef USE_AVEC
624 static int
even_better_line_fastprep(EvenBetterCtx * ebc,uchar ** dest,const ET_SrcPixel * const * src)625 even_better_line_fastprep(EvenBetterCtx *ebc, uchar **dest,
626                           const ET_SrcPixel *const *src)
627 {
628   int n_planes = ebc->n_planes;
629   int xd = ebc->dest_width;
630   int strip;
631   eb_srcbuf sb_alloc;
632   eb_srcbuf *srcbuf;
633   uchar dummy_a[32];
634   uchar *dummy_dst = (uchar *)(((int)dummy_a + 15) & -16);
635 
636   srcbuf = (eb_srcbuf *)(((int)&sb_alloc + 12) & -16);
637 
638   for (strip = 0; strip < n_planes; strip += 4)
639     {
640 #ifdef USE_AVEC
641       eb_ctx_avec *ctx = ebc->avec_ctx[strip >> 2];
642 #endif
643 #ifdef USE_SSE2
644       eb_ctx_sse2 *ctx = ebc->sse2_ctx[strip >> 2];
645 #endif
646       uchar *destbufs[4];
647       const ET_SrcPixel *const *sbuf = src + strip;
648       int plane_idx, last_plane;
649       int i;
650 
651       last_plane = n_planes - strip < 4 ? n_planes - strip : 4;
652       for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
653         {
654           destbufs[plane_idx] = dest[plane_idx + strip];
655         }
656       for (i = 0; i < xd; i += 16)
657         {
658           int noskip;
659           noskip = eb_avec_prep_srcbuf(ctx, last_plane, srcbuf, sbuf, i);
660           ctx->skip_line[i >> 4] = noskip;
661           if (noskip)
662             {
663               for (plane_idx = last_plane; plane_idx < 4; plane_idx++)
664                 destbufs[plane_idx] = dummy_dst - i;
665               eb_avec_core(ctx, (vector unsigned char **)destbufs, srcbuf, i);
666             }
667           else
668             {
669               /* all white */
670 
671               for (plane_idx = 0; plane_idx < last_plane; plane_idx++)
672                 {
673                   uchar *dst_ptr = destbufs[plane_idx];
674                   ((uint32 *)dst_ptr)[(i >> 2) + 0] = 0;
675                   ((uint32 *)dst_ptr)[(i >> 2) + 1] = 0;
676                   ((uint32 *)dst_ptr)[(i >> 2) + 2] = 0;
677                   ((uint32 *)dst_ptr)[(i >> 2) + 3] = 0;
678                 }
679             }
680         }
681 
682       for (i = xd & -16; i >= 0; i -= 16)
683         {
684           if (ctx->skip_line[i >> 4])
685             {
686 #ifdef USE_AVEC
687               eb_avec_rev_rs(ctx, i + 15);
688 #endif
689 #ifdef USE_SSE2
690               eb_sse2_rev_rs(ctx, i + 15);
691 #endif
692             }
693         }
694     }
695   return 0;
696 }
697 #endif
698 
699 /* Maximum number of planes, but actually we want to dynamically
700    allocate all scratch buffers that depend on this. */
701 #define M 16
702 
703 static void
even_better_line_hi(EvenBetterCtx * ebc,uchar ** dest,const ET_Rll * const * src)704 even_better_line_hi (EvenBetterCtx *ebc, uchar **dest,
705                      const ET_Rll *const *src)
706 {
707   int a[M], b[M];
708   int e_1_0[M], e_m1_1[M], e_0_1[M], e_1_1[M];
709   int iml[M], rbl[M];
710   int i, j;
711   int im;
712   int *pa, *pb, *piir, *pr;
713   int r[M], rg;
714   int xd;
715   uint32 seed1 = ebc->seed1;
716   uint32 seed2 = ebc->seed2;
717   uint32 sum;
718   int plane_idx;
719   int r_scratch[M];
720   int n_planes = ebc->n_planes;
721   int levels = ebc->levels;
722 #ifdef OLD_QUANT
723   int dith_mul = levels << 8;
724 #else
725   int dith_mul = (levels - 1) << 8;
726 #endif
727   int imo_mul = (1 << (EVEN_SHIFT + IMO_SHIFT)) / (levels - 1);
728   int aspect2 = ebc->aspect * ebc->aspect;
729   int *strengths = ebc->strengths;
730   int even_elo = ebc->even_elo;
731   int even_ehi = ebc->even_ehi;
732   int coupling;
733   int *c_line = ebc->c_line;
734   int even_c1 = ebc->even_c1;
735   int rand_shift;
736   int even_rlimit = 1 << (30 - EVEN_SHIFT + even_c1);
737   int count[M], src_idx[M];
738   int rs[M];
739 
740   xd = ebc->dest_width;
741 
742   memset(rbl, 0x00, M * sizeof(int));
743   memset(iml, 0x00, M * sizeof(int));
744   memset(rs, 0x00, M * sizeof(int));
745 
746   for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
747     {
748       a[plane_idx] = 1;
749       b[plane_idx] = aspect2;
750       r[plane_idx] = 0;
751       e_0_1[plane_idx] = 0;
752       e_1_0[plane_idx] = 0;
753       e_1_1[plane_idx] = 0;
754       count[plane_idx] = 0;
755       src_idx[plane_idx] = 0;
756     }
757 
758   coupling = 0;
759 
760   for (i = 0; i < xd;)
761     {
762       int work_planes[M];
763       int n_work = 0;
764       int work_idx;
765       int jmax;
766 
767       jmax = (xd - i) > 16 ? 16 : xd - i;
768 
769       for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
770         {
771           EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
772           int *wcl = ctx->white_count_line;
773           if (count[plane_idx] >= 16 && iml[plane_idx] == 0)
774             wcl[i >> 4]++;
775           else
776             wcl[i >> 4] = 0;
777           if (wcl[i >> 4] > 15)
778             {
779               uchar *dst_ptr = dest[plane_idx];
780               if (jmax == 16)
781                 {
782                   ((uint32 *)dst_ptr)[(i >> 2) + 0] = 0;
783                   ((uint32 *)dst_ptr)[(i >> 2) + 1] = 0;
784                   ((uint32 *)dst_ptr)[(i >> 2) + 2] = 0;
785                   ((uint32 *)dst_ptr)[(i >> 2) + 3] = 0;
786                 }
787               else
788                 {
789                   for (j = 0; j < jmax; j++)
790                     dst_ptr[i + j] = 0;
791                 }
792               count[plane_idx] -= jmax;
793             }
794           else
795             {
796               work_planes[n_work++] = plane_idx;
797             }
798         }
799 
800       if (n_work == 0)
801         {
802           /* all planes were white */
803           i += jmax;
804           continue;
805         }
806 
807       for (j = 0; j < jmax; j++)
808         {
809 #ifdef FANCY_COUPLING
810           coupling += c_line[i];
811 #else
812           coupling = 0;
813 #endif
814           /* Lookup image data and compute R for all planes. */
815           for (work_idx = 0; work_idx < n_work; work_idx++)
816             {
817               int plane_idx = work_planes[work_idx];
818               EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
819               ET_SrcPixel src_pixel;
820               int new_r;
821 
822               pr = ctx->r_line;
823               pa = ctx->a_line;
824               pb = ctx->b_line;
825               if (count[plane_idx] == 0)
826                 {
827                   const ET_Rll *src_p = src[plane_idx] + src_idx[plane_idx]++;
828                   int *lut = ctx->lut;
829                   int *rblut = ctx->rb_lut;
830                   char *rslut = ctx->rs_lut;
831 
832                   count[plane_idx] = src_p->length;
833                   src_pixel = src_p->value;
834                   iml[plane_idx] = lut[src_pixel];
835                   rbl[plane_idx] = rblut[src_pixel];
836                   rs[plane_idx] = rslut[src_pixel];
837                 }
838               count[plane_idx]--;
839 
840               if (r[plane_idx] + a[plane_idx] < pr[i])
841                 {
842                   r[plane_idx] += a[plane_idx];
843                   a[plane_idx] += 2;
844                 }
845               else
846                 {
847                   a[plane_idx] = pa[i];
848                   b[plane_idx] = pb[i];
849                   r[plane_idx] = pr[i];
850                 }
851               if (iml[plane_idx] == 0)
852                 {
853                   r_scratch[plane_idx] = 0;
854                 }
855               else
856                 {
857                   int r_tmp;
858                   const int r_max = 0;
859                   new_r = r[plane_idx];
860                   if (new_r > even_rlimit)
861                     new_r = even_rlimit;
862                   /* Should we store back with the limit? */
863 
864                   rg = new_r << (EVEN_SHIFT - even_c1);
865                   r_tmp = rg - rbl[plane_idx];
866                   if (r_tmp > r_max) r_tmp >>= 3;
867                   r_scratch[plane_idx] = r_tmp;
868                 }
869             }
870 
871           /* Dither each plane. */
872           for (work_idx = 0; work_idx < n_work; work_idx++)
873             {
874               int plane_idx = work_planes[work_idx];
875               EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
876               uchar *dst_ptr = dest[plane_idx];
877               int new_e_1_0;
878               int coupling_contribution;
879 
880               pr = ctx->r_line;
881               pa = ctx->a_line;
882               pb = ctx->b_line;
883               piir = ctx->iir_line;
884 
885               im = iml[plane_idx];
886               e_m1_1[plane_idx] = e_0_1[plane_idx];
887               e_0_1[plane_idx] = e_1_1[plane_idx];
888               e_1_1[plane_idx] = i == xd - 1 ? 0 : piir[i + 1];
889               new_e_1_0 = ((e_1_0[plane_idx] * 7 + e_m1_1[plane_idx] * 3 +
890                             e_0_1[plane_idx] * 5 + e_1_1[plane_idx] * 1) >> 4);
891               if (im == 0)
892                 {
893                   dst_ptr[i] = 0;
894                 }
895               else
896                 {
897                   int err;
898                   int imo;
899 
900                   err = new_e_1_0;
901 
902                   err += r_scratch[plane_idx];
903 
904                   /* Add the two seeds together */
905                   sum = seed1 + seed2;
906 
907                   /* If the add generated a carry, increment
908                    * the result of the addition.
909                    */
910                   if (sum < seed1 || sum < seed2) sum++;
911 
912                   /* Seed2 becomes old seed1, seed1 becomes result */
913                   seed2 = seed1;
914                   seed1 = sum;
915 
916                   rand_shift = rs[plane_idx];
917                   err -= (sum >> rand_shift) - (0x80000000 >> rand_shift);
918 
919                   if (err < even_elo)
920                     err = even_elo;
921 
922                   else if (err > even_ehi)
923                     err = even_ehi;
924 
925 #if 1
926                   err += coupling;
927 #endif
928 
929 #ifdef OLD_QUANT
930                   imo = ((err + im) * dith_mul) >> (EVEN_SHIFT + 8);
931 #else
932                   imo = ((err + im) * dith_mul + (1 << (EVEN_SHIFT + 7))) >> (EVEN_SHIFT + 8);
933 #endif
934                   if (imo < 0) imo = 0;
935                   else if (imo > levels - 1) imo = levels - 1;
936                   dst_ptr[i] = imo;
937                   coupling_contribution = im - ((imo * imo_mul) >> IMO_SHIFT);
938                   new_e_1_0 += coupling_contribution;
939                   coupling += (coupling_contribution * strengths[plane_idx]) >> 8;
940                 }
941               if (dst_ptr[i] != 0)
942                 {
943                   a[plane_idx] = 1;
944                   b[plane_idx] = aspect2;
945                   r[plane_idx] = 0;
946                 }
947               pa[i] = a[plane_idx];
948               pb[i] = b[plane_idx];
949               pr[i] = r[plane_idx];
950               piir[i] = new_e_1_0;
951               e_1_0[plane_idx] = new_e_1_0;
952             }
953 #ifdef FANCY_COUPLING
954           coupling = coupling >> 1;
955           c_line[i] = coupling;
956 #endif
957           i++;
958         }
959     }
960 
961   /* Note: this isn't white optimized, but the payoff is probably not
962      that important. */
963 #ifdef FANCY_COUPLING
964   coupling = 0;
965   for (i = xd - 1; i >= 0; i--)
966     {
967       coupling = (coupling + c_line[i]) >> 1;
968       c_line[i] = (coupling - (coupling >> 4));
969     }
970 #endif
971 
972   /* Update distances. */
973   for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
974     {
975       EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
976       int *wcl = ctx->white_count_line;
977       int av, bv, rv;
978       int jmax;
979 
980       pr = ctx->r_line;
981       pa = ctx->a_line;
982       pb = ctx->b_line;
983 
984       av = 1;
985       bv = 1;
986       rv = 0;
987       jmax = ((xd - 1) & 15) + 1;
988       for (i = xd - 1; i >= 0;)
989         {
990           if (wcl[i >> 4] < 16)
991             {
992               for (j = 0; j < jmax; j++)
993                 {
994                   if (rv + bv + av < pr[i] + pb[i])
995                     {
996                       rv += av;
997                       av += 2;
998                     }
999                   else
1000                     {
1001                       rv = pr[i];
1002                       av = pa[i];
1003                       bv = pb[i];
1004                     }
1005                   if (rv > even_rlimit) rv = even_rlimit;
1006                   pa[i] = av;
1007                   pb[i] = bv + (aspect2 << 1);
1008                   pr[i] = rv + bv;
1009                   i--;
1010                 }
1011             }
1012           else
1013             i -= jmax;
1014           jmax = 16;
1015         }
1016     }
1017 
1018    ebc->seed1 = seed1;
1019    ebc->seed2 = seed2;
1020 }
1021 
1022 static void
even_better_line_both(EvenBetterCtx * ebc,uchar ** dest,const ET_Rll * const * src)1023 even_better_line_both (EvenBetterCtx *ebc, uchar **dest,
1024                        const ET_Rll *const *src)
1025 {
1026 #if 0
1027   int a[M], b[M];
1028   int a_sh[M], b_sh[M];
1029   int e_1_0[M], e_m1_1[M], e_0_1[M], e_1_1[M];
1030   int imraw[M];
1031   int iml[M];
1032   int i;
1033   int im;
1034   int *lut;
1035   const ET_SrcPixel *ps;
1036   int *pa, *pb, *piir, *pr;
1037   int *pa_sh, *pb_sh, *pr_sh;
1038   int r[M], rb, rg;
1039   int r_sh[M];
1040   int *rblut;
1041   int xd, xrem, xs;
1042   uint32 seed1 = ebc->seed1;
1043   uint32 seed2 = ebc->seed2;
1044   uint32 sum;
1045   int plane_idx;
1046   int r_scratch[M];
1047   int src_idx;
1048   int n_planes = ebc->n_planes;
1049   int levels = ebc->levels;
1050 #ifdef OLD_QUANT
1051   int dith_mul = levels << 8;
1052 #else
1053   int dith_mul = (levels - 1) << 8;
1054 #endif
1055   int imo_mul = (1 << (EVEN_SHIFT + IMO_SHIFT)) / (levels - 1);
1056   int aspect2 = ebc->aspect * ebc->aspect;
1057   int *strengths = ebc->strengths;
1058   int even_elo= ebc->even_elo;
1059   int even_ehi= ebc->even_ehi;
1060   int coupling;
1061   int *c_line = ebc->c_line;
1062   int even_c1 = ebc->even_c1;
1063   int rand_shift = ebc->rand_shift;
1064   int even_rlimit = 1 << (30 - EVEN_SHIFT + even_c1);
1065 
1066   xs = ebc->source_width;
1067   xd = ebc->dest_width;
1068   xrem = xd - xs;
1069 
1070   for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
1071     {
1072       a[plane_idx] = 1;
1073       b[plane_idx] = aspect2;
1074       a_sh[plane_idx] = 1;
1075       b_sh[plane_idx] = aspect2;
1076       r[plane_idx] = 0;
1077       r_sh[plane_idx] = 0;
1078       e_0_1[plane_idx] = 0;
1079       e_1_0[plane_idx] = 0;
1080       e_1_1[plane_idx] = 0;
1081     }
1082 
1083   coupling = 0;
1084 
1085   src_idx = 0;
1086   for (i = 0; i < xd; i++)
1087     {
1088 #ifdef FANCY_COUPLING
1089       coupling += c_line[i];
1090 #else
1091       coupling = 0;
1092 #endif
1093 
1094       xrem += xs;
1095       if (xrem >= xd)
1096         {
1097           for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
1098             {
1099               ps = src[plane_idx];
1100               imraw[plane_idx] = ps[src_idx];
1101             }
1102           src_idx++;
1103           xrem -= xd;
1104         }
1105 
1106       /* Lookup image data and compute R for all planes. */
1107       for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
1108         {
1109           EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
1110           ET_SrcPixel src_pixel;
1111           int new_r;
1112 
1113           pr = ctx->r_line;
1114           pa = ctx->a_line;
1115           pb = ctx->b_line;
1116           pr_sh = ctx->r_line_sh;
1117           pa_sh = ctx->a_line_sh;
1118           pb_sh = ctx->b_line_sh;
1119           lut = ctx->lut;
1120           rblut = ctx->rb_lut;
1121           src_pixel = imraw[plane_idx];
1122 
1123           im = lut[src_pixel];
1124           iml[plane_idx] = im;
1125           rb = rblut[src_pixel];
1126           if (r[plane_idx] + a[plane_idx] < pr[i])
1127             {
1128               r[plane_idx] += a[plane_idx];
1129               a[plane_idx] += 2;
1130             }
1131           else
1132             {
1133               a[plane_idx] = pa[i];
1134               b[plane_idx] = pb[i];
1135               r[plane_idx] = pr[i];
1136             }
1137           if (r_sh[plane_idx] + a_sh[plane_idx] < pr_sh[i])
1138             {
1139               r_sh[plane_idx] += a_sh[plane_idx];
1140               a_sh[plane_idx] += 2;
1141             }
1142           else
1143             {
1144               a_sh[plane_idx] = pa_sh[i];
1145               b_sh[plane_idx] = pb_sh[i];
1146               r_sh[plane_idx] = pr_sh[i];
1147             }
1148           if (im == 0 || im == (1 << EVEN_SHIFT))
1149             {
1150               r_scratch[plane_idx] = 0;
1151             }
1152           else
1153             {
1154               new_r = r[plane_idx];
1155               if (new_r > even_rlimit)
1156                 new_r = even_rlimit;
1157               /* Should we store back with the limit? */
1158               rg = new_r << (EVEN_SHIFT - even_c1);
1159 
1160               new_r = r_sh[plane_idx];
1161               if (new_r > even_rlimit)
1162                 new_r = even_rlimit;
1163               rg -= new_r << (EVEN_SHIFT - even_c1);
1164               r_scratch[plane_idx] = rg - rb;
1165             }
1166         }
1167 
1168       /* Dither each plane. */
1169       for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
1170         {
1171           EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
1172           uchar *dst_ptr = dest[plane_idx];
1173           int new_e_1_0;
1174           int coupling_contribution;
1175 
1176           pr = ctx->r_line;
1177           pa = ctx->a_line;
1178           pb = ctx->b_line;
1179           pr_sh = ctx->r_line_sh;
1180           pa_sh = ctx->a_line_sh;
1181           pb_sh = ctx->b_line_sh;
1182           piir = ctx->iir_line;
1183 
1184           im = iml[plane_idx];
1185           e_m1_1[plane_idx] = e_0_1[plane_idx];
1186           e_0_1[plane_idx] = e_1_1[plane_idx];
1187           e_1_1[plane_idx] = i == xd - 1 ? 0 : piir[i + 1];
1188           new_e_1_0 = ((e_1_0[plane_idx] * 7 + e_m1_1[plane_idx] * 3 +
1189                         e_0_1[plane_idx] * 5 + e_1_1[plane_idx] * 1) >> 4);
1190           if (im == 0)
1191             {
1192               dst_ptr[i] = 0;
1193             }
1194           else
1195             {
1196               int err;
1197               int imo;
1198 
1199               err = new_e_1_0;
1200 
1201               err += r_scratch[plane_idx];
1202 
1203               /* Add the two seeds together */
1204               sum = seed1 + seed2;
1205 
1206               /* If the add generated a carry, increment
1207                * the result of the addition.
1208                */
1209               if (sum < seed1 || sum < seed2) sum++;
1210 
1211               /* Seed2 becomes old seed1, seed1 becomes result */
1212               seed2 = seed1;
1213               seed1 = sum;
1214 
1215               err -= (sum >> rand_shift) - (0x80000000 >> rand_shift);
1216 
1217               if (err < even_elo)
1218                 err = even_elo;
1219 
1220               else if (err > even_ehi)
1221                 err = even_ehi;
1222 
1223 #if 1
1224               err += coupling;
1225 #endif
1226 
1227 #ifdef OLD_QUANT
1228               imo = ((err + im) * dith_mul) >> (EVEN_SHIFT + 8);
1229 #else
1230               imo = ((err + im) * dith_mul + (1 << (EVEN_SHIFT + 7))) >> (EVEN_SHIFT + 8);
1231 #endif
1232               if (imo < 0) imo = 0;
1233               else if (imo > levels - 1) imo = levels - 1;
1234               dst_ptr[i] = imo;
1235               coupling_contribution = im - ((imo * imo_mul) >> IMO_SHIFT);
1236               new_e_1_0 += coupling_contribution;
1237               coupling += (coupling_contribution * strengths[plane_idx]) >> 8;
1238             }
1239           if (dst_ptr[i] != 0)
1240             {
1241               a[plane_idx] = 1;
1242               b[plane_idx] = aspect2;
1243               r[plane_idx] = 0;
1244             }
1245           if (dst_ptr[i] != levels - 1)
1246             {
1247               a_sh[plane_idx] = 1;
1248               b_sh[plane_idx] = aspect2;
1249               r_sh[plane_idx] = 0;
1250             }
1251           pa[i] = a[plane_idx];
1252           pb[i] = b[plane_idx];
1253           pr[i] = r[plane_idx];
1254           pa_sh[i] = a_sh[plane_idx];
1255           pb_sh[i] = b_sh[plane_idx];
1256           pr_sh[i] = r_sh[plane_idx];
1257           piir[i] = new_e_1_0;
1258           e_1_0[plane_idx] = new_e_1_0;
1259         }
1260 #ifdef FANCY_COUPLING
1261       coupling = coupling >> 1;
1262       c_line[i] = coupling;
1263 #endif
1264     }
1265 
1266 #ifdef FANCY_COUPLING
1267   coupling = 0;
1268   for (i = xd - 1; i >= 0; i--)
1269     {
1270       if (plane_idx == 0)
1271         {
1272           coupling = (coupling + c_line[i]) >> 1;
1273           c_line[i] = (coupling - (coupling >> 4));
1274         }
1275     }
1276 #endif
1277 
1278   /* Update distances. */
1279   for (plane_idx = 0; plane_idx < n_planes; plane_idx++)
1280     {
1281       EBPlaneCtx *ctx = ebc->plane_ctx[plane_idx];
1282       int av, bv, rv;
1283       int av_sh, bv_sh, rv_sh;
1284 
1285       pr = ctx->r_line;
1286       pa = ctx->a_line;
1287       pb = ctx->b_line;
1288       pr_sh = ctx->r_line_sh;
1289       pa_sh = ctx->a_line_sh;
1290       pb_sh = ctx->b_line_sh;
1291 
1292       av = 1;
1293       bv = 1;
1294       rv = 0;
1295       av_sh = 1;
1296       bv_sh = 1;
1297       rv_sh = 0;
1298       for (i = xd - 1; i >= 0; i--)
1299         {
1300           if (rv + bv + av < pr[i] + pb[i])
1301             {
1302               rv += av;
1303               av += 2;
1304             }
1305           else
1306             {
1307               rv = pr[i];
1308               av = pa[i];
1309               bv = pb[i];
1310             }
1311           if (rv > even_rlimit) rv = even_rlimit;
1312           pa[i] = av;
1313           pb[i] = bv + (aspect2 << 1);
1314           pr[i] = rv + bv;
1315 
1316           if (rv_sh + bv_sh + av_sh < pr_sh[i] + pb_sh[i])
1317             {
1318               rv_sh += av_sh;
1319               av_sh += 2;
1320             }
1321           else
1322             {
1323               rv_sh = pr_sh[i];
1324               av_sh = pa_sh[i];
1325               bv_sh = pb_sh[i];
1326             }
1327           if (rv_sh > even_rlimit) rv_sh = even_rlimit;
1328           pa_sh[i] = av_sh;
1329           pb_sh[i] = bv_sh + (aspect2 << 1);
1330           pr_sh[i] = rv_sh + bv_sh;
1331         }
1332     }
1333 
1334    ebc->seed1 = seed1;
1335    ebc->seed2 = seed2;
1336 #endif
1337 }
1338 
1339 /**
1340  * even_better_line_rll: Screen a line using Even ToneFS screeing.
1341  * @ctx: An #EBPlaneCtx context.
1342  * @dest: Array of destination buffers, 8 bpp pixels each.
1343  * @src: Array of source buffers, runlength encoded.
1344  *
1345  * Screens a single line using Even ToneFS screening.
1346  **/
1347 void
even_better_line_rll(EvenBetterCtx * ebc,uchar ** dest,const ET_Rll * const * src)1348 even_better_line_rll (EvenBetterCtx *ebc, uchar **dest,
1349                       const ET_Rll *const *src)
1350 {
1351 
1352   if (ebc->dump_file && ebc->dump_level >= EB_DUMP_INPUT)
1353     {
1354       int i;
1355 
1356       /* Note: we should calculate the actual number of runlength
1357          codes here. As it is, it will just waste storage a bit. */
1358       for (i = 0; i < ebc->n_planes; i++)
1359         fwrite (src[i], sizeof(ET_Rll), ebc->source_width,
1360                 ebc->dump_file);
1361     }
1362 #ifdef USE_VECTOR
1363   if (ebc->using_vectors)
1364     even_better_line_vector(ebc, dest, src);
1365   else
1366 #endif
1367   if (ebc->do_shadows)
1368     even_better_line_both (ebc, dest, src);
1369   else
1370     even_better_line_hi (ebc, dest, src);
1371   if (ebc->dump_file && ebc->dump_level >= EB_DUMP_INPUT)
1372     {
1373       int i;
1374 
1375       for (i = 0; i < ebc->n_planes; i++)
1376         fwrite (dest[i], 1, ebc->dest_width,
1377                 ebc->dump_file);
1378     }
1379 }
1380 
1381 /**
1382  * even_better_compress_rll: Compress a single scan line to RLL format.
1383  * @dst: Destination buffer.
1384  * @src: Source buffer.
1385  * @width: Number of source pixels.
1386  *
1387  * Return value: number of runlength codes.
1388  **/
1389 static int
even_better_compress_rll(ET_Rll * dst,const ET_SrcPixel * src,int src_width,int dst_width)1390 even_better_compress_rll (ET_Rll *dst, const ET_SrcPixel *src,
1391                           int src_width, int dst_width)
1392 {
1393   int rll_idx;
1394   int i;
1395   int count;
1396   ET_SrcPixel last_val;
1397   int whole = dst_width / src_width;
1398   int frac = dst_width % src_width;
1399   int rem;
1400 
1401   rll_idx = 0;
1402   last_val = src[0];
1403   count = whole;
1404   if (frac == 0)
1405     {
1406       for (i = 1; i < src_width; i++)
1407         {
1408           ET_SrcPixel val = src[i];
1409 
1410           if (count > 0xffff - whole || val != last_val)
1411             {
1412               dst[rll_idx].length = count;
1413               dst[rll_idx].value = last_val;
1414               rll_idx++;
1415               last_val = val;
1416               count = 0;
1417             }
1418           count += whole;
1419         }
1420     }
1421   else
1422     {
1423       rem = frac;
1424       for (i = 1; i < src_width; i++)
1425         {
1426           ET_SrcPixel val = src[i];
1427 
1428           if (count >= 0xffff - whole || val != last_val)
1429             {
1430               dst[rll_idx].length = count;
1431               dst[rll_idx].value = last_val;
1432               rll_idx++;
1433               last_val = val;
1434               count = 0;
1435             }
1436           count += whole;
1437           rem += frac;
1438           if (rem >= src_width)
1439             {
1440               count++;
1441               rem -= src_width;
1442             }
1443         }
1444     }
1445   dst[rll_idx].length = count;
1446   dst[rll_idx].value = last_val;
1447   rll_idx++;
1448   return rll_idx;
1449 }
1450 
1451 /**
1452  * even_better_line: Screen a line using Even TonenFS screeing.
1453  * @ctx: An #EBPlaneCtx context.
1454  * @dest: Array of destination buffers, 8 bpp pixels each.
1455  * @src: Array of source buffer, ET_SrcPixel pixels each.
1456  *
1457  * Screens a single line using Even ToneFS screening.
1458  **/
1459 void
even_better_line(EvenBetterCtx * ebc,uchar ** dest,const ET_SrcPixel * const * src)1460 even_better_line (EvenBetterCtx *ebc, uchar **dest,
1461                       const ET_SrcPixel *const *src)
1462 {
1463   ET_Rll *rll_buf[M];
1464   int i;
1465   int source_width = ebc->source_width;
1466   int dest_width = ebc->dest_width;
1467 
1468 #ifdef USE_AVEC
1469   if (ebc->using_vectors == 2)
1470     {
1471       even_better_line_fastprep (ebc, dest, src);
1472     }
1473   else
1474 #endif
1475     {
1476       for (i = 0; i < ebc->n_planes; i++)
1477         {
1478           rll_buf[i] = (ET_Rll *)malloc (source_width * sizeof(ET_Rll));
1479           even_better_compress_rll (rll_buf[i], src[i], source_width, dest_width);
1480         }
1481       even_better_line_rll (ebc, dest, (const ET_Rll * const *)rll_buf);
1482       for (i = 0; i < ebc->n_planes; i++)
1483         free (rll_buf[i]);
1484     }
1485 }
1486 
1487 /**
1488  * even_better_plane_free: Free an #EBPlaneCtx context.
1489  * @ctx: The #EBPlaneCtx context to free.
1490  *
1491  * Frees @ctx.
1492  **/
1493 static void
even_better_plane_free(EBPlaneCtx * ctx)1494 even_better_plane_free (EBPlaneCtx *ctx)
1495 {
1496   free (ctx->rb_line);
1497   free (ctx->iir_line);
1498   free (ctx->r_line);
1499   free (ctx->a_line);
1500   free (ctx->b_line);
1501   free (ctx->lut);
1502   free (ctx->rb_lut);
1503   free (ctx->rs_lut);
1504   free (ctx->white_count_line);
1505   free (ctx);
1506 }
1507 
1508 static int
even_log2(int x)1509 even_log2 (int x)
1510 {
1511   int y = 0;
1512   int z;
1513 
1514   for (z = x; z > 1; z = z >> 1)
1515     y++;
1516   return y;
1517 }
1518 
1519 /**
1520  * even_better_new: Create new Even ToneFS screening context.
1521  * @source_width: Width of source buffer.
1522  * @dest_width: Width of destination buffer, in pixels.
1523  * @lut: Lookup table for gray values.
1524  *
1525  * Creates a new context for Even ToneFS screening.
1526  *
1527  * If @dest_width is larger than @source_width, then input lines will
1528  * be expanded using nearest-neighbor sampling.
1529  *
1530  * @lut should be an array of 256 values, one for each possible input
1531  * gray value. @lut is a lookup table for gray values. Each value
1532  * ranges from 0 (black) to 2^24 (white).
1533  *
1534  * Return value: The new #EBPlaneCtx context.
1535  **/
1536 static EBPlaneCtx *
even_better_plane_new(const EvenBetterParams * params,EvenBetterCtx * ebc,int plane_idx)1537 even_better_plane_new (const EvenBetterParams *params, EvenBetterCtx *ebc,
1538                        int plane_idx)
1539 {
1540   int source_width = params->source_width;
1541   int dest_width = params->dest_width;
1542   int *lut = params->luts[plane_idx];
1543   EBPlaneCtx *result;
1544   int i;
1545   int *new_lut;
1546   int *rb_lut;
1547   char *rs_lut;
1548   double rbscale = eb_compute_rbscale(params);
1549   int even_c1 = ebc->even_c1;
1550   int even_rlimit = 1 << (30 - EVEN_SHIFT + even_c1);
1551   int do_shadows = params->do_shadows;
1552   int log2_levels;
1553   int rs_base;
1554 
1555   result = (EBPlaneCtx *)malloc (sizeof(EBPlaneCtx));
1556 
1557   result->source_width = source_width;
1558   result->dest_width = dest_width;
1559 
1560   new_lut = (int *)malloc ((ET_SRC_MAX + 1) * sizeof(int));
1561   for (i = 0; i < ET_SRC_MAX + 1; i++)
1562     {
1563       int nli;
1564 
1565       if (lut == NULL)
1566         {
1567 #if ET_SRC_MAX == 255
1568           nli = (i * 65793 + (i >> 7)) >> (24 - EVEN_SHIFT);
1569 #else
1570           nli = (i * ((double) (1 << EVEN_SHIFT)) / ET_SRC_MAX) + 0.5;
1571 #endif
1572         }
1573       else
1574         nli = lut[i] >> (24 - EVEN_SHIFT);
1575       new_lut[i] = (1 << EVEN_SHIFT) - nli;
1576     }
1577 
1578   rb_lut = (int *)malloc ((ET_SRC_MAX + 1) * sizeof(int));
1579   rs_lut = (char *)malloc ((ET_SRC_MAX + 1) * sizeof(int));
1580 
1581   log2_levels = even_log2 (params->levels);
1582   rs_base = 35 - EVEN_SHIFT + log2_levels - params->rand_scale;
1583 
1584   for (i = 0; i <= ET_SRC_MAX; i++)
1585     {
1586       double rb;
1587       int nl = new_lut[i] * (params->levels - 1);
1588       int rs;
1589 
1590       if (nl == 0)
1591         rb = 0;
1592       else
1593         {
1594           rb = (rbscale * (1 << (2 * EVEN_SHIFT - even_c1))) / nl;
1595           if (rb > even_rlimit << (EVEN_SHIFT - even_c1))
1596             rb = even_rlimit << (EVEN_SHIFT - even_c1);
1597         }
1598 
1599       rs = eb_compute_randshift(nl, rs_base, do_shadows, params->levels);
1600       rs_lut[i] = rs;
1601 
1602       if (params->do_shadows)
1603         {
1604           nl = ((1 << EVEN_SHIFT) - new_lut[i]) * (params->levels - 1);
1605 
1606           if (nl == 0)
1607             rb = 0;
1608           else
1609             {
1610               int rb_sh;
1611               rb_sh = (rbscale * (1 << (2 * EVEN_SHIFT - even_c1))) / nl;
1612               if (rb_sh > even_rlimit << (EVEN_SHIFT - even_c1))
1613                 rb_sh = even_rlimit << (EVEN_SHIFT - even_c1);
1614               rb -= rb_sh;
1615             }
1616         }
1617       rb_lut[i] = rb;
1618 
1619     }
1620 
1621   result->lut = new_lut;
1622   result->rb_lut = rb_lut;
1623   result->rs_lut = rs_lut;
1624 
1625   result->rb_line = (int *)calloc (dest_width, sizeof(int));
1626   result->iir_line = (int *)calloc (dest_width, sizeof(int));
1627   result->r_line = (int *)calloc (dest_width, sizeof(int));
1628   result->a_line = (int *)calloc (dest_width, sizeof(int));
1629   result->b_line = (int *)calloc (dest_width, sizeof(int));
1630   result->white_count_line = (int *)calloc ((dest_width + 15) >> 4, sizeof(int));
1631   if (do_shadows)
1632     {
1633       result->r_line_sh = (int *)calloc (dest_width, sizeof(int));
1634       result->a_line_sh = (int *)calloc (dest_width, sizeof(int));
1635       result->b_line_sh = (int *)calloc (dest_width, sizeof(int));
1636     }
1637   else
1638     {
1639       result->r_line_sh = NULL;
1640       result->a_line_sh = NULL;
1641       result->b_line_sh = NULL;
1642     }
1643   for (i = 0; i < dest_width; i++)
1644     {
1645       result->a_line[i] = 1;
1646       result->b_line[i] = 1;
1647       result->iir_line[i] = -((rand () & 0x7fff) << 6) >> (24 - EVEN_SHIFT);
1648       if (do_shadows)
1649         {
1650           result->a_line_sh[i] = 1;
1651           result->b_line_sh[i] = 1;
1652         }
1653     }
1654 
1655   return result;
1656 }
1657 
1658 EvenBetterCtx *
even_better_new(const EvenBetterParams * params)1659 even_better_new (const EvenBetterParams *params)
1660 {
1661   EvenBetterCtx *result = (EvenBetterCtx *)malloc (sizeof(EvenBetterCtx));
1662   int n_planes = params->n_planes;
1663   int i;
1664   int log2_levels, log2_aspect;
1665   int using_vectors = 0;
1666 
1667   if (params->dump_file)
1668     {
1669       int header[5];
1670 
1671       header[0] = 0x70644245;
1672       header[1] = 'M' * 0x1010000 + 'I' * 0x101;
1673       header[2] = EVENBETTER_VERSION;
1674       header[3] = ET_SRC_MAX;
1675       header[4] = sizeof(ET_SrcPixel);
1676       fwrite (header, sizeof(int), sizeof(header) / sizeof(header[0]),
1677               params->dump_file);
1678       if (params->dump_level >= EB_DUMP_PARAMS)
1679         {
1680 
1681           fwrite (params, 1, sizeof(EvenBetterParams), params->dump_file);
1682         }
1683       if (params->dump_level >= EB_DUMP_LUTS)
1684         {
1685           int i;
1686           for (i = 0; i < params->n_planes; i++)
1687             fwrite (params->luts[i], sizeof(int), ET_SRC_MAX + 1,
1688                     params->dump_file);
1689         }
1690     }
1691 
1692   result->source_width = params->source_width;
1693   result->dest_width = params->dest_width;
1694   result->n_planes = n_planes;
1695   result->levels = params->levels;
1696 
1697   result->aspect = params->aspect;
1698 
1699   result->even_ehi = 0.6 * (1 << EVEN_SHIFT) / (params->levels - 1);
1700   result->even_elo = -result->even_ehi;
1701 
1702   result->strengths = (int *)malloc (sizeof(int) * n_planes);
1703   memcpy (result->strengths, params->strengths,
1704           sizeof(int) * n_planes);
1705 
1706   log2_levels = even_log2 (params->levels);
1707   log2_aspect = even_log2 (params->aspect);
1708   result->even_c1 = 6 + log2_aspect + log2_levels - params->even_c1_scale;
1709   result->do_shadows = params->do_shadows;
1710 
1711   result->c_line = (int *)calloc (params->dest_width, sizeof(int));
1712 
1713   result->seed1 = 0x5324879f;
1714   result->seed2 = 0xb78d0945;
1715 
1716   result->dump_file = params->dump_file;
1717   result->dump_level = params->dump_level;
1718 
1719 #ifdef USE_SSE2
1720   using_vectors = eb_test_sse2();
1721 #endif
1722 #ifdef USE_AVEC
1723   using_vectors = 1; /* todo: Altivec sensing */
1724 
1725   /* select fastprep */
1726   if (sizeof(ET_SrcPixel) == 1 && using_vectors && params->gamma != 0)
1727     using_vectors = 2;
1728 
1729 #endif
1730 
1731 #ifdef USE_VECTOR
1732   result->using_vectors = using_vectors;
1733 #endif
1734   if (using_vectors)
1735     {
1736 #ifdef USE_SSE2
1737       result->sse2_ctx = (eb_ctx_sse2 **)malloc(sizeof(eb_ctx_sse2 *) *
1738                                                 ((n_planes + 3) >> 2));
1739       for (i = 0; i < n_planes; i += 4)
1740         {
1741           int end_plane = i + 4 < n_planes ? i + 4 : n_planes;
1742           result->sse2_ctx[i >> 2] = eb_ctx_sse2_new(params, i, end_plane);
1743         }
1744 #endif
1745 #ifdef USE_AVEC
1746       result->avec_ctx = (eb_ctx_avec **)malloc(sizeof(eb_ctx_avec *) *
1747                                                 ((n_planes + 3) >> 2));
1748       for (i = 0; i < n_planes; i += 4)
1749         {
1750           int end_plane = i + 4 < n_planes ? i + 4 : n_planes;
1751           result->avec_ctx[i >> 2] = eb_ctx_avec_new(params, i, end_plane);
1752         }
1753 #endif
1754       result->plane_ctx = NULL;
1755     }
1756   else
1757     {
1758       result->plane_ctx = (EBPlaneCtx **)malloc(sizeof(EBPlaneCtx *) * n_planes);
1759       for (i = 0; i < n_planes; i++)
1760         result->plane_ctx[i] = even_better_plane_new (params, result, i);
1761     }
1762   return result;
1763 }
1764 
1765 /**
1766  * even_better_free: Free an #EvenBetterCtx context.
1767  * @ctx: The #EvenBetterCtx context to free.
1768  *
1769  * Frees @ctx.
1770  **/
1771 void
even_better_free(EvenBetterCtx * ctx)1772 even_better_free (EvenBetterCtx *ctx)
1773 {
1774   int i;
1775   int n_planes = ctx->n_planes;
1776 
1777   if (ctx->dump_file)
1778     fclose (ctx->dump_file);
1779 
1780 #ifdef USE_VECTOR
1781   if (ctx->using_vectors)
1782     {
1783 #ifdef USE_SSE2
1784       for (i = 0; i < n_planes; i += 4)
1785         eb_ctx_sse2_free(ctx->sse2_ctx[i >> 2]);
1786       free(ctx->sse2_ctx);
1787 #endif
1788 #ifdef USE_AVEC
1789       for (i = 0; i < n_planes; i += 4)
1790         eb_ctx_avec_free(ctx->avec_ctx[i >> 2]);
1791       free(ctx->avec_ctx);
1792 #endif
1793     }
1794   else
1795 #endif
1796     {
1797       for (i = 0; i < n_planes; i++)
1798         even_better_plane_free (ctx->plane_ctx[i]);
1799       free(ctx->plane_ctx);
1800     }
1801   free (ctx->strengths);
1802   free (ctx->c_line);
1803 
1804   free (ctx);
1805 }
1806