1 /*****************************************************************************
2  * This file is part of Kvazaar HEVC encoder.
3  *
4  * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without modification,
8  * are permitted provided that the following conditions are met:
9  *
10  * * Redistributions of source code must retain the above copyright notice, this
11  *   list of conditions and the following disclaimer.
12  *
13  * * Redistributions in binary form must reproduce the above copyright notice, this
14  *   list of conditions and the following disclaimer in the documentation and/or
15  *   other materials provided with the distribution.
16  *
17  * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
18  *   contributors may be used to endorse or promote products derived from
19  *   this software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  ****************************************************************************/
32 
33 #include "image.h"
34 
35 #include <limits.h>
36 #include <stdlib.h>
37 
38 #include "strategies/strategies-ipol.h"
39 #include "strategies/strategies-picture.h"
40 #include "threads.h"
41 
42 /**
43 * \brief Allocate a new image with 420.
44 * This function signature is part of the libkvz API.
45 * \return image pointer or NULL on failure
46 */
kvz_image_alloc_420(const int32_t width,const int32_t height)47 kvz_picture * kvz_image_alloc_420(const int32_t width, const int32_t height)
48 {
49   return kvz_image_alloc(KVZ_CSP_420, width, height);
50 }
51 
52 /**
53  * \brief Allocate a new image.
54  * \return image pointer or NULL on failure
55  */
kvz_image_alloc(enum kvz_chroma_format chroma_format,const int32_t width,const int32_t height)56 kvz_picture * kvz_image_alloc(enum kvz_chroma_format chroma_format, const int32_t width, const int32_t height)
57 {
58   //Assert that we have a well defined image
59   assert((width % 2) == 0);
60   assert((height % 2) == 0);
61 
62   const size_t simd_padding_width = 64;
63 
64   kvz_picture *im = MALLOC(kvz_picture, 1);
65   if (!im) return NULL;
66 
67   unsigned int luma_size = width * height;
68   unsigned chroma_sizes[] = { 0, luma_size / 4, luma_size / 2, luma_size };
69   unsigned chroma_size = chroma_sizes[chroma_format];
70 
71   im->chroma_format = chroma_format;
72 
73   //Allocate memory, pad the full data buffer from both ends
74   im->fulldata_buf = MALLOC_SIMD_PADDED(kvz_pixel, (luma_size + 2 * chroma_size), simd_padding_width * 2);
75   if (!im->fulldata_buf) {
76     free(im);
77     return NULL;
78   }
79   im->fulldata = im->fulldata_buf + simd_padding_width / sizeof(kvz_pixel);
80 
81   im->base_image = im;
82   im->refcount = 1; //We give a reference to caller
83   im->width = width;
84   im->height = height;
85   im->stride = width;
86   im->chroma_format = chroma_format;
87 
88   im->y = im->data[COLOR_Y] = &im->fulldata[0];
89 
90   if (chroma_format == KVZ_CSP_400) {
91     im->u = im->data[COLOR_U] = NULL;
92     im->v = im->data[COLOR_V] = NULL;
93   } else {
94     im->u = im->data[COLOR_U] = &im->fulldata[luma_size];
95     im->v = im->data[COLOR_V] = &im->fulldata[luma_size + chroma_size];
96   }
97 
98   im->pts = 0;
99   im->dts = 0;
100 
101   im->interlacing = KVZ_INTERLACING_NONE;
102 
103   return im;
104 }
105 
106 /**
107  * \brief Free an image.
108  *
109  * Decrement reference count of the image and deallocate associated memory
110  * if no references exist any more.
111  *
112  * \param im image to free
113  */
kvz_image_free(kvz_picture * const im)114 void kvz_image_free(kvz_picture *const im)
115 {
116   if (im == NULL) return;
117 
118   int32_t new_refcount = KVZ_ATOMIC_DEC(&(im->refcount));
119   if (new_refcount > 0) {
120     // There are still references so we don't free the data yet.
121     return;
122   }
123 
124   if (im->base_image != im) {
125     // Free our reference to the base image.
126     kvz_image_free(im->base_image);
127   } else {
128     free(im->fulldata_buf);
129   }
130 
131   // Make sure freed data won't be used.
132   im->base_image = NULL;
133   im->fulldata_buf = NULL;
134   im->fulldata = NULL;
135   im->y = im->u = im->v = NULL;
136   im->data[COLOR_Y] = im->data[COLOR_U] = im->data[COLOR_V] = NULL;
137   free(im);
138 }
139 
140 /**
141  * \brief Get a new pointer to an image.
142  *
143  * Increment reference count and return the image.
144  */
kvz_image_copy_ref(kvz_picture * im)145 kvz_picture *kvz_image_copy_ref(kvz_picture *im)
146 {
147   int32_t new_refcount = KVZ_ATOMIC_INC(&im->refcount);
148   // The caller should have had another reference and we added one
149   // reference so refcount should be at least 2.
150   assert(new_refcount >= 2);
151   return im;
152 }
153 
/**
 * \brief Create a subimage that shares pixel data with the original image.
 *
 * The subimage takes a reference to the base image of \p orig_image, so the
 * underlying pixel buffer stays alive until the subimage is released with
 * kvz_image_free.
 *
 * \param orig_image  image to make the subimage of
 * \param x_offset    x-coordinate of the top-left corner (must be even)
 * \param y_offset    y-coordinate of the top-left corner (must be even)
 * \param width       width of the subimage (must be even)
 * \param height      height of the subimage (must be even)
 * \return image pointer or NULL on failure
 */
kvz_picture *kvz_image_make_subimage(kvz_picture *const orig_image,
                             const unsigned x_offset,
                             const unsigned y_offset,
                             const unsigned width,
                             const unsigned height)
{
  // Assert that we have a well defined image
  assert((width % 2) == 0);
  assert((height % 2) == 0);

  assert((x_offset % 2) == 0);
  assert((y_offset % 2) == 0);

  assert(x_offset + width <= orig_image->width);
  assert(y_offset + height <= orig_image->height);

  kvz_picture *im = MALLOC(kvz_picture, 1);
  if (!im) return NULL;

  im->base_image = kvz_image_copy_ref(orig_image->base_image);
  im->refcount = 1; // We give a reference to caller
  im->width = width;
  im->height = height;
  im->stride = orig_image->stride;
  im->chroma_format = orig_image->chroma_format;

  im->y = im->data[COLOR_Y] = &orig_image->y[x_offset + y_offset * orig_image->stride];
  if (orig_image->chroma_format != KVZ_CSP_400) {
    im->u = im->data[COLOR_U] = &orig_image->u[x_offset / 2 + y_offset / 2 * orig_image->stride / 2];
    im->v = im->data[COLOR_V] = &orig_image->v[x_offset / 2 + y_offset / 2 * orig_image->stride / 2];
  } else {
    // Monochrome: don't leave the chroma pointers uninitialized
    // (kvz_image_alloc sets them to NULL for 4:0:0 as well).
    im->u = im->data[COLOR_U] = NULL;
    im->v = im->data[COLOR_V] = NULL;
  }

  im->pts = 0;
  im->dts = 0;
  // Inherit interlacing mode; previously left uninitialized.
  im->interlacing = orig_image->interlacing;

  return im;
}
191 
kvz_yuv_t_alloc(int luma_size,int chroma_size)192 yuv_t * kvz_yuv_t_alloc(int luma_size, int chroma_size)
193 {
194   yuv_t *yuv = (yuv_t *)malloc(sizeof(*yuv));
195   yuv->size = luma_size;
196 
197   // Get buffers with separate mallocs in order to take advantage of
198   // automatic buffer overrun checks.
199   yuv->y = (kvz_pixel *)malloc(luma_size * sizeof(*yuv->y));
200   if (chroma_size == 0) {
201     yuv->u = NULL;
202     yuv->v = NULL;
203   } else {
204     yuv->u = (kvz_pixel *)malloc(chroma_size * sizeof(*yuv->u));
205     yuv->v = (kvz_pixel *)malloc(chroma_size * sizeof(*yuv->v));
206   }
207 
208   return yuv;
209 }
210 
/**
 * \brief Release a yuv_t and all of its planes.
 *
 * Accepts NULL, in which case nothing happens.
 */
void kvz_yuv_t_free(yuv_t *yuv)
{
  if (yuv == NULL) {
    return;
  }
  FREE_POINTER(yuv->y);
  FREE_POINTER(yuv->u);
  FREE_POINTER(yuv->v);
  FREE_POINTER(yuv);
}
220 
kvz_hi_prec_buf_t_alloc(int luma_size)221 hi_prec_buf_t * kvz_hi_prec_buf_t_alloc(int luma_size)
222 {
223   // Get buffers with separate mallocs in order to take advantage of
224   // automatic buffer overrun checks.
225   hi_prec_buf_t *yuv = (hi_prec_buf_t *)malloc(sizeof(*yuv));
226   yuv->y = (int16_t *)malloc(luma_size * sizeof(*yuv->y));
227   yuv->u = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->u));
228   yuv->v = (int16_t *)malloc(luma_size / 2 * sizeof(*yuv->v));
229   yuv->size = luma_size;
230 
231   return yuv;
232 }
233 
/**
 * \brief Release a high-precision pixel buffer and its planes.
 *
 * Accepts NULL for consistency with kvz_yuv_t_free (previously a NULL
 * argument dereferenced yuv->y).
 */
void kvz_hi_prec_buf_t_free(hi_prec_buf_t * yuv)
{
  if (yuv == NULL) return;
  free(yuv->y);
  free(yuv->u);
  free(yuv->v);
  free(yuv);
}
241 
reg_sad_maybe_optimized(const kvz_pixel * const data1,const kvz_pixel * const data2,const int32_t width,const int32_t height,const uint32_t stride1,const uint32_t stride2,optimized_sad_func_ptr_t optimized_sad)242 static INLINE uint32_t reg_sad_maybe_optimized(const kvz_pixel * const data1, const kvz_pixel * const data2,
243                                   const int32_t width, const int32_t height, const uint32_t stride1,
244                                   const uint32_t stride2, optimized_sad_func_ptr_t optimized_sad)
245 {
246   if (optimized_sad != NULL)
247     return optimized_sad(data1, data2, height, stride1, stride2);
248   else
249     return kvz_reg_sad(data1, data2, width, height, stride1, stride2);
250 }
251 
252 /**
253  * \brief Diagonally interpolate SAD outside the frame.
254  *
255  * \param data1   Starting point of the first picture.
256  * \param data2   Starting point of the second picture.
257  * \param width   Width of the region for which SAD is calculated.
258  * \param height  Height of the region for which SAD is calculated.
259  * \param width  Width of the pixel array.
260  *
261  * \returns Sum of Absolute Differences
262  */
cor_sad(const kvz_pixel * pic_data,const kvz_pixel * ref_data,int block_width,int block_height,unsigned pic_stride)263 static unsigned cor_sad(const kvz_pixel *pic_data, const kvz_pixel *ref_data,
264                         int block_width, int block_height, unsigned pic_stride)
265 {
266   kvz_pixel ref = *ref_data;
267   int x, y;
268   unsigned sad = 0;
269 
270   for (y = 0; y < block_height; ++y) {
271     for (x = 0; x < block_width; ++x) {
272       sad += abs(pic_data[y * pic_stride + x] - ref);
273     }
274   }
275 
276   return sad;
277 }
278 
279 
280 /**
281  * \brief  Handle special cases of comparing blocks that are not completely
282  *         inside the frame.
283  *
284  * \param pic  First frame.
285  * \param ref  Second frame.
286  * \param pic_x  X coordinate of the first block.
287  * \param pic_y  Y coordinate of the first block.
288  * \param ref_x  X coordinate of the second block.
289  * \param ref_y  Y coordinate of the second block.
290  * \param block_width  Width of the blocks.
291  * \param block_height  Height of the blocks.
292  */
image_interpolated_sad(const kvz_picture * pic,const kvz_picture * ref,int pic_x,int pic_y,int ref_x,int ref_y,int block_width,int block_height,optimized_sad_func_ptr_t optimized_sad)293 static unsigned image_interpolated_sad(const kvz_picture *pic, const kvz_picture *ref,
294                                  int pic_x, int pic_y, int ref_x, int ref_y,
295                                  int block_width, int block_height,
296                                  optimized_sad_func_ptr_t optimized_sad)
297 {
298   kvz_pixel *pic_data, *ref_data;
299 
300   int left, right, top, bottom;
301   int result = 0;
302 
303   // Change the movement vector to point right next to the frame. This doesn't
304   // affect the result but removes some special cases.
305   if (ref_x > ref->width)            ref_x = ref->width;
306   if (ref_y > ref->height)           ref_y = ref->height;
307   if (ref_x + block_width < 0)  ref_x = -block_width;
308   if (ref_y + block_height < 0) ref_y = -block_height;
309 
310   // These are the number of pixels by how far the movement vector points
311   // outside the frame. They are always >= 0. If all of them are 0, the
312   // movement vector doesn't point outside the frame.
313   left   = (ref_x < 0) ? -ref_x : 0;
314   top    = (ref_y < 0) ? -ref_y : 0;
315   right  = (ref_x + block_width  > ref->width)  ? ref_x + block_width  - ref->width  : 0;
316   bottom = (ref_y + block_height > ref->height) ? ref_y + block_height - ref->height : 0;
317 
318   // Center picture to the current block and reference to the point where
319   // movement vector is pointing to. That point might be outside the buffer,
320   // but that is ok because we project the movement vector to the buffer
321   // before dereferencing the pointer.
322   pic_data = &pic->y[pic_y * pic->stride + pic_x];
323   ref_data = &ref->y[ref_y * ref->stride + ref_x];
324 
325   // The handling of movement vectors that point outside the picture is done
326   // in the following way.
327   // - Correct the index of ref_data so that it points to the top-left
328   //   of the area we want to compare against.
329   // - Correct the index of pic_data to point inside the current block, so
330   //   that we compare the right part of the block to the ref_data.
331   // - Reduce block_width and block_height so that the the size of the area
332   //   being compared is correct.
333   //
334   // NOTE: No more correct since hor_sad was modified to be a separate
335   // strategy
336   if (top && left) {
337     result += cor_sad(pic_data,
338                       &ref_data[top * ref->stride + left],
339                       left, top, pic->stride);
340     result += kvz_ver_sad(&pic_data[left],
341                       &ref_data[top * ref->stride + left],
342                       block_width - left, top, pic->stride);
343 
344     result += kvz_hor_sad(pic_data + top * pic->stride,
345                           ref_data + top * ref->stride,
346                           block_width, block_height - top,
347                           pic->stride, ref->stride,
348                           left, right);
349 
350   } else if (top && right) {
351     result += kvz_ver_sad(pic_data,
352                       &ref_data[top * ref->stride],
353                       block_width - right, top, pic->stride);
354     result += cor_sad(&pic_data[block_width - right],
355                       &ref_data[top * ref->stride + (block_width - right - 1)],
356                       right, top, pic->stride);
357 
358     result += kvz_hor_sad(pic_data + top * pic->stride,
359                           ref_data + top * ref->stride,
360                           block_width, block_height - top,
361                           pic->stride, ref->stride,
362                           left, right);
363 
364   } else if (bottom && left) {
365     result += kvz_hor_sad(pic_data, ref_data, block_width, block_height - bottom,
366                           pic->stride, ref->stride, left, right);
367 
368     result += cor_sad(&pic_data[(block_height - bottom) * pic->stride],
369                       &ref_data[(block_height - bottom - 1) * ref->stride + left],
370                       left, bottom, pic->stride);
371     result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride + left],
372                       &ref_data[(block_height - bottom - 1) * ref->stride + left],
373                       block_width - left, bottom, pic->stride);
374   } else if (bottom && right) {
375     result += kvz_hor_sad(pic_data, ref_data, block_width, block_height - bottom,
376                           pic->stride, ref->stride, left, right);
377 
378     result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
379                       &ref_data[(block_height - bottom - 1) * ref->stride],
380                       block_width - right, bottom, pic->stride);
381     result += cor_sad(&pic_data[(block_height - bottom) * pic->stride + block_width - right],
382                       &ref_data[(block_height - bottom - 1) * ref->stride + block_width - right - 1],
383                       right, bottom, pic->stride);
384   } else if (top) {
385     result += kvz_ver_sad(pic_data,
386                       &ref_data[top * ref->stride],
387                       block_width, top, pic->stride);
388     result += reg_sad_maybe_optimized(&pic_data[top * pic->stride],
389                       &ref_data[top * ref->stride],
390                       block_width, block_height - top, pic->stride, ref->stride,
391                       optimized_sad);
392   } else if (bottom) {
393     result += reg_sad_maybe_optimized(pic_data,
394                       ref_data,
395                       block_width, block_height - bottom, pic->stride, ref->stride,
396                       optimized_sad);
397     result += kvz_ver_sad(&pic_data[(block_height - bottom) * pic->stride],
398                       &ref_data[(block_height - bottom - 1) * ref->stride],
399                       block_width, bottom, pic->stride);
400   } else if (left | right) {
401     result += kvz_hor_sad(pic_data, ref_data,
402                           block_width, block_height, pic->stride,
403                           ref->stride, left, right);
404   } else {
405     result += reg_sad_maybe_optimized(pic_data, ref_data,
406                                       block_width, block_height,
407                                       pic->stride, ref->stride,
408                                       optimized_sad);
409   }
410   return result;
411 }
412 
413 /**
414 * \brief Calculate interpolated SAD between two blocks.
415 *
416 * \param pic        Image for the block we are trying to find.
417 * \param ref        Image where we are trying to find the block.
418 *
419 * \returns          Sum of absolute differences
420 */
kvz_image_calc_sad(const kvz_picture * pic,const kvz_picture * ref,int pic_x,int pic_y,int ref_x,int ref_y,int block_width,int block_height,optimized_sad_func_ptr_t optimized_sad)421 unsigned kvz_image_calc_sad(const kvz_picture *pic,
422                             const kvz_picture *ref,
423                             int pic_x,
424                             int pic_y,
425                             int ref_x,
426                             int ref_y,
427                             int block_width,
428                             int block_height,
429                             optimized_sad_func_ptr_t optimized_sad)
430 {
431   assert(pic_x >= 0 && pic_x <= pic->width - block_width);
432   assert(pic_y >= 0 && pic_y <= pic->height - block_height);
433 
434   uint32_t res;
435 
436   if (ref_x >= 0 && ref_x <= ref->width  - block_width &&
437       ref_y >= 0 && ref_y <= ref->height - block_height)
438   {
439     // Reference block is completely inside the frame, so just calculate the
440     // SAD directly. This is the most common case, which is why it's first.
441     const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
442     const kvz_pixel *ref_data = &ref->y[ref_y * ref->stride + ref_x];
443 
444     res = reg_sad_maybe_optimized(pic_data,
445                                   ref_data,
446                                   block_width,
447                                   block_height,
448                                   pic->stride,
449                                   ref->stride,
450                                   optimized_sad);
451   } else {
452     // Call a routine that knows how to interpolate pixels outside the frame.
453     res = image_interpolated_sad(pic, ref, pic_x, pic_y, ref_x, ref_y, block_width, block_height, optimized_sad);
454   }
455   return res >> (KVZ_BIT_DEPTH - 8);
456 }
457 
458 
459 /**
460 * \brief Calculate interpolated SATD between two blocks.
461 *
462 * \param pic        Image for the block we are trying to find.
463 * \param ref        Image where we are trying to find the block.
464 */
kvz_image_calc_satd(const kvz_picture * pic,const kvz_picture * ref,int pic_x,int pic_y,int ref_x,int ref_y,int block_width,int block_height)465 unsigned kvz_image_calc_satd(const kvz_picture *pic,
466                              const kvz_picture *ref,
467                              int pic_x,
468                              int pic_y,
469                              int ref_x,
470                              int ref_y,
471                              int block_width,
472                              int block_height)
473 {
474   assert(pic_x >= 0 && pic_x <= pic->width - block_width);
475   assert(pic_y >= 0 && pic_y <= pic->height - block_height);
476 
477   if (ref_x >= 0 && ref_x <= ref->width  - block_width &&
478       ref_y >= 0 && ref_y <= ref->height - block_height)
479   {
480     // Reference block is completely inside the frame, so just calculate the
481     // SAD directly. This is the most common case, which is why it's first.
482     const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
483     const kvz_pixel *ref_data = &ref->y[ref_y * ref->stride + ref_x];
484     return kvz_satd_any_size(block_width,
485                              block_height,
486                              pic_data,
487                              pic->stride,
488                              ref_data,
489                              ref->stride) >> (KVZ_BIT_DEPTH - 8);
490   } else {
491     // Extrapolate pixels from outside the frame.
492 
493     // Space for extrapolated pixels and the part from the picture
494     // The extrapolation function will set the pointers and stride.
495     kvz_pixel ext_buffer[LCU_LUMA_SIZE];
496     kvz_pixel *ext = NULL;
497     kvz_pixel *ext_origin = NULL;
498     int ext_s = 0;
499     kvz_epol_args epol_args = {
500       .src = ref->y,
501       .src_w = ref->width,
502       .src_h = ref->height,
503       .src_s = ref->stride,
504       .blk_x = ref_x,
505       .blk_y = ref_y,
506       .blk_w = block_width,
507       .blk_h = block_height,
508       .pad_l = 0,
509       .pad_r = 0,
510       .pad_t = 0,
511       .pad_b = 0,
512       .pad_b_simd = 0,
513     };
514 
515     // Initialize separately. Gets rid of warning
516     // about using nonstandard extension.
517     epol_args.buf = ext_buffer;
518     epol_args.ext = &ext;
519     epol_args.ext_origin = &ext_origin;
520     epol_args.ext_s = &ext_s;
521 
522     kvz_get_extended_block(&epol_args);
523 
524     const kvz_pixel *pic_data = &pic->y[pic_y * pic->stride + pic_x];
525 
526     unsigned satd = kvz_satd_any_size(block_width,
527       block_height,
528       pic_data,
529       pic->stride,
530       ext_origin,
531       ext_s) >> (KVZ_BIT_DEPTH - 8);
532 
533     return satd;
534   }
535 }
536 
537 
538 
539 
540 /**
541  * \brief BLock Image Transfer from one buffer to another.
542  *
543  * It's a stupidly simple loop that copies pixels.
544  *
545  * \param orig  Start of the originating buffer.
546  * \param dst  Start of the destination buffer.
547  * \param width  Width of the copied region.
548  * \param height  Height of the copied region.
549  * \param orig_stride  Width of a row in the originating buffer.
550  * \param dst_stride  Width of a row in the destination buffer.
551  *
552  * This should be inlined, but it's defined here for now to see if Visual
553  * Studios LTCG will inline it.
554  */
555 #define BLIT_PIXELS_CASE(n) case n:\
556   for (y = 0; y < n; ++y) {\
557     memcpy(&dst[y*dst_stride], &orig[y*orig_stride], n * sizeof(kvz_pixel));\
558   }\
559   break;
560 
kvz_pixels_blit(const kvz_pixel * const orig,kvz_pixel * const dst,const unsigned width,const unsigned height,const unsigned orig_stride,const unsigned dst_stride)561 void kvz_pixels_blit(const kvz_pixel * const orig, kvz_pixel * const dst,
562                          const unsigned width, const unsigned height,
563                          const unsigned orig_stride, const unsigned dst_stride)
564 {
565   unsigned y;
566   //There is absolutely no reason to have a width greater than the source or the destination stride.
567   assert(width <= orig_stride);
568   assert(width <= dst_stride);
569 
570 #ifdef CHECKPOINTS
571   char *buffer = malloc((3 * width + 1) * sizeof(char));
572   for (y = 0; y < height; ++y) {
573     int p;
574     for (p = 0; p < width; ++p) {
575       sprintf((buffer + 3*p), "%02X ", orig[y*orig_stride]);
576     }
577     buffer[3*width] = 0;
578     CHECKPOINT("kvz_pixels_blit_avx2: %04d: %s", y, buffer);
579   }
580   FREE_POINTER(buffer);
581 #endif //CHECKPOINTS
582 
583   if (width == orig_stride && width == dst_stride) {
584     memcpy(dst, orig, width * height * sizeof(kvz_pixel));
585     return;
586   }
587 
588   int nxn_width = (width == height) ? width : 0;
589   switch (nxn_width) {
590     BLIT_PIXELS_CASE(4)
591     BLIT_PIXELS_CASE(8)
592     BLIT_PIXELS_CASE(16)
593     BLIT_PIXELS_CASE(32)
594     BLIT_PIXELS_CASE(64)
595   default:
596 
597     if (orig == dst) {
598       //If we have the same array, then we should have the same stride
599       assert(orig_stride == dst_stride);
600       return;
601     }
602     assert(orig != dst || orig_stride == dst_stride);
603 
604     for (y = 0; y < height; ++y) {
605       memcpy(&dst[y*dst_stride], &orig[y*orig_stride], width * sizeof(kvz_pixel));
606     }
607     break;
608   }
609 }
610