1 
2 // Autogenerated by WaveGen.py, do not edit! //
3 #include <algorithm>
4 #include <cassert>
5 #include "common.h"
6 #include "wavelet_common.h"
7 
8 /// Boundaries (depends on wavelet)
9 /// This much is reserved at the sides of the signal
10 /// Must be even!
11 #define BLEFT 4
12 #define BRIGHT 4
13 
14 /// Initial shift (to keep precision in integer wavelets)
15 #define INITIAL_SHIFT 0
16 #define INITIAL_OFFSET 0
17 
18 //  static const int16_t stage1_weights[] = { -8, 21, -46, 161, 161, -46, 21, -8 };
19 //  static const int16_t stage2_weights[] = { 2, -10, 25, -81, -81, 25, -10, 2 };
20 
21 #define STAGE1_OFFSET 128
22 #define STAGE1_SHIFT 8
23 #define STAGE1_COEFF0 (-8)
24 #define STAGE1_COEFF1 (21)
25 #define STAGE1_COEFF2 (-46)
26 #define STAGE1_COEFF3 (161)
27 #define STAGE1_COEFF4 (161)
28 #define STAGE1_COEFF5 (-46)
29 #define STAGE1_COEFF6 (21)
30 #define STAGE1_COEFF7 (-8)
31 
32 #define STAGE2_OFFSET 127
33 #define STAGE2_SHIFT 8
34 #define STAGE2_COEFF0 (2)
35 #define STAGE2_COEFF1 (-10)
36 #define STAGE2_COEFF2 (25)
37 #define STAGE2_COEFF3 (-81)
38 #define STAGE2_COEFF4 (-81)
39 #define STAGE2_COEFF5 (25)
40 #define STAGE2_COEFF6 (-10)
41 #define STAGE2_COEFF7 (2)
42 
43 /// Vertical pass row management
44 #define RLEFT 6
45 #define RRIGHT 6
46 #define COPYROWS 8
47 
/// Horizontal lifting pass over one image row (two-stage 8-tap lifting).
/// Grid: one thread block per row (blockIdx.x = row), BSH threads per block.
/// Dynamic shared memory: (width + 2*BLEFT + 2*BRIGHT) DATATYPE elements.
/// The row is deinterleaved into an even (low) half and an odd (high) half,
/// both lifting stages run in shared memory, and the result is written back
/// with the two halves stored consecutively (NOT re-interleaved).
/// NOTE(review): pairs are read/written as i16_2 32-bit words, so width is
/// assumed even and rows 4-byte aligned -- confirm with callers.
static __global__ void a_transform_h( DATATYPE* data, int width, int stride )
{
    extern __shared__ DATATYPE shared[];

    const int bid = blockIdx.x;    // row
    const int tid = threadIdx.x;   // thread id within row
    // Bit-permuted thread id used by the lifting loops: within each group of
    // 32 threads, bit 4 is moved down to bit 0 and bits 0..3 shift up by one.
    // NOTE(review): presumably arranged to reduce shared-memory bank
    // conflicts -- confirm.
    const int tidu16 = ((tid&16)>>4)|((tid&15)<<1)|(tid&~31);

    data += __mul24(bid, stride);   // point at this block's row

    // Load entire line into shared memory, deinterleaving right here:
    // even samples to shared[BLEFT..], odd samples to shared[half+BLEFT..].
    // 'half' is the offset between the two halves, including boundary pad.
    int half = BLEFT+(width>>1)+BRIGHT;

    unsigned int ofs;

    // Read (even,odd) pairs with single 32-bit loads.
    i16_2 *row = (i16_2*)data;
    for(ofs = tid; ofs < (width>>1); ofs += BSH)
    {
        i16_2 val = row[ofs];
        shared[BLEFT + ofs]        = val.a << INITIAL_SHIFT; // even
        shared[BLEFT + ofs + half] = val.b << INITIAL_SHIFT; // odd
    }
    __syncthreads();

    // Replicate edge samples of the odd half so stage 1 can read up to
    // 4 taps past either end of the signal:
    if(tidu16<4)
    {
        /*
          lo[-4] = lo[0];
          lo[-3] = lo[0];
          lo[-2] = lo[0];
          lo[-1] = lo[0];
          lo[n]   = lo[n-1];
          lo[n+1] = lo[n-1];
          lo[n+2] = lo[n-1];
          lo[n+3] = lo[n-1];
        */
        shared[half+BLEFT-4+tidu16] = shared[half+BLEFT];
        shared[half+BLEFT+(width>>1)+tidu16] = shared[half+BLEFT+(width>>1)-1];
    }
    __syncthreads();
    // Now apply wavelet lifting to the entire line at once.
    // Stage 1: update each even sample from the 8 neighbouring odd samples
    // (taps at odd offsets -4..+3 relative to the even position).
    const int end = BLEFT+(width>>1);
    for(ofs = BLEFT+tidu16; ofs < end; ofs += BSH)
    {
        int acc = STAGE1_OFFSET;   // rounding offset applied before the shift

        acc += __mul24(STAGE1_COEFF0,shared[half+ofs-4]);
        acc += __mul24(STAGE1_COEFF1,shared[half+ofs-3]);
        acc += __mul24(STAGE1_COEFF2,shared[half+ofs-2]);
        acc += __mul24(STAGE1_COEFF3,shared[half+ofs-1]);
        acc += __mul24(STAGE1_COEFF4,shared[half+ofs+0]);
        acc += __mul24(STAGE1_COEFF5,shared[half+ofs+1]);
        acc += __mul24(STAGE1_COEFF6,shared[half+ofs+2]);
        acc += __mul24(STAGE1_COEFF7,shared[half+ofs+3]);

        shared[ofs] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();
    // Replicate edge samples of the (now updated) even half for stage 2.
    if(tidu16<4)
    {
        /*
          hi[-4] = hi[0];
          hi[-3] = hi[0];
          hi[-2] = hi[0];
          hi[-1] = hi[0];
          hi[n]   = hi[n-1];
          hi[n+1] = hi[n-1];
          hi[n+2] = hi[n-1];
          hi[n+3] = hi[n-1];
        */
        shared[BLEFT-4+tidu16] = shared[BLEFT];
        shared[BLEFT+(width>>1)+tidu16] = shared[BLEFT+(width>>1)-1];
    }
    __syncthreads();

    // Stage 2: update each odd sample from the 8 neighbouring even samples
    // (taps at offsets -3..+4 relative to the odd position).
    for(ofs = BLEFT+tidu16; ofs < end; ofs += BSH)
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[ofs-3]);
        acc += __mul24(STAGE2_COEFF1, shared[ofs-2]);
        acc += __mul24(STAGE2_COEFF2, shared[ofs-1]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs-0]);
        acc += __mul24(STAGE2_COEFF4, shared[ofs+1]);
        acc += __mul24(STAGE2_COEFF5, shared[ofs+2]);
        acc += __mul24(STAGE2_COEFF6, shared[ofs+3]);
        acc += __mul24(STAGE2_COEFF7, shared[ofs+4]);

        shared[ofs + half] += acc >> STAGE2_SHIFT;
    }

    __syncthreads();


    /// Write line back to global memory, don't interleave again.
    /// Mind the gap between BLEFT+width/2 and half (the BRIGHT padding).
    if(width&3) // Width not a multiple of 4: fall back to 16-bit stores
    {
        /// Left part (even coefficients)
        for(ofs = tid; ofs < (width>>1); ofs += BSH)
            data[ofs] = shared[BLEFT+ofs];

        /// Right part (odd coefficients)
        for(ofs = tid; ofs < (width>>1); ofs += BSH)
            data[(width>>1)+ofs] = shared[half+BLEFT+ofs];
    }
    else
    {
        /// Fast path: 32-bit stores, two coefficients at a time.
        /// Left part (even coefficients)
        for(ofs = tid; ofs < (width>>2); ofs += BSH)
            row[ofs] = *((i16_2*)&shared[BLEFT+(ofs<<1)]);
        row += (width>>2);
        /// Right part (odd coefficients)
        for(ofs = tid; ofs < (width>>2); ofs += BSH)
            row[ofs] = *((i16_2*)&shared[half+BLEFT+(ofs<<1)]);
    }
}
168 
169 #define BROWS (2*BSVY+COPYROWS) /* Rows to process at once */
170 #define SKIPTOP COPYROWS
171 
172 #define PAD_ROWS (WRITEBACK-SKIPTOP+RRIGHT+COPYROWS) /* Rows below which to use s_transform_v_pad */
173 /// tid is BCOLSxBROWS matrix
174 /// RLEFT+BROWS+RRIGHT rows
175 #define TOTALROWS (RLEFT+BROWS+RRIGHT)
176 #define OVERLAP (RLEFT+RRIGHT+COPYROWS)
177 #define OVERLAP_OFFSET (TOTALROWS-OVERLAP)
178 #define WRITEBACK (2*BSVY)
179 
/// Vertical lifting for one full interior block of rows resident in shared
/// memory (no edge clamping -- taps up to +-7 rows must be valid).
/// xofs selects even (0) or odd (1) columns; tidx is the column this thread
/// works on. Rows are stored with a stride of BCOLS (1<<BCOLS_SHIFT).
/// Caller must __syncthreads() before and after; there is one internal
/// barrier between the two phases.
__device__ void doTransform(int xofs)
{
    const int tidx = (threadIdx.x<<1)+xofs;   // column
    const int tidy = threadIdx.y;   // row

    extern __shared__ DATATYPE shared[];
    int ofs;

    // Even-row target, shifted 8 rows down so that the -7..+7 row taps of
    // phase 1 stay inside the buffer.
    ofs = ((RLEFT+(tidy<<1)+8)<<BCOLS_SHIFT) + tidx;

    /* Phase 1 (even) at +8*BCOLS: update the even row from the odd rows at
       row offsets -7,-5,-3,-1,+1,+3,+5,+7. */
    {
        int acc = STAGE1_OFFSET;   // rounding offset applied before the shift

        acc += __mul24(STAGE1_COEFF0, shared[ofs-7*BCOLS]);
        acc += __mul24(STAGE1_COEFF1, shared[ofs-5*BCOLS]);
        acc += __mul24(STAGE1_COEFF2, shared[ofs-3*BCOLS]);
        acc += __mul24(STAGE1_COEFF3, shared[ofs-1*BCOLS]);
        acc += __mul24(STAGE1_COEFF4, shared[ofs+1*BCOLS]);
        acc += __mul24(STAGE1_COEFF5, shared[ofs+3*BCOLS]);
        acc += __mul24(STAGE1_COEFF6, shared[ofs+5*BCOLS]);
        acc += __mul24(STAGE1_COEFF7, shared[ofs+7*BCOLS]);

        shared[ofs] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();

    /* Phase 2 (odd) at +1*BCOLS: move up 7 rows to the odd row and update it
       from the surrounding (already phase-1-updated) even rows. */
    ofs -= 7*BCOLS;
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[ofs-7*BCOLS]);
        acc += __mul24(STAGE2_COEFF1, shared[ofs-5*BCOLS]);
        acc += __mul24(STAGE2_COEFF2, shared[ofs-3*BCOLS]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs-1*BCOLS]);
        acc += __mul24(STAGE2_COEFF4, shared[ofs+1*BCOLS]);
        acc += __mul24(STAGE2_COEFF5, shared[ofs+3*BCOLS]);
        acc += __mul24(STAGE2_COEFF6, shared[ofs+5*BCOLS]);
        acc += __mul24(STAGE2_COEFF7, shared[ofs+7*BCOLS]);

        shared[ofs] += acc >> STAGE2_SHIFT;
    }

}
226 
227 // Process 16 lines
// Process 16 lines.
/// Vertical lifting with clamping at BOTH the top and the bottom edge --
/// used when the whole (short) image fits in shared memory at once
/// (see a_transform_v_pad). Row taps that would fall outside
/// [minn, maxx] are clamped to the edge rows, mirroring the sample
/// replication done in the horizontal pass.
/// 'leftover' is the shared-memory offset (in elements) one past the last
/// valid row; xofs selects even (0) or odd (1) columns.
__device__ void doTransformTB(int xofs, unsigned int leftover)
{
    const int tidx = (threadIdx.x<<1)+xofs;   // column
    const int tidy = threadIdx.y;   // row
    // First valid row for this column (top clamp target).
    const int minn = (RLEFT<<BCOLS_SHIFT) + tidx;
    // Last valid odd row for this column (bottom clamp target).
    const int maxx = leftover-(2<<BCOLS_SHIFT) + tidx;

    extern __shared__ DATATYPE shared[];
    int ofs, ofs_t;

    /* Phase 1 (even): update even rows from odd rows; odd taps are clamped
       to the first/last odd row (minn+BCOLS / maxx+BCOLS). */
    ofs = ((RLEFT+(tidy<<1))<<BCOLS_SHIFT) + tidx;

    for(ofs_t=ofs+0*BCOLS; ofs_t<leftover; ofs_t += BSVY*2*BCOLS)
    {
        int acc = STAGE1_OFFSET;   // rounding offset applied before the shift

        acc += __mul24(STAGE1_COEFF0, shared[max(ofs_t-7*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF1, shared[max(ofs_t-5*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF2, shared[max(ofs_t-3*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF3, shared[max(ofs_t-1*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF4, shared[ofs_t+1*BCOLS]);
        acc += __mul24(STAGE1_COEFF5, shared[min(ofs_t+3*BCOLS, maxx+BCOLS)]);
        acc += __mul24(STAGE1_COEFF6, shared[min(ofs_t+5*BCOLS, maxx+BCOLS)]);
        acc += __mul24(STAGE1_COEFF7, shared[min(ofs_t+7*BCOLS, maxx+BCOLS)]);

        shared[ofs_t] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();

    /* Phase 2 (odd): update odd rows from the (phase-1-updated) even rows;
       even taps are clamped to the first/last even row (minn / maxx). */
    for(ofs_t=ofs+1*BCOLS; ofs_t<leftover; ofs_t += BSVY*2*BCOLS)
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[max(ofs_t-7*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF1, shared[max(ofs_t-5*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF2, shared[max(ofs_t-3*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs_t-1*BCOLS]);
        acc += __mul24(STAGE2_COEFF4, shared[min(ofs_t+1*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF5, shared[min(ofs_t+3*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF6, shared[min(ofs_t+5*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF7, shared[min(ofs_t+7*BCOLS, maxx)]);

        shared[ofs_t] += acc >> STAGE2_SHIFT;
    }

}
277 
278 // Process BROWS-2 lines
// Process BROWS-2 lines.
/// Vertical lifting for the FIRST block of the streamed vertical pass:
/// clamps taps only at the top edge (minn); downward taps are assumed valid
/// because extra rows were read ahead. Phase 2 skips the last 4 row pairs
/// (tidy >= BSVY-4), which are finished by a later block.
/// xofs selects even (0) or odd (1) columns.
__device__ void doTransformT(int xofs)
{
    const int tidx = (threadIdx.x<<1)+xofs;   // column
    const int tidy = threadIdx.y;   // row
    // First valid row for this column (top clamp target).
    const int minn = ((RLEFT+SKIPTOP)<<BCOLS_SHIFT) + tidx;

    extern __shared__ DATATYPE shared[];
    int ofs;

    /* Phase 1 (even), offset +0: update the even row from odd rows;
       upward odd taps clamp to the first odd row (minn+BCOLS). */
    ofs = ((SKIPTOP+RLEFT+(tidy<<1))<<BCOLS_SHIFT) + tidx;
    {
        int acc = STAGE1_OFFSET;   // rounding offset applied before the shift

        acc += __mul24(STAGE1_COEFF0, shared[max(ofs-7*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF1, shared[max(ofs-5*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF2, shared[max(ofs-3*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF3, shared[max(ofs-1*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF4, shared[ofs+1*BCOLS]);
        acc += __mul24(STAGE1_COEFF5, shared[ofs+3*BCOLS]);
        acc += __mul24(STAGE1_COEFF6, shared[ofs+5*BCOLS]);
        acc += __mul24(STAGE1_COEFF7, shared[ofs+7*BCOLS]);

        shared[ofs] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();

    /* Phase 2 (odd), offset +1, stop at -8: update the odd row from the
       (phase-1-updated) even rows; upward even taps clamp to minn. */
    ofs += BCOLS;
    if(tidy<(BSVY-4))
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[max(ofs-7*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF1, shared[max(ofs-5*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF2, shared[max(ofs-3*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs-1*BCOLS]);
        acc += __mul24(STAGE2_COEFF4, shared[ofs+1*BCOLS]);
        acc += __mul24(STAGE2_COEFF5, shared[ofs+3*BCOLS]);
        acc += __mul24(STAGE2_COEFF6, shared[ofs+5*BCOLS]);
        acc += __mul24(STAGE2_COEFF7, shared[ofs+7*BCOLS]);

        shared[ofs] += acc >> STAGE2_SHIFT;
    }

}
326 
327 // Finish off leftover
// Finish off leftover.
/// Vertical lifting for the LAST (partial) block of the streamed vertical
/// pass: clamps taps only at the bottom edge (maxx); upward taps read rows
/// carried over from the previous block. Phase 1 starts 8 rows down (the
/// upper rows were already phase-1-processed by the previous block).
/// 'leftover' is the shared-memory offset one past the last valid row;
/// xofs selects even (0) or odd (1) columns.
__device__ void doTransformB(int xofs, unsigned int leftover)
{
    const int tidx = (threadIdx.x<<1)+xofs;   // column
    const int tidy = threadIdx.y;   // row
    // Last valid odd row for this column (bottom clamp target).
    const int maxx = leftover-(2<<BCOLS_SHIFT) + tidx;

    extern __shared__ DATATYPE shared[];
    int ofs, ofs_t;

    ofs = ((RLEFT+(tidy<<1))<<BCOLS_SHIFT) + tidx;

    /* Phase 1 (even), starting at +8*BCOLS: downward odd taps clamp to the
       last odd row (maxx+BCOLS). */
    for(ofs_t=ofs+8*BCOLS; ofs_t<leftover; ofs_t += BSVY*2*BCOLS)
    {
        int acc = STAGE1_OFFSET;   // rounding offset applied before the shift

        acc += __mul24(STAGE1_COEFF0, shared[ofs_t-7*BCOLS]);
        acc += __mul24(STAGE1_COEFF1, shared[ofs_t-5*BCOLS]);
        acc += __mul24(STAGE1_COEFF2, shared[ofs_t-3*BCOLS]);
        acc += __mul24(STAGE1_COEFF3, shared[ofs_t-1*BCOLS]);
        acc += __mul24(STAGE1_COEFF4, shared[ofs_t+1*BCOLS]);
        acc += __mul24(STAGE1_COEFF5, shared[min(ofs_t+3*BCOLS, maxx+BCOLS)]);
        acc += __mul24(STAGE1_COEFF6, shared[min(ofs_t+5*BCOLS, maxx+BCOLS)]);
        acc += __mul24(STAGE1_COEFF7, shared[min(ofs_t+7*BCOLS, maxx+BCOLS)]);

        shared[ofs_t] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();

    /* Phase 2 (odd), starting at +1*BCOLS: downward even taps clamp to the
       last even row (maxx). */
    for(ofs_t=ofs+1*BCOLS; ofs_t<leftover; ofs_t += BSVY*2*BCOLS)
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[ofs_t-7*BCOLS]);
        acc += __mul24(STAGE2_COEFF1, shared[ofs_t-5*BCOLS]);
        acc += __mul24(STAGE2_COEFF2, shared[ofs_t-3*BCOLS]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs_t-1*BCOLS]);
        acc += __mul24(STAGE2_COEFF4, shared[min(ofs_t+1*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF5, shared[min(ofs_t+3*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF6, shared[min(ofs_t+5*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF7, shared[min(ofs_t+7*BCOLS, maxx)]);

        shared[ofs_t] += acc >> STAGE2_SHIFT;
    }

}
374 
375 
376 
/// Vertical lifting pass, streamed: each block owns a slab of BCOLS columns
/// (blockIdx.x = slab) and slides a TOTALROWS-row window down the image in
/// shared memory, writing back WRITEBACK finished rows per step.
/// Grid/block: BSVX x BSVY threads; each thread handles two adjacent columns
/// (tidx = threadIdx.x*2) so global loads/stores are 32-bit.
/// Dynamic shared memory: BCOLS * TOTALROWS DATATYPE elements.
/// Requires height >= PAD_ROWS (shorter images go to a_transform_v_pad).
static __global__ void a_transform_v( DATATYPE* data, int width, int height, int stride )
{
    extern __shared__ DATATYPE shared[];

    const unsigned int bid = blockIdx.x;    // slab (BCOLS columns)
    const unsigned int tidx = threadIdx.x<<1;   // column
    const unsigned int tidy = threadIdx.y;   // row
    const unsigned int swidth = min(width-(bid<<BCOLS_SHIFT), BCOLS); // Width of this slab, usually BCOLS but can be less

    // Element offset in global memory
    //int idata = tidx + (bid<<BCOLS_SHIFT) + __mul24(tidy, stride);
    data += tidx + (bid<<BCOLS_SHIFT) + __mul24(tidy, stride);

    const unsigned int istride = __mul24(BSVY, stride);   // rows advanced per loop step
    const unsigned int sdata = tidx + (tidy<<BCOLS_SHIFT); // this thread's base shared offset
    // First read BROWS+RRIGHT rows, after that BROWS per iteration.
    // ref = rows handled outside the full-block loop; blocks = full steps;
    // leftover = shared-memory fill level (in elements) for the final
    // partial block.
    unsigned int ref = height-(WRITEBACK-SKIPTOP+RRIGHT+COPYROWS);
    unsigned int blocks = ref/WRITEBACK;
    unsigned int leftover = (RLEFT+COPYROWS+RRIGHT+(ref%WRITEBACK))<<BCOLS_SHIFT;

    unsigned int gofs,sofs;

    /// More than one block:
    /// Read first block of BROWS+RRIGHT rows.
    /// Upper RLEFT rows are left uninitialized for now; doTransformT clamps
    /// its upward taps so they are never read.
    if(tidx < swidth)   // guard: partial slab at the right image edge
    {
        gofs = 0;
        sofs = sdata + ((RLEFT+SKIPTOP)<<BCOLS_SHIFT);
        for(; sofs < (TOTALROWS<<BCOLS_SHIFT); sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&shared[sofs]) = *((uint32_t*)&data[gofs]);
    }
// idata_read = idata_write + __mul24(TOTALROWS-RLEFT, stride)

    __syncthreads();

    // Top-edge transform over even then odd columns.
    doTransformT(0);
    doTransformT(1);

    __syncthreads();

    /// Write back WRITEBACK rows
    if(tidx < swidth)
    {
        gofs = 0;
        sofs = sdata + ((RLEFT+SKIPTOP)<<BCOLS_SHIFT);
        for(; sofs < ((WRITEBACK+RLEFT)<<BCOLS_SHIFT); sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&data[gofs]) = *((uint32_t*)&shared[sofs]);
    }
// idata_read = idata_write + __mul24((BROWS+RRIGHT)-WRITEBACK, stride)

// Difference between global mem read and write pointer
#define DATA_READ_DIFF __mul24((BROWS+RRIGHT)-WRITEBACK, stride)
// Advance pointer with this amount after each block
#define DATA_INC __mul24(WRITEBACK, stride)

    data += __mul24(WRITEBACK-SKIPTOP, stride);
    // Steady state: slide the window one WRITEBACK step per iteration.
    for(unsigned int block=0; block<blocks; ++block)
    {
        __syncthreads();
        /// Move lower OVERLAP rows to the top of the window (rows whose
        /// lifting is not yet complete, plus boundary context).
#if OVERLAP <= BSVY
        if(tidy < OVERLAP)
        {
            unsigned int l = (tidy<<BCOLS_SHIFT)+tidx;
            *((uint32_t*)&shared[l]) = *((uint32_t*)&shared[(WRITEBACK<<BCOLS_SHIFT)+l]);
        }
#else
        for(sofs = (tidy<<BCOLS_SHIFT)+tidx; sofs < (OVERLAP<<BCOLS_SHIFT); sofs += (BSVY<<BCOLS_SHIFT))
            *((uint32_t*)&shared[sofs]) = *((uint32_t*)&shared[(WRITEBACK<<BCOLS_SHIFT)+sofs]);
#endif

        /// Fill shared memory -- read next block of BROWS rows.
        /// We can skip RRIGHT rows as we've already copied them for the
        /// previous block and moved them to the top.
        if(tidx < swidth)
        {
            gofs = DATA_READ_DIFF;
            sofs = sdata + (OVERLAP<<BCOLS_SHIFT);
            for(; sofs < (TOTALROWS<<BCOLS_SHIFT); sofs += (BCOLS*BSVY), gofs += istride)
                *((uint32_t*)&shared[sofs]) = *((uint32_t*)&data[gofs]);
        }

        __syncthreads();

        doTransform(0);
        doTransform(1);

        __syncthreads();

        /// Write back BROWS rows
        if(tidx < swidth)
        {
            gofs = 0;
            sofs = sdata + (RLEFT<<BCOLS_SHIFT);
            for(; sofs < ((WRITEBACK+RLEFT)<<BCOLS_SHIFT); sofs += (BCOLS*BSVY), gofs += istride)
                *((uint32_t*)&data[gofs]) = *((uint32_t*)&shared[sofs]);
        }

        data += DATA_INC;
    }
    __syncthreads();

    ///
    /// Handle partial last block.
    /// Move lower rows to top rows (same overlap copy as in the loop).
#if OVERLAP <= BSVY
        if(tidy < OVERLAP)
        {
            unsigned int l = (tidy<<BCOLS_SHIFT)+tidx;
            *((uint32_t*)&shared[l]) = *((uint32_t*)&shared[(WRITEBACK<<BCOLS_SHIFT)+l]);
        }
#else
        for(sofs = (tidy<<BCOLS_SHIFT)+tidx; sofs < (OVERLAP<<BCOLS_SHIFT); sofs += (BSVY<<BCOLS_SHIFT))
            *((uint32_t*)&shared[sofs]) = *((uint32_t*)&shared[(WRITEBACK<<BCOLS_SHIFT)+sofs]);
#endif

    /// Fill shared memory -- read the remaining rows (up to 'leftover').
    /// We can skip RRIGHT rows as we've already copied them for the previous
    /// block and moved them to the top.
    if(tidx < swidth)
    {
        gofs = DATA_READ_DIFF;
        sofs = sdata + (OVERLAP<<BCOLS_SHIFT);
        for(; sofs < leftover; sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&shared[sofs]) = *((uint32_t*)&data[gofs]);
    }

    __syncthreads();

    // Bottom-edge transform (clamps downward taps).
    doTransformB(0, leftover);
    doTransformB(1, leftover);

    __syncthreads();

    /// Write back leftover rows
    if(tidx < swidth)
    {
        gofs = 0;
        sofs = sdata + (RLEFT<<BCOLS_SHIFT);
        for(; sofs < leftover; sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&data[gofs]) = *((uint32_t*)&shared[sofs]);
    }

}
523 
524 /// Use this if the image is lower than PAD_ROWS
/// Vertical lifting pass for short images (height < PAD_ROWS): the whole
/// column slab fits in shared memory at once, so a single load, one
/// top-and-bottom-clamped transform (doTransformTB), and one store suffice.
/// Grid/block layout and shared-memory requirement match a_transform_v.
static __global__ void a_transform_v_pad( DATATYPE* data, int width, int height, int stride )
{
    extern __shared__ DATATYPE shared[];

    const unsigned int bid = blockIdx.x;    // slab (BCOLS columns)
    const unsigned int tidx = threadIdx.x<<1;   // column
    const unsigned int tidy = threadIdx.y;   // row
    const unsigned int swidth = min(width-(bid<<BCOLS_SHIFT), BCOLS); // Width of this slab, usually BCOLS but can be less

    // Element offset in global memory
    //int idata = tidx + (bid<<BCOLS_SHIFT) + __mul24(tidy, stride);
    data +=  tidx + (bid<<BCOLS_SHIFT) + __mul24(tidy, stride);
    const unsigned int istride = __mul24(BSVY, stride);   // rows advanced per loop step
    // Base shared offset, skipping the RLEFT guard rows at the top.
    const unsigned int sdata = tidx + ((tidy+RLEFT)<<BCOLS_SHIFT); // Does this get converted into a shift?
    unsigned int leftover = (RLEFT+height) << BCOLS_SHIFT; /// How far to fill buffer on last read
    //unsigned int blocks = (height-RRIGHT)/BROWS;

    unsigned int gofs, sofs;

    /// Load the entire slab (all 'height' rows) into shared memory.
    if(tidx < swidth)   // guard: partial slab at the right image edge
    {
        gofs = 0;
        sofs = sdata;
        for(; sofs < leftover; sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&shared[sofs]) = *((uint32_t*)&data[gofs]);
    }

    __syncthreads();

    // Transform even then odd columns, clamping at both edges.
    doTransformTB(0, leftover);
    doTransformTB(1, leftover);

    __syncthreads();

    /// Write back all rows.
    if(tidx < swidth)
    {
        gofs = 0;
        sofs = sdata;
        for(; sofs < leftover; sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&data[gofs]) = *((uint32_t*)&shared[sofs]);
    }
}
573 
/// Host entry point for the "fidelity" wavelet lifting transform.
///
/// @param d_data  device pointer to the 16-bit coefficient plane
/// @param lwidth  width in samples of the level being transformed
/// @param lheight height in rows of the level being transformed
/// @param stride  row stride in samples of d_data
/// @param stream  CUDA stream the kernels are enqueued on (asynchronous)
///
/// Depending on compile-time flags this launches the horizontal pass
/// (HORIZONTAL: one block per row) and/or the vertical pass (VERTICAL: one
/// block per slab of BCOLS columns, padded variant for short images).
void cuda_iwt_fidelity(int16_t *d_data, int lwidth, int lheight, int stride, cudaStream_t stream)
{
    /** Invoke kernel */
    dim3 block_size;
    dim3 grid_size;
    int shared_size;

#ifdef HORIZONTAL
    block_size.x = BSH;
    block_size.y = 1;
    block_size.z = 1;
    grid_size.x = lheight;   // one block per row
    grid_size.y = 1;
    grid_size.z = 1;
    // One full line plus boundary padding on both the low and high halves.
    shared_size = (lwidth+BLEFT*2+BRIGHT*2) * sizeof(DATATYPE);
    a_transform_h<<<grid_size, block_size, shared_size, stream>>>(d_data, lwidth, stride);
#endif
#ifdef VERTICAL
    block_size.x = BSVX;
    block_size.y = BSVY;
    block_size.z = 1;
    grid_size.x = (lwidth+BCOLS-1)/BCOLS;   // one block per slab of BCOLS columns
    grid_size.y = 1;
    grid_size.z = 1;
    // Window of TOTALROWS (== RLEFT+BROWS+RRIGHT) rows of BCOLS columns.
    // Was hard-coded "* 2" for the element size, which silently breaks if
    // DATATYPE is ever not 16-bit; use sizeof(DATATYPE) like the horizontal
    // path does.
    shared_size = BCOLS*TOTALROWS*sizeof(DATATYPE);

    if(lheight < PAD_ROWS)
        a_transform_v_pad<<<grid_size, block_size, shared_size, stream>>>(d_data, lwidth, lheight, stride);
    else
        a_transform_v<<<grid_size, block_size, shared_size, stream>>>(d_data, lwidth, lheight, stride);
#endif
}
606