// Autogenerated by WaveGen.py, do not edit! //
#include <algorithm>
#include <cassert>
#include "common.h"
#include "wavelet_common.h"

/// Boundaries (depends on wavelet)
/// This much is reserved at the sides of the signal
/// Must be even!
#define BLEFT 4
#define BRIGHT 4

/// Initial shift (to keep precision in integer wavelets)
#define INITIAL_SHIFT 0
#define INITIAL_OFFSET 0

/// Lifting filter taps in fixed point (scaled by 2^STAGE*_SHIFT):
// static const int16_t stage1_weights[] = { -8, 21, -46, 161, 161, -46, 21, -8 };
// static const int16_t stage2_weights[] = { 2, -10, 25, -81, -81, 25, -10, 2 };

/// Stage 1 (updates even/low samples): rounding offset, shift and taps
#define STAGE1_OFFSET 128
#define STAGE1_SHIFT 8
#define STAGE1_COEFF0 (-8)
#define STAGE1_COEFF1 (21)
#define STAGE1_COEFF2 (-46)
#define STAGE1_COEFF3 (161)
#define STAGE1_COEFF4 (161)
#define STAGE1_COEFF5 (-46)
#define STAGE1_COEFF6 (21)
#define STAGE1_COEFF7 (-8)

/// Stage 2 (updates odd/high samples): rounding offset, shift and taps
#define STAGE2_OFFSET 127
#define STAGE2_SHIFT 8
#define STAGE2_COEFF0 (2)
#define STAGE2_COEFF1 (-10)
#define STAGE2_COEFF2 (25)
#define STAGE2_COEFF3 (-81)
#define STAGE2_COEFF4 (-81)
#define STAGE2_COEFF5 (25)
#define STAGE2_COEFF6 (-10)
#define STAGE2_COEFF7 (2)

/// Vertical pass row management
#define RLEFT 6
#define RRIGHT 6
#define COPYROWS 8
47
/// Horizontal inverse-wavelet lifting pass (fidelity filter).
/// Launch: one thread block per image row (blockIdx.x = row), BSH threads.
/// Dynamic shared memory: 2*(BLEFT + width/2 + BRIGHT) DATATYPE elements —
/// the line is deinterleaved into a low half and a high half, each padded
/// with BLEFT/BRIGHT border samples.
static __global__ void a_transform_h( DATATYPE* data, int width, int stride )
{
    extern __shared__ DATATYPE shared[];

    const int bid = blockIdx.x; // row
    const int tid = threadIdx.x; // thread id within row
    // Permuted thread id within each 32-thread group: swaps bit 4 down and
    // bits 0..3 up. Presumably reduces shared-memory bank conflicts in the
    // lifting loops — TODO confirm.
    const int tidu16 = ((tid&16)>>4)|((tid&15)<<1)|(tid&~31);

    data += __mul24(bid, stride);

    // Load entire line into shared memory.
    // Deinterleave right here: val.a (even) to the first half,
    // val.b (odd) to the second half, each offset by BLEFT.
    int half = BLEFT+(width>>1)+BRIGHT;

    unsigned int ofs;

    // Shared memory output offset for this thread
    i16_2 *row = (i16_2*)data;
    for(ofs = tid; ofs < (width>>1); ofs += BSH)
    {
        i16_2 val = row[ofs];
        shared[BLEFT + ofs] = val.a << INITIAL_SHIFT; // even
        shared[BLEFT + ofs + half] = val.b << INITIAL_SHIFT; // odd
    }
    __syncthreads();

    if(tidu16<4)
    {
        // Replicate border samples of the second (odd) half:
        //   lo[-4] = lo[-3] = lo[-2] = lo[-1] = lo[0];
        //   lo[n] = lo[n+1] = lo[n+2] = lo[n+3] = lo[n-1];
        shared[half+BLEFT-4+tidu16] = shared[half+BLEFT];
        shared[half+BLEFT+(width>>1)+tidu16] = shared[half+BLEFT+(width>>1)-1];
    }
    __syncthreads();
    // Now apply wavelet lifting to entire line at once.
    // Stage 1: update even samples from the odd half (8-tap filter).
    const int end = BLEFT+(width>>1);
    for(ofs = BLEFT+tidu16; ofs < end; ofs += BSH)
    {
        int acc = STAGE1_OFFSET;

        acc += __mul24(STAGE1_COEFF0,shared[half+ofs-4]);
        acc += __mul24(STAGE1_COEFF1,shared[half+ofs-3]);
        acc += __mul24(STAGE1_COEFF2,shared[half+ofs-2]);
        acc += __mul24(STAGE1_COEFF3,shared[half+ofs-1]);
        acc += __mul24(STAGE1_COEFF4,shared[half+ofs+0]);
        acc += __mul24(STAGE1_COEFF5,shared[half+ofs+1]);
        acc += __mul24(STAGE1_COEFF6,shared[half+ofs+2]);
        acc += __mul24(STAGE1_COEFF7,shared[half+ofs+3]);

        shared[ofs] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();
    if(tidu16<4)
    {
        // Replicate border samples of the first (even) half:
        //   hi[-4] = hi[-3] = hi[-2] = hi[-1] = hi[0];
        //   hi[n] = hi[n+1] = hi[n+2] = hi[n+3] = hi[n-1];
        shared[BLEFT-4+tidu16] = shared[BLEFT];
        shared[BLEFT+(width>>1)+tidu16] = shared[BLEFT+(width>>1)-1];
    }
    __syncthreads();

    // Stage 2: update odd samples from the (already updated) even half.
    for(ofs = BLEFT+tidu16; ofs < end; ofs += BSH)
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[ofs-3]);
        acc += __mul24(STAGE2_COEFF1, shared[ofs-2]);
        acc += __mul24(STAGE2_COEFF2, shared[ofs-1]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs-0]);
        acc += __mul24(STAGE2_COEFF4, shared[ofs+1]);
        acc += __mul24(STAGE2_COEFF5, shared[ofs+2]);
        acc += __mul24(STAGE2_COEFF6, shared[ofs+3]);
        acc += __mul24(STAGE2_COEFF7, shared[ofs+4]);

        shared[ofs + half] += acc >> STAGE2_SHIFT;
    }

    __syncthreads();


    /// Write line back to global memory, don't interleave again.
    /// Mind the gap between BLEFT+width/2 and half.
    if(width&3) // If width is not a multiple of 4, we need to use the slower method
    {
        /// Left part (even coefficients)
        for(ofs = tid; ofs < (width>>1); ofs += BSH)
            data[ofs] = shared[BLEFT+ofs];

        /// Right part (odd coefficients)
        for(ofs = tid; ofs < (width>>1); ofs += BSH)
            data[(width>>1)+ofs] = shared[half+BLEFT+ofs];
    }
    else
    {
        /// Left part (even coefficients) — paired 16-bit stores
        for(ofs = tid; ofs < (width>>2); ofs += BSH)
            row[ofs] = *((i16_2*)&shared[BLEFT+(ofs<<1)]);
        row += (width>>2);
        /// Right part (odd coefficients)
        for(ofs = tid; ofs < (width>>2); ofs += BSH)
            row[ofs] = *((i16_2*)&shared[half+BLEFT+(ofs<<1)]);
    }
}
168
#define BROWS (2*BSVY+COPYROWS) /* Rows to process at once */
#define SKIPTOP COPYROWS

/* Rows below which to use a_transform_v_pad.
 * Note: uses WRITEBACK, defined just below — legal because macro
 * expansion happens at the point of use, not here. */
#define PAD_ROWS (WRITEBACK-SKIPTOP+RRIGHT+COPYROWS)
/// tid is BCOLSxBROWS matrix
/// RLEFT+BROWS+RRIGHT rows
#define TOTALROWS (RLEFT+BROWS+RRIGHT)
#define OVERLAP (RLEFT+RRIGHT+COPYROWS)
#define OVERLAP_OFFSET (TOTALROWS-OVERLAP)
#define WRITEBACK (2*BSVY)
179
/// Vertical lifting for an interior window whose taps all fall inside the
/// rows currently staged in shared memory (no border clamping needed).
/// Called twice per iteration (xofs = 0 and 1) so each thread covers two
/// adjacent columns. Caller must __syncthreads() before and after.
__device__ void doTransform(int xofs)
{
    const int tidx = (threadIdx.x<<1)+xofs; // column
    const int tidy = threadIdx.y; // row

    extern __shared__ DATATYPE shared[];
    int ofs;

    ofs = ((RLEFT+(tidy<<1)+8)<<BCOLS_SHIFT) + tidx;

    /* Phase 1 (even) at +8*BCOLS: taps read odd rows at +/- odd offsets */
    {
        int acc = STAGE1_OFFSET;

        acc += __mul24(STAGE1_COEFF0, shared[ofs-7*BCOLS]);
        acc += __mul24(STAGE1_COEFF1, shared[ofs-5*BCOLS]);
        acc += __mul24(STAGE1_COEFF2, shared[ofs-3*BCOLS]);
        acc += __mul24(STAGE1_COEFF3, shared[ofs-1*BCOLS]);
        acc += __mul24(STAGE1_COEFF4, shared[ofs+1*BCOLS]);
        acc += __mul24(STAGE1_COEFF5, shared[ofs+3*BCOLS]);
        acc += __mul24(STAGE1_COEFF6, shared[ofs+5*BCOLS]);
        acc += __mul24(STAGE1_COEFF7, shared[ofs+7*BCOLS]);

        shared[ofs] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();

    /* Phase 2 (odd) at +1*BCOLS: taps read the even rows updated above */
    ofs -= 7*BCOLS;
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[ofs-7*BCOLS]);
        acc += __mul24(STAGE2_COEFF1, shared[ofs-5*BCOLS]);
        acc += __mul24(STAGE2_COEFF2, shared[ofs-3*BCOLS]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs-1*BCOLS]);
        acc += __mul24(STAGE2_COEFF4, shared[ofs+1*BCOLS]);
        acc += __mul24(STAGE2_COEFF5, shared[ofs+3*BCOLS]);
        acc += __mul24(STAGE2_COEFF6, shared[ofs+5*BCOLS]);
        acc += __mul24(STAGE2_COEFF7, shared[ofs+7*BCOLS]);

        shared[ofs] += acc >> STAGE2_SHIFT;
    }

}
226
// Process 16 lines
/// Vertical lifting when the whole (short) image fits in shared memory.
/// Handles BOTH borders by clamping tap indices into [minn, maxx]
/// (i.e. edge samples are effectively replicated).
/// leftover = one-past-last valid shared-memory offset to process.
__device__ void doTransformTB(int xofs, unsigned int leftover)
{
    const int tidx = (threadIdx.x<<1)+xofs; // column
    const int tidy = threadIdx.y; // row
    const int minn = (RLEFT<<BCOLS_SHIFT) + tidx; // first valid row offset
    const int maxx = leftover-(2<<BCOLS_SHIFT) + tidx; // last valid row pair offset

    extern __shared__ DATATYPE shared[];
    int ofs, ofs_t;

    /* Phase 1 (even) */
    ofs = ((RLEFT+(tidy<<1))<<BCOLS_SHIFT) + tidx;

    for(ofs_t=ofs+0*BCOLS; ofs_t<leftover; ofs_t += BSVY*2*BCOLS)
    {
        int acc = STAGE1_OFFSET;

        acc += __mul24(STAGE1_COEFF0, shared[max(ofs_t-7*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF1, shared[max(ofs_t-5*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF2, shared[max(ofs_t-3*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF3, shared[max(ofs_t-1*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF4, shared[ofs_t+1*BCOLS]);
        acc += __mul24(STAGE1_COEFF5, shared[min(ofs_t+3*BCOLS, maxx+BCOLS)]);
        acc += __mul24(STAGE1_COEFF6, shared[min(ofs_t+5*BCOLS, maxx+BCOLS)]);
        acc += __mul24(STAGE1_COEFF7, shared[min(ofs_t+7*BCOLS, maxx+BCOLS)]);

        shared[ofs_t] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();

    /* Phase 2 (odd) */
    for(ofs_t=ofs+1*BCOLS; ofs_t<leftover; ofs_t += BSVY*2*BCOLS)
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[max(ofs_t-7*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF1, shared[max(ofs_t-5*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF2, shared[max(ofs_t-3*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs_t-1*BCOLS]);
        acc += __mul24(STAGE2_COEFF4, shared[min(ofs_t+1*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF5, shared[min(ofs_t+3*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF6, shared[min(ofs_t+5*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF7, shared[min(ofs_t+7*BCOLS, maxx)]);

        shared[ofs_t] += acc >> STAGE2_SHIFT;
    }

}
277
// Process BROWS-2 lines
/// Vertical lifting for the topmost window of a tall image:
/// upward-reaching taps are clamped to minn (top-edge replication);
/// the bottom taps read normally since later windows supply those rows.
__device__ void doTransformT(int xofs)
{
    const int tidx = (threadIdx.x<<1)+xofs; // column
    const int tidy = threadIdx.y; // row
    const int minn = ((RLEFT+SKIPTOP)<<BCOLS_SHIFT) + tidx; // first valid row offset

    extern __shared__ DATATYPE shared[];
    int ofs;

    /* Phase 1 (even), offset +0 */
    ofs = ((SKIPTOP+RLEFT+(tidy<<1))<<BCOLS_SHIFT) + tidx;
    {
        int acc = STAGE1_OFFSET;

        acc += __mul24(STAGE1_COEFF0, shared[max(ofs-7*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF1, shared[max(ofs-5*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF2, shared[max(ofs-3*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF3, shared[max(ofs-1*BCOLS, minn+BCOLS)]);
        acc += __mul24(STAGE1_COEFF4, shared[ofs+1*BCOLS]);
        acc += __mul24(STAGE1_COEFF5, shared[ofs+3*BCOLS]);
        acc += __mul24(STAGE1_COEFF6, shared[ofs+5*BCOLS]);
        acc += __mul24(STAGE1_COEFF7, shared[ofs+7*BCOLS]);

        shared[ofs] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();

    /* Phase 2 (odd), offset +1, stop at -8: the bottom 4 row pairs are
       deferred because their downward taps need rows not yet updated */
    ofs += BCOLS;
    if(tidy<(BSVY-4))
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[max(ofs-7*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF1, shared[max(ofs-5*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF2, shared[max(ofs-3*BCOLS, minn)]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs-1*BCOLS]);
        acc += __mul24(STAGE2_COEFF4, shared[ofs+1*BCOLS]);
        acc += __mul24(STAGE2_COEFF5, shared[ofs+3*BCOLS]);
        acc += __mul24(STAGE2_COEFF6, shared[ofs+5*BCOLS]);
        acc += __mul24(STAGE2_COEFF7, shared[ofs+7*BCOLS]);

        shared[ofs] += acc >> STAGE2_SHIFT;
    }

}
326
// Finish off leftover
/// Vertical lifting for the final (partial) window of a tall image:
/// downward-reaching taps are clamped to maxx (bottom-edge replication);
/// the top taps read rows carried over from the previous window.
/// leftover = one-past-last valid shared-memory offset to process.
__device__ void doTransformB(int xofs, unsigned int leftover)
{
    const int tidx = (threadIdx.x<<1)+xofs; // column
    const int tidy = threadIdx.y; // row
    const int maxx = leftover-(2<<BCOLS_SHIFT) + tidx; // last valid row pair offset

    extern __shared__ DATATYPE shared[];
    int ofs, ofs_t;

    ofs = ((RLEFT+(tidy<<1))<<BCOLS_SHIFT) + tidx;

    /* Phase 1 (even), starting at +8*BCOLS (earlier rows were finished
       by the previous window) */
    for(ofs_t=ofs+8*BCOLS; ofs_t<leftover; ofs_t += BSVY*2*BCOLS)
    {
        int acc = STAGE1_OFFSET;

        acc += __mul24(STAGE1_COEFF0, shared[ofs_t-7*BCOLS]);
        acc += __mul24(STAGE1_COEFF1, shared[ofs_t-5*BCOLS]);
        acc += __mul24(STAGE1_COEFF2, shared[ofs_t-3*BCOLS]);
        acc += __mul24(STAGE1_COEFF3, shared[ofs_t-1*BCOLS]);
        acc += __mul24(STAGE1_COEFF4, shared[ofs_t+1*BCOLS]);
        acc += __mul24(STAGE1_COEFF5, shared[min(ofs_t+3*BCOLS, maxx+BCOLS)]);
        acc += __mul24(STAGE1_COEFF6, shared[min(ofs_t+5*BCOLS, maxx+BCOLS)]);
        acc += __mul24(STAGE1_COEFF7, shared[min(ofs_t+7*BCOLS, maxx+BCOLS)]);

        shared[ofs_t] += acc >> STAGE1_SHIFT;
    }

    __syncthreads();

    /* Phase 2 (odd) */
    for(ofs_t=ofs+1*BCOLS; ofs_t<leftover; ofs_t += BSVY*2*BCOLS)
    {
        int acc = STAGE2_OFFSET;

        acc += __mul24(STAGE2_COEFF0, shared[ofs_t-7*BCOLS]);
        acc += __mul24(STAGE2_COEFF1, shared[ofs_t-5*BCOLS]);
        acc += __mul24(STAGE2_COEFF2, shared[ofs_t-3*BCOLS]);
        acc += __mul24(STAGE2_COEFF3, shared[ofs_t-1*BCOLS]);
        acc += __mul24(STAGE2_COEFF4, shared[min(ofs_t+1*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF5, shared[min(ofs_t+3*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF6, shared[min(ofs_t+5*BCOLS, maxx)]);
        acc += __mul24(STAGE2_COEFF7, shared[min(ofs_t+7*BCOLS, maxx)]);

        shared[ofs_t] += acc >> STAGE2_SHIFT;
    }

}
374
375
376
/// Vertical inverse-wavelet lifting pass, streaming variant for tall images
/// (height >= PAD_ROWS; shorter images use a_transform_v_pad).
/// Launch: one thread block per slab of BCOLS columns (blockIdx.x),
/// block dim BSVX x BSVY; each thread handles two adjacent columns so all
/// global/shared accesses are 32-bit. Rows are streamed through shared
/// memory in windows of WRITEBACK rows with OVERLAP rows of carried history.
static __global__ void a_transform_v( DATATYPE* data, int width, int height, int stride )
{
    extern __shared__ DATATYPE shared[];

    const unsigned int bid = blockIdx.x; // slab (BCOLS columns)
    const unsigned int tidx = threadIdx.x<<1; // column
    const unsigned int tidy = threadIdx.y; // row
    const unsigned int swidth = min(width-(bid<<BCOLS_SHIFT), BCOLS); // Width of this slab, usually BCOLS but can be less

    // Element offset in global memory
    data += tidx + (bid<<BCOLS_SHIFT) + __mul24(tidy, stride);

    const unsigned int istride = __mul24(BSVY, stride); // rows advanced per loop step
    const unsigned int sdata = tidx + (tidy<<BCOLS_SHIFT); // this thread's base shared offset
    // First read BROWS+RRIGHT rows, after that BROWS per iteration
    unsigned int ref = height-(WRITEBACK-SKIPTOP+RRIGHT+COPYROWS);
    unsigned int blocks = ref/WRITEBACK; // full interior windows
    unsigned int leftover = (RLEFT+COPYROWS+RRIGHT+(ref%WRITEBACK))<<BCOLS_SHIFT; // final partial window extent

    unsigned int gofs,sofs;

    /// Read first block of BROWS+RRIGHT rows.
    /// Upper RLEFT rows are left uninitialized for now; doTransformT clamps
    /// its upward taps so it never reads them.
    if(tidx < swidth)
    {
        gofs = 0;
        sofs = sdata + ((RLEFT+SKIPTOP)<<BCOLS_SHIFT);
        for(; sofs < (TOTALROWS<<BCOLS_SHIFT); sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&shared[sofs]) = *((uint32_t*)&data[gofs]);
    }
    // idata_read = idata_write + __mul24(TOTALROWS-RLEFT, stride)

    __syncthreads();

    doTransformT(0);
    doTransformT(1);

    __syncthreads();

    /// Write back WRITEBACK rows
    if(tidx < swidth)
    {
        gofs = 0;
        sofs = sdata + ((RLEFT+SKIPTOP)<<BCOLS_SHIFT);
        for(; sofs < ((WRITEBACK+RLEFT)<<BCOLS_SHIFT); sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&data[gofs]) = *((uint32_t*)&shared[sofs]);
    }
    // idata_read = idata_write + __mul24((BROWS+RRIGHT)-WRITEBACK, stride)

    // Difference between global mem read and write pointer
#define DATA_READ_DIFF __mul24((BROWS+RRIGHT)-WRITEBACK, stride)
    // Advance pointer with this amount after each block
#define DATA_INC __mul24(WRITEBACK, stride)

    data += __mul24(WRITEBACK-SKIPTOP, stride);
    for(unsigned int block=0; block<blocks; ++block)
    {
        __syncthreads();
        /// Move lower rows to top rows (carry OVERLAP rows of history)
#if OVERLAP <= BSVY
        if(tidy < OVERLAP)
        {
            unsigned int l = (tidy<<BCOLS_SHIFT)+tidx;
            *((uint32_t*)&shared[l]) = *((uint32_t*)&shared[(WRITEBACK<<BCOLS_SHIFT)+l]);
        }
#else
        for(sofs = (tidy<<BCOLS_SHIFT)+tidx; sofs < (OVERLAP<<BCOLS_SHIFT); sofs += (BSVY<<BCOLS_SHIFT))
            *((uint32_t*)&shared[sofs]) = *((uint32_t*)&shared[(WRITEBACK<<BCOLS_SHIFT)+sofs]);
#endif

        /// Fill shared memory -- read next block of BROWS rows.
        /// We can skip RRIGHT rows as we've already copied them for the
        /// previous block and moved them to the top.
        if(tidx < swidth)
        {
            gofs = DATA_READ_DIFF;
            sofs = sdata + (OVERLAP<<BCOLS_SHIFT);
            for(; sofs < (TOTALROWS<<BCOLS_SHIFT); sofs += (BCOLS*BSVY), gofs += istride)
                *((uint32_t*)&shared[sofs]) = *((uint32_t*)&data[gofs]);
        }

        __syncthreads();

        doTransform(0);
        doTransform(1);

        __syncthreads();

        /// Write back BROWS rows
        if(tidx < swidth)
        {
            gofs = 0;
            sofs = sdata + (RLEFT<<BCOLS_SHIFT);
            for(; sofs < ((WRITEBACK+RLEFT)<<BCOLS_SHIFT); sofs += (BCOLS*BSVY), gofs += istride)
                *((uint32_t*)&data[gofs]) = *((uint32_t*)&shared[sofs]);
        }

        data += DATA_INC;
    }
    __syncthreads();

    ///
    /// Handle partial last block
    /// Move lower rows to top rows
#if OVERLAP <= BSVY
    if(tidy < OVERLAP)
    {
        unsigned int l = (tidy<<BCOLS_SHIFT)+tidx;
        *((uint32_t*)&shared[l]) = *((uint32_t*)&shared[(WRITEBACK<<BCOLS_SHIFT)+l]);
    }
#else
    for(sofs = (tidy<<BCOLS_SHIFT)+tidx; sofs < (OVERLAP<<BCOLS_SHIFT); sofs += (BSVY<<BCOLS_SHIFT))
        *((uint32_t*)&shared[sofs]) = *((uint32_t*)&shared[(WRITEBACK<<BCOLS_SHIFT)+sofs]);
#endif

    /// Fill shared memory -- read the remaining rows.
    /// We can skip RRIGHT rows as we've already copied them for the
    /// previous block and moved them to the top.
    if(tidx < swidth)
    {
        gofs = DATA_READ_DIFF;
        sofs = sdata + (OVERLAP<<BCOLS_SHIFT);
        for(; sofs < leftover; sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&shared[sofs]) = *((uint32_t*)&data[gofs]);
    }

    __syncthreads();

    doTransformB(0, leftover);
    doTransformB(1, leftover);

    __syncthreads();

    /// Write back leftover
    if(tidx < swidth)
    {
        gofs = 0;
        sofs = sdata + (RLEFT<<BCOLS_SHIFT);
        for(; sofs < leftover; sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&data[gofs]) = *((uint32_t*)&shared[sofs]);
    }

}
523
/// Use this if the image is lower than PAD_ROWS.
/// Vertical inverse-wavelet lifting pass for short images: the entire slab
/// fits in shared memory at once, so a single doTransformTB call handles
/// both borders by clamping. Same launch configuration as a_transform_v.
static __global__ void a_transform_v_pad( DATATYPE* data, int width, int height, int stride )
{
    extern __shared__ DATATYPE shared[];

    const unsigned int bid = blockIdx.x; // slab (BCOLS columns)
    const unsigned int tidx = threadIdx.x<<1; // column
    const unsigned int tidy = threadIdx.y; // row
    const unsigned int swidth = min(width-(bid<<BCOLS_SHIFT), BCOLS); // Width of this slab, usually BCOLS but can be less

    // Element offset in global memory
    data += tidx + (bid<<BCOLS_SHIFT) + __mul24(tidy, stride);
    const unsigned int istride = __mul24(BSVY, stride);
    const unsigned int sdata = tidx + ((tidy+RLEFT)<<BCOLS_SHIFT); // skip RLEFT pad rows
    unsigned int leftover = (RLEFT+height) << BCOLS_SHIFT; /// How far to fill buffer on last read

    unsigned int gofs, sofs;

    /// Fill shared memory -- read all height rows in one pass
    if(tidx < swidth)
    {
        gofs = 0;
        sofs = sdata;
        for(; sofs < leftover; sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&shared[sofs]) = *((uint32_t*)&data[gofs]);
    }

    __syncthreads();

    doTransformTB(0, leftover);
    doTransformTB(1, leftover);

    __syncthreads();

    /// Write back leftover
    if(tidx < swidth)
    {
        gofs = 0;
        sofs = sdata;
        for(; sofs < leftover; sofs += (BCOLS*BSVY), gofs += istride)
            *((uint32_t*)&data[gofs]) = *((uint32_t*)&shared[sofs]);
    }
}
573
/// Host-side launcher for one level of the fidelity inverse wavelet transform.
/// d_data: device pointer to the level's coefficients (int16_t, row stride
/// `stride` elements); runs asynchronously on `stream`.
/// NOTE(review): launch errors are not checked here (no cudaGetLastError());
/// they surface at the caller's next synchronizing CUDA call.
void cuda_iwt_fidelity(int16_t *d_data, int lwidth, int lheight, int stride, cudaStream_t stream)
{
    /** Invoke kernel */
    dim3 block_size;
    dim3 grid_size;
    int shared_size;

#ifdef HORIZONTAL
    block_size.x = BSH;
    block_size.y = 1;
    block_size.z = 1;
    grid_size.x = lheight; // one block per row
    grid_size.y = 1;
    grid_size.z = 1;
    // Deinterleaved line plus BLEFT/BRIGHT borders for both halves
    shared_size = (lwidth+BLEFT*2+BRIGHT*2) * sizeof(DATATYPE);
    a_transform_h<<<grid_size, block_size, shared_size, stream>>>(d_data, lwidth, stride);
#endif
#ifdef VERTICAL
    block_size.x = BSVX;
    block_size.y = BSVY;
    block_size.z = 1;
    grid_size.x = (lwidth+BCOLS-1)/BCOLS; // one block per slab of BCOLS columns
    grid_size.y = 1;
    grid_size.z = 1;
    // Was hard-coded "* 2"; use sizeof(DATATYPE) for consistency with the
    // horizontal branch (identical value for the int16_t data used here).
    shared_size = BCOLS*(BROWS+RLEFT+RRIGHT)*sizeof(DATATYPE);

    if(lheight < PAD_ROWS)
        a_transform_v_pad<<<grid_size, block_size, shared_size, stream>>>(d_data, lwidth, lheight, stride);
    else
        a_transform_v<<<grid_size, block_size, shared_size, stream>>>(d_data, lwidth, lheight, stride);
#endif
}
606