1Index: dcraw/dcraw_lz.c
2===================================================================
3--- dcraw.orig/dcraw_lz.c
4+++ dcraw/dcraw_lz.c
5@@ -1,4 +1,8 @@
6 /*
7+ * OpenMP multithread version of dcraw
8+ * based on UFRaw dcraw_indi.c by Udi Fuchs
9+ */
10+/*
11    dcraw.c -- Dave Coffin's raw photo decoder
12    Copyright 1997-2018 by Dave Coffin, dcoffin a cybercom o net
13
14@@ -96,6 +100,15 @@ typedef unsigned long long UINT64;
15 #define ushort unsigned short
16 #endif
17
18+#ifdef _OPENMP
19+#include <omp.h>
20+#define uf_omp_get_thread_num() omp_get_thread_num()
21+#define uf_omp_get_num_threads() omp_get_num_threads()
22+#else
23+#define uf_omp_get_thread_num() 0
24+#define uf_omp_get_num_threads() 1
25+#endif
26+
27 /*
28    All global variables are defined here, and all functions that
29    access them are prefixed with "CLASS".  For thread-safety, all
30@@ -4132,6 +4145,13 @@ void CLASS wavelet_denoise()
31   merror (fimg, "wavelet_denoise()");
32   temp = fimg + size*3;
33   if ((nc = colors) == 3 && filters) nc++;
34+#if defined(__sun) && !defined(__GNUC__)	/* Fix UFRaw bug #3205673 - NKBJ */
35+    #pragma omp parallel for default(shared) \
36+    private(c,i,hpass,lev,lpass,row,col,thold,fimg,temp)
37+#else
38+    #pragma omp parallel for default(shared) \
39+    private(c,i,hpass,lev,lpass,row,col,thold,fimg,temp)
40+#endif
41   FORC(nc) {			/* denoise R,G1,B,G3 individually */
42     for (i=0; i < size; i++)
43       fimg[i] = 256 * sqrt(image[i][c] << scale);
44@@ -4355,8 +4375,10 @@ void CLASS pre_interpolate()
45
46 void CLASS border_interpolate (int border)
47 {
48-  unsigned row, col, y, x, f, c, sum[8];
49+  int row;
50+  unsigned col, y, x, f, c, sum[8];
51
52+#pragma omp for
53   for (row=0; row < height; row++)
54     for (col=0; col < width; col++) {
55       if (col==border && row >= border && row < height-border)
56@@ -4383,7 +4405,11 @@ void CLASS lin_interpolate()
57
58   if (verbose) fprintf (stderr,_("Bilinear interpolation...\n"));
59   if (filters == 9) size = 6;
60+#pragma omp parallel default(shared) \
61+  private(ip, sum, f, c, i, x, y, row, col, shift, color, pix)
62+{
63   border_interpolate(1);
64+#pragma omp for
65   for (row=0; row < size; row++)
66     for (col=0; col < size; col++) {
67       ip = code[row][col]+1;
68@@ -4406,6 +4432,7 @@ void CLASS lin_interpolate()
69 	  *ip++ = 256 / sum[c];
70 	}
71     }
72+#pragma omp for
73   for (row=1; row < height-1; row++)
74     for (col=1; col < width-1; col++) {
75       pix = image[row*width+col];
76@@ -4416,6 +4443,7 @@ void CLASS lin_interpolate()
77       for (i=colors; --i; ip+=2)
78 	pix[ip[0]] = sum[ip[0]] * ip[1] >> 8;
79     }
80+} /* pragma omp parallel */
81 }
82
83 /*
84@@ -4454,7 +4482,7 @@ void CLASS vng_interpolate()
85     +1,-1,+1,+1,0,0x88, +1,+0,+1,+2,0,0x08, +1,+0,+2,-1,0,0x40,
86     +1,+0,+2,+1,0,0x10
87   }, chood[] = { -1,-1, -1,0, -1,+1, 0,+1, +1,+1, +1,0, +1,-1, 0,-1 };
88-  ushort (*brow[5])[4], *pix;
89+  ushort (*brow[4])[4], *pix;
90   int prow=8, pcol=2, *ip, *code[16][16], gval[8], gmin, gmax, sum[4];
91   int row, col, x, y, x1, x2, y1, y2, t, weight, grads, color, diag;
92   int g, diff, thold, num, c;
93@@ -4464,7 +4492,7 @@ void CLASS vng_interpolate()
94
95   if (filters == 1) prow = pcol = 16;
96   if (filters == 9) prow = pcol =  6;
97-  ip = (int *) calloc (prow*pcol, 1280);
98+  int *ipalloc = ip = (int *) calloc (prow*pcol, 1280);
99   merror (ip, "vng_interpolate()");
100   for (row=0; row < prow; row++)		/* Precalculate for VNG */
101     for (col=0; col < pcol; col++) {
102@@ -4496,11 +4524,18 @@ void CLASS vng_interpolate()
103 	  *ip++ = 0;
104       }
105     }
106-  brow[4] = (ushort (*)[4]) calloc (width*3, sizeof **brow);
107-  merror (brow[4], "vng_interpolate()");
108-  for (row=0; row < 3; row++)
109-    brow[row] = brow[4] + row*width;
110-  for (row=2; row < height-2; row++) {		/* Do VNG interpolation */
111+#pragma omp parallel default(shared) \
112+  private(row,col,g,brow,pix,ip,gval,diff,gmin,gmax,thold,sum,color,num,c,t)
113+{
114+  ushort (*rowtmp)[4];
115+  rowtmp = (ushort(*)[4])malloc(4 * width * 4 * sizeof(ushort));
116+  int slice = (height - 4) / uf_omp_get_num_threads();
117+  int start_row = 2 + slice * uf_omp_get_thread_num();
118+  int end_row = MIN(start_row + slice, height - 2);
119+  for (row = start_row; row < end_row; row++) {	/* Do VNG interpolation */
120+
121+    for (g = 0; g < 4; g++)
122+      brow[g] = &rowtmp[(row + g - 2) % 4];
123     for (col=2; col < width-2; col++) {
124       pix = image[row*width+col];
125       ip = code[row % prow][col % pcol];
126@@ -4544,15 +4579,15 @@ void CLASS vng_interpolate()
127 	brow[2][col][c] = CLIP(t);
128       }
129     }
130-    if (row > 3)				/* Write buffer to image */
131+    if (row > start_row + 1)				/* Write buffer to image */
132       memcpy (image[(row-2)*width+2], brow[0]+2, (width-4)*sizeof *image);
133-    for (g=0; g < 4; g++)
134-      brow[(g-1) & 3] = brow[g];
135   }
136-  memcpy (image[(row-2)*width+2], brow[0]+2, (width-4)*sizeof *image);
137-  memcpy (image[(row-1)*width+2], brow[1]+2, (width-4)*sizeof *image);
138-  free (brow[4]);
139-  free (code[0][0]);
140+  if (row == height - 2) {
141+    memcpy (image[(row-2)*width+2], brow[0]+2, (width-4)*sizeof *image);
142+    memcpy (image[(row-1)*width+2], brow[1]+2, (width-4)*sizeof *image);
143+  }
144+} /* pragma omp parallel */
145+  free(ipalloc);
146 }
147
148 /*
149@@ -4564,10 +4599,13 @@ void CLASS ppg_interpolate()
150   int row, col, diff[2], guess[2], c, d, i;
151   ushort (*pix)[4];
152
153-  border_interpolate(3);
154   if (verbose) fprintf (stderr,_("PPG interpolation...\n"));
155
156+#pragma omp parallel default(shared) private(row,col,i,d,c,pix,guess)
157+{
158+  border_interpolate(3);
159 /*  Fill in the green layer with gradients and pattern recognition: */
160+#pragma omp for
161   for (row=3; row < height-3; row++)
162     for (col=3+(FC(row,3) & 1), c=FC(row,col); col < width-3; col+=2) {
163       pix = image + row*width+col;
164@@ -4584,6 +4622,7 @@ void CLASS ppg_interpolate()
165       pix[0][1] = ULIM(guess[i] >> 2, pix[d][1], pix[-d][1]);
166     }
167 /*  Calculate red and blue for each green pixel:		*/
168+#pragma omp for
169   for (row=1; row < height-1; row++)
170     for (col=1+(FC(row,2) & 1), c=FC(row,col+1); col < width-1; col+=2) {
171       pix = image + row*width+col;
172@@ -4592,6 +4631,7 @@ void CLASS ppg_interpolate()
173 			- pix[-d][1] - pix[d][1]) >> 1);
174     }
175 /*  Calculate blue for red pixels and vice versa:		*/
176+#pragma omp for
177   for (row=1; row < height-1; row++)
178     for (col=1+(FC(row,1) & 1), c=2-FC(row,col); col < width-1; col+=2) {
179       pix = image + row*width+col;
180@@ -4607,6 +4647,7 @@ void CLASS ppg_interpolate()
181       else
182 	pix[0][c] = CLIP((guess[0]+guess[1]) >> 2);
183     }
184+} /* pragma omp parallel */
185 }
186
187 void CLASS cielab (ushort rgb[3], short lab[3])
188@@ -4616,13 +4657,16 @@ void CLASS cielab (ushort rgb[3], short
189   static float cbrt[0x10000], xyz_cam[3][4];
190
191   if (!rgb) {
192+#pragma omp for
193     for (i=0; i < 0x10000; i++) {
194       r = i / 65535.0;
195       cbrt[i] = r > 0.008856 ? pow(r,1/3.0) : 7.787*r + 16/116.0;
196     }
197+#pragma omp for
198     for (i=0; i < 3; i++)
199       for (j=0; j < colors; j++)
200 	for (xyz_cam[i][j] = k=0; k < 3; k++)
201+#pragma omp atomic
202 	  xyz_cam[i][j] += xyz_rgb[i][k] * rgb_cam[k][j] / d65_white[i];
203     return;
204   }
205@@ -4888,6 +4932,10 @@ void CLASS ahd_interpolate()
206
207   if (verbose) fprintf (stderr,_("AHD interpolation...\n"));
208
209+#pragma omp parallel default(shared) \
210+  private(top,left,row,col,pix,rix,lix,c,val,d,tc,tr,i,j,ldiff,abdiff,leps, \
211+          abeps,hm,buffer,rgb,lab,homo)
212+{
213   cielab (0,0);
214   border_interpolate(5);
215   buffer = (char *) malloc (26*TS*TS);
216@@ -4896,6 +4944,7 @@ void CLASS ahd_interpolate()
217   lab  = (short (*)[TS][TS][3])(buffer + 12*TS*TS);
218   homo = (char  (*)[TS][TS])   (buffer + 24*TS*TS);
219
220+#pragma omp for
221   for (top=2; top < height-5; top += TS-6)
222     for (left=2; left < width-5; left += TS-6) {
223
224@@ -4978,6 +5027,7 @@ void CLASS ahd_interpolate()
225       }
226     }
227   free (buffer);
228+} /* pragma omp parallel */
229 }
230 #undef TS
231
232@@ -9825,6 +9875,7 @@ void CLASS fuji_rotate()
233   img = (ushort (*)[4]) calloc (high, wide*sizeof *img);
234   merror (img, "fuji_rotate()");
235
236+#pragma omp parallel for default(shared) private(row,col,ur,uc,r,c,fr,fc,pix,i)
237   for (row=0; row < high; row++)
238     for (col=0; col < wide; col++) {
239       ur = r = fuji_width + (row-col)*step;
240