/* -*- mode: c++ -*- */
/* ------------------------------ */
/* driver for lmder example.      */
/* ------------------------------ */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include <cminpack.h>

#include <lmder.cu>
#include <covar1.cu>

#define real __cminpack_real__
14
// Wrap a CUDA runtime call; abort with a file/line diagnostic on failure.
#define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)

// Helper behind cutilSafeCall(): checks the cudaError_t returned by a
// runtime API call and, on error, prints the failing location together
// with the human-readable error string, then terminates the process.
// Fix: `exit` is declared in <stdlib.h>, which the file never included —
// the call was an implicit declaration (ill-formed in C++).
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
    if( cudaSuccess != err) {
        fprintf(stderr, "cudaSafeCall() Runtime API error in file <%s>, line %i : %s.\n",
                file, line, cudaGetErrorString( err) );
        exit(-1);
    }
}
24
//--------------------------------------------------------------------------
// problem dimensions for the lmder test case
//--------------------------------------------------------------------------
const unsigned int NUM_OBSERVATIONS = 15; // m: number of residuals
const unsigned int NUM_PARAMS       = 3;  // n: number of fitted parameters

//--------------------------------------------------------------------------
//
// fixed arrangement of threads to be run
//
//--------------------------------------------------------------------------
const unsigned int NUM_THREADS           = 2048;
const unsigned int NUM_THREADS_PER_BLOCK = 128;
const unsigned int NUM_BLOCKS            = NUM_THREADS / NUM_THREADS_PER_BLOCK;
38
//--------------------------------------------------------------------------
//
// the struct for returning results from the GPU
//
//--------------------------------------------------------------------------

// Per-thread output record: every GPU thread solves the same small
// least-squares problem and writes its complete result into one slot.
typedef struct
{
    real fnorm;                          // final L2 norm of the residuals
    int  nfev;                           // number of function evaluations
    int  njev;                           // number of Jacobian evaluations
    int  info;                           // lmder exit status
    int  rankJ;                          // rank estimate derived from covar1
    real solution[NUM_PARAMS];           // fitted parameter vector
    real covar[NUM_PARAMS][NUM_PARAMS];  // covariance estimate
} ResultType;
54
//--------------------------------------------------------------------------
// the cost function
//--------------------------------------------------------------------------

// Residuals and analytic Jacobian for the classic 15-point lmder
// curve-fitting example.
//   iflag == 0 : hook for optional printing when nprint > 0 — no-op here
//   iflag == 2 : store the Jacobian in fjac (column-major, leading dim ldfjac)
//   otherwise  : store the residuals y[i] - model(x, i) in fvec
// p, m and n belong to the cminpack callback signature; this instance is
// hard-wired to NUM_OBSERVATIONS x NUM_PARAMS.
__cminpack_attr__ /* __device__ */
int fcnder_mn(
        void *p, int m, int n, const real *x,
        real *fvec, real *fjac,
        int ldfjac, int iflag)
{
    /* subroutine fcn for lmder example. */

    const real y[NUM_OBSERVATIONS] = {
        1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1,
        3.2e-1, 3.5e-1, 3.9e-1, 3.7e-1, 5.8e-1,
        7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};

    if (iflag == 0) {
        /* insert print statements here when nprint is positive. */
        return 0;
    }

    if (iflag != 2) {
        /* residuals */
        for (int j = 0; j < NUM_OBSERVATIONS; j++) {
            real t1 = j + 1;                       /* observation index 1..15 */
            real t2 = NUM_OBSERVATIONS - j;        /* == 16 - (j+1)           */
            real t3 = (j + 1 > 8) ? t2 : t1;
            fvec[j] = y[j] - (x[0] + t1 / (x[1] * t2 + x[2] * t3));
        }
    } else {
        /* Jacobian, column-major with leading dimension ldfjac */
        for (int j = 0; j < NUM_OBSERVATIONS; j++) {
            real t1 = j + 1;
            real t2 = NUM_OBSERVATIONS - j;
            real t3 = (j + 1 > 8) ? t2 : t1;
            real d  = x[1] * t2 + x[2] * t3;
            d = d * d;
            fjac[j]              = -1.;
            fjac[j + ldfjac]     = t1 * t2 / d;
            fjac[j + 2 * ldfjac] = t1 * t3 / d;
        }
    }

    return 0;
}
104
//--------------------------------------------------------------------------
// the kernel in the GPU
//--------------------------------------------------------------------------

// Every thread solves the identical lmder test problem from the same
// starting point and writes its full result set into pResults[threadId];
// the host later verifies that all slots agree.
// Expects a 1-D grid of 1-D blocks covering pResults[].
__global__ void mainKernel(ResultType pResults[])
{
    int  ipvt[NUM_PARAMS];
    real x[NUM_PARAMS];
    real fvec[NUM_OBSERVATIONS];
    real fjac[NUM_OBSERVATIONS * NUM_PARAMS];
    real diag[NUM_PARAMS], qtf[NUM_PARAMS];
    real wa1[NUM_PARAMS], wa2[NUM_PARAMS], wa3[NUM_PARAMS];
    real wa4[NUM_OBSERVATIONS];

    // m = NUM_OBSERVATIONS, n = NUM_PARAMS throughout.

    /* the following starting values provide a rough fit. */
    x[0] = 1.;
    x[1] = 1.;
    x[2] = 1.;

    const int ldfjac = NUM_OBSERVATIONS;

    /* set ftol and xtol to the square root of the machine precision */
    /* and gtol to zero. unless high precision solutions are         */
    /* required, these are the recommended settings.                 */
    real ftol = sqrt(__cminpack_func__(dpmpar)(1));
    real xtol = sqrt(__cminpack_func__(dpmpar)(1));
    real gtol = 0.;

    int  maxfev = 400;
    int  mode   = 1;
    real factor = 1.e2;
    int  nprint = 0;

    // -------------------------------
    // call lmder, enorm, and covar1
    // -------------------------------
    int nfev, njev;
    int info = __cminpack_func__(lmder)(__cminpack_param_fcnder_mn__ 0,
                                        NUM_OBSERVATIONS, NUM_PARAMS,
                                        x, fvec, fjac, ldfjac, ftol, xtol, gtol,
                                        maxfev, diag, mode, factor, nprint,
                                        &nfev, &njev,
                                        ipvt, qtf, wa1, wa2, wa3, wa4);

    real fnorm = __cminpack_func__(enorm)(NUM_OBSERVATIONS, fvec);

    // NOTE: the test of the original MINPACK covar routine was removed.

    /* test covar1, which also estimates the rank of the Jacobian */
    ftol = __cminpack_func__(dpmpar)(1);
    int k = __cminpack_func__(covar1)(NUM_OBSERVATIONS, NUM_PARAMS,
                                      fnorm * fnorm, fjac, ldfjac,
                                      ipvt, ftol, wa1);

    // ----------------------------------
    // save the results in global memory
    // ----------------------------------
    const int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;
    ResultType *out = &pResults[threadId];

    out->fnorm = fnorm;
    out->nfev  = nfev;
    out->njev  = njev;
    out->info  = info;

    for (int j = 0; j < NUM_PARAMS; j++) {
        out->solution[j] = x[j];
    }

    // fjac now holds the covariance estimate computed by covar1.
    for (int i = 0; i < NUM_PARAMS; i++) {
        for (int j = 0; j < NUM_PARAMS; j++) {
            out->covar[i][j] = fjac[i * ldfjac + j];
        }
    }

    // k == 0 is taken to mean full rank (NUM_PARAMS); otherwise report k.
    out->rankJ = (k != 0) ? k : NUM_PARAMS;

} // ()
182
//--------------------------------------------------------------------------
//--------------------------------------------------------------------------

// Field-wise equality of two result records. Replaces the original
// memcmp(), which the author already flagged as unsafe: struct padding
// bytes are not initialized by the kernel and may legitimately differ.
static bool sameResult(const ResultType &a, const ResultType &b)
{
    if (a.fnorm != b.fnorm || a.nfev != b.nfev || a.njev != b.njev ||
        a.info != b.info || a.rankJ != b.rankJ)
        return false;
    for (unsigned int j = 0; j < NUM_PARAMS; j++)
        if (a.solution[j] != b.solution[j])
            return false;
    for (unsigned int i = 0; i < NUM_PARAMS; i++)
        for (unsigned int j = 0; j < NUM_PARAMS; j++)
            if (a.covar[i][j] != b.covar[i][j])
                return false;
    return true;
}

// Host driver: launches NUM_THREADS copies of the lmder solve on the GPU,
// copies the per-thread results back, checks that all threads produced
// identical results, and prints the first thread's solution.
// Fixes vs. original: device selection and kernel launch are now error
// checked (launch failures were silently ignored), and the deprecated
// cudaThreadSynchronize()/cudaThreadExit() calls are replaced by their
// cudaDevice* successors.
int main (int argc, char** argv)
{
    fprintf (stderr, "\ntlmder starts ! \n");

    // ...............................................................
    // choose the GPU device
    // ...............................................................
    unsigned int GPU_ID = 1;  // NOTE(review): hard-coded; fails on single-GPU hosts
    // unsigned int GPU_ID = cutGetMaxGflopsDeviceId() ;
    cutilSafeCall(cudaSetDevice(GPU_ID));  // was unchecked
    fprintf (stderr, " CUDA device chosen = %d \n", GPU_ID);

    // .......................................................
    // get memory in the GPU to store the results
    // .......................................................
    ResultType * results_GPU = 0;
    cutilSafeCall(cudaMalloc( &results_GPU, NUM_THREADS * sizeof(ResultType) ));

    // .......................................................
    // get pinned memory in the CPU to store the results
    // .......................................................
    ResultType * results_CPU = 0;
    cutilSafeCall(cudaMallocHost( &results_CPU, NUM_THREADS * sizeof(ResultType) ));

    // .......................................................
    // launch the kernel
    // .......................................................
    fprintf (stderr, " \nlaunching the kernel num. blocks = %d, threads per block = %d\n total threads = %d\n\n",
             NUM_BLOCKS, NUM_THREADS_PER_BLOCK, NUM_THREADS);

    mainKernel<<<NUM_BLOCKS,NUM_THREADS_PER_BLOCK>>> ( results_GPU );
    cutilSafeCall(cudaGetLastError());      // catch launch-configuration errors

    // .......................................................
    // wait for termination (also surfaces in-kernel faults)
    // .......................................................
    cutilSafeCall(cudaDeviceSynchronize()); // cudaThreadSynchronize() is deprecated
    fprintf (stderr, " GPU processing done \n\n");

    // .......................................................
    // copy back to CPU the results
    // .......................................................
    cutilSafeCall(cudaMemcpy( results_CPU, results_GPU,
                              NUM_THREADS * sizeof(ResultType),
                              cudaMemcpyDeviceToHost
                              ));

    // .......................................................
    // check all the threads computed the same results
    // .......................................................
    bool ok = true;
    for (unsigned int i = 1; i < NUM_THREADS; i++) {
        if (!sameResult(results_CPU[0], results_CPU[i])) {
            ok = false;
            break;
        }
    } // for

    if (ok) {
        fprintf (stderr, " !!! all threads computed the same results !!! \n\n");
    } else {
        fprintf (stderr, "ERROR in results of threads \n");
    }

    // .......................................................
    // show the results !
    // .......................................................

    printf("      final l2 norm of the residuals%15.7g\n\n", results_CPU[0].fnorm);
    printf("      number of function evaluations%10i\n\n", results_CPU[0].nfev);
    printf("      number of Jacobian evaluations%10i\n\n", results_CPU[0].njev);
    printf("      exit parameter %10i\n\n", results_CPU[0].info);
    printf("      final approximate solution\n");

    for (int j=0; j<NUM_PARAMS; j++) {
        printf("%15.7g", results_CPU[0].solution[j]);
    }
    printf("\n");

    printf("      covariance\n");

    for (unsigned int i=0; i<NUM_PARAMS; i++) {
        for (unsigned int j=0; j<NUM_PARAMS; j++) {
            printf("%15.7g", results_CPU[0].covar[i][j]);
        } // for
        printf ("\n");
    } // for

    printf("\n");
    printf(" rank(J) = %d\n", results_CPU[0].rankJ );

    // .......................................................
    // release resources and tear down the CUDA context
    // .......................................................
    cutilSafeCall(cudaFree(results_GPU));
    cutilSafeCall(cudaFreeHost(results_CPU));
    cudaDeviceReset();                      // cudaThreadExit() is deprecated
    //cutilExit(argc, argv);

    return 0;
} // ()
280